diff options
-rw-r--r-- | pkg/sentry/hostmm/BUILD | 18 | ||||
-rw-r--r-- | pkg/sentry/hostmm/cgroup.go | 111 | ||||
-rw-r--r-- | pkg/sentry/hostmm/hostmm.go | 130 | ||||
-rw-r--r-- | pkg/sentry/pgalloc/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/pgalloc/pgalloc.go | 63 | ||||
-rw-r--r-- | runsc/boot/loader.go | 3 |
6 files changed, 317 insertions, 9 deletions
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD new file mode 100644 index 000000000..1a4632a54 --- /dev/null +++ b/pkg/sentry/hostmm/BUILD @@ -0,0 +1,18 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "hostmm", + srcs = [ + "cgroup.go", + "hostmm.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/fd", + "//pkg/log", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/hostmm/cgroup.go b/pkg/sentry/hostmm/cgroup.go new file mode 100644 index 000000000..e5cc26ab2 --- /dev/null +++ b/pkg/sentry/hostmm/cgroup.go @@ -0,0 +1,111 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostmm + +import ( + "bufio" + "fmt" + "os" + "path" + "strings" +) + +// currentCgroupDirectory returns the directory for the cgroup for the given +// controller in which the calling process resides. +func currentCgroupDirectory(ctrl string) (string, error) { + root, err := cgroupRootDirectory(ctrl) + if err != nil { + return "", err + } + cg, err := currentCgroup(ctrl) + if err != nil { + return "", err + } + return path.Join(root, cg), nil +} + +// cgroupRootDirectory returns the root directory for the cgroup hierarchy in +// which the given cgroup controller is mounted in the calling process' mount +// namespace. +func cgroupRootDirectory(ctrl string) (string, error) { + const path = "/proc/self/mounts" + file, err := os.Open(path) + if err != nil { + return "", err + } + defer file.Close() + + // Per proc(5) -> fstab(5): + // Each line of /proc/self/mounts describes a mount. + scanner := bufio.NewScanner(file) + for scanner.Scan() { + // Each line consists of 6 space-separated fields. Find the line for + // which the third field (fs_vfstype) is cgroup, and the fourth field + // (fs_mntops, a comma-separated list of mount options) contains + // ctrl. + var spec, file, vfstype, mntopts, freq, passno string + const nrfields = 6 + line := scanner.Text() + n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno) + if err != nil { + return "", fmt.Errorf("failed to parse %s: %v", path, err) + } + if n != nrfields { + return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields) + } + if vfstype != "cgroup" { + continue + } + for _, mntopt := range strings.Split(mntopts, ",") { + if mntopt == ctrl { + return file, nil + } + } + } + return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl) +} + +// currentCgroup returns the cgroup for the given controller in which the +// calling process resides. The returned string is a path that should be +// interpreted as relative to cgroupRootDirectory(ctrl). +func currentCgroup(ctrl string) (string, error) { + const path = "/proc/self/cgroup" + file, err := os.Open(path) + if err != nil { + return "", err + } + defer file.Close() + + // Per proc(5) -> cgroups(7): + // Each line of /proc/self/cgroups describes a cgroup hierarchy. + scanner := bufio.NewScanner(file) + for scanner.Scan() { + // Each line consists of 3 colon-separated fields. Find the line for + // which the second field (controller-list, a comma-separated list of + // cgroup controllers) contains ctrl. + line := scanner.Text() + const nrfields = 3 + fields := strings.Split(line, ":") + if len(fields) != nrfields { + return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields) + } + for _, controller := range strings.Split(fields[1], ",") { + if controller == ctrl { + return fields[2], nil + } + } + } + return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl) +} diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go new file mode 100644 index 000000000..5432cada9 --- /dev/null +++ b/pkg/sentry/hostmm/hostmm.go @@ -0,0 +1,130 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostmm provides tools for interacting with the host Linux kernel's +// virtual memory management subsystem. +package hostmm + +import ( + "fmt" + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NotifyCurrentMemcgPressureCallback requests that f is called whenever the +// calling process' memory cgroup indicates memory pressure of the given level, +// as specified by Linux's Documentation/cgroup-v1/memory.txt. +// +// If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that +// terminates the requested memory pressure notifications. This function may be +// called at most once. +func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) { + cgdir, err := currentCgroupDirectory("memory") + if err != nil { + return nil, err + } + + pressurePath := path.Join(cgdir, "memory.pressure_level") + pressureFile, err := os.Open(pressurePath) + if err != nil { + return nil, err + } + defer pressureFile.Close() + + eventControlPath := path.Join(cgdir, "cgroup.event_control") + eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0) + if err != nil { + return nil, err + } + defer eventControlFile.Close() + + eventFD, err := newEventFD() + if err != nil { + return nil, err + } + + // Don't use fmt.Fprintf since the whole string needs to be written in a + // single syscall. + eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level) + if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil { + eventFD.Close() + return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr)) + } + + log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level) + const sizeofUint64 = 8 + // The most significant bit of the eventfd value is set by the stop + // function, which is practically unambiguous since it's not plausible for + // 2**63 pressure events to occur between eventfd reads. + const stopVal = 1 << 63 + stopCh := make(chan struct{}) + go func() { // S/R-SAFE: f provides synchronization if necessary + rw := fd.NewReadWriter(eventFD.FD()) + var buf [sizeofUint64]byte + for { + n, err := rw.Read(buf[:]) + if err != nil { + if err == syscall.EINTR { + continue + } + panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err)) + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + val := usermem.ByteOrder.Uint64(buf[:]) + if val >= stopVal { + // Assume this was due to the notifier's "destructor" (the + // function returned by NotifyCurrentMemcgPressureCallback + // below) being called. + eventFD.Close() + close(stopCh) + return + } + f() + } + }() + return func() { + rw := fd.NewReadWriter(eventFD.FD()) + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], stopVal) + for { + n, err := rw.Write(buf[:]) + if err != nil { + if err == syscall.EINTR { + continue + } + panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err)) + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + break + } + <-stopCh + }, nil +} + +func newEventFD() (*fd.FD, error) { + f, _, e := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0) + if e != 0 { + return nil, fmt.Errorf("failed to create eventfd: %v", e) + } + return fd.New(int(f)), nil +} diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 8a8a0e4e4..bbdb1f922 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -65,6 +65,7 @@ go_library( "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", + "//pkg/sentry/hostmm", "//pkg/sentry/memutil", "//pkg/sentry/platform", "//pkg/sentry/safemem", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 2b9924ad7..6d91f1a7b 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -32,6 +32,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -162,6 +163,11 @@ type MemoryFile struct { // evictionWG counts the number of goroutines currently performing evictions. evictionWG sync.WaitGroup + + // stopNotifyPressure stops memory cgroup pressure level + // notifications used to drive eviction. stopNotifyPressure is + // immutable. + stopNotifyPressure func() } // MemoryFileOpts provides options to NewMemoryFile. @@ -169,6 +175,11 @@ type MemoryFileOpts struct { // DelayedEviction controls the extent to which the MemoryFile may delay // eviction of evictable allocations. DelayedEviction DelayedEvictionType + + // If UseHostMemcgPressure is true, use host memory cgroup pressure level + // notifications to determine when eviction is necessary. This option has + // no effect unless DelayedEviction is DelayedEvictionEnabled. + UseHostMemcgPressure bool } // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. @@ -186,9 +197,14 @@ const ( // evictable allocations until doing so is considered necessary to avoid // performance degradation due to host memory pressure, or OOM kills. // - // As of this writing, DelayedEvictionEnabled delays evictions until the - // reclaimer goroutine is out of work (pages to reclaim), then evicts all - // pending evictable allocations immediately. + // As of this writing, the behavior of DelayedEvictionEnabled depends on + // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled: + // + // - If UseHostMemcgPressure is true, evictions are delayed until memory + // pressure is indicated. + // + // - Otherwise, evictions are only delayed until the reclaimer goroutine + // is out of work (pages to reclaim). DelayedEvictionEnabled // DelayedEvictionManual requires that evictable allocations are only @@ -292,6 +308,22 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { } f.mappings.Store(make([]uintptr, initialSize/chunkSize)) f.reclaimCond.L = &f.mu + + if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { + stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() { + f.mu.Lock() + startedAny := f.startEvictionsLocked() + f.mu.Unlock() + if startedAny { + log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure") + } + }, "low") + if err != nil { + return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err) + } + f.stopNotifyPressure = stop + } + go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -692,9 +724,11 @@ func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) // Kick off eviction immediately. f.startEvictionGoroutineLocked(user, info) case DelayedEvictionEnabled: - // Ensure that the reclaimer goroutine is running, so that it can - // start eviction when necessary. - f.reclaimCond.Signal() + if !f.opts.UseHostMemcgPressure { + // Ensure that the reclaimer goroutine is running, so that it + // can start eviction when necessary. + f.reclaimCond.Signal() + } } } } @@ -992,11 +1026,12 @@ func (f *MemoryFile) runReclaim() { } f.markReclaimed(fr) } + // We only get here if findReclaimable finds f.destroyed set and returns // false. f.mu.Lock() - defer f.mu.Unlock() if !f.destroyed { + f.mu.Unlock() panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") } f.file.Close() @@ -1016,6 +1051,13 @@ func (f *MemoryFile) runReclaim() { } // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) f.mappings.Store([]uintptr{}) + f.mu.Unlock() + + // This must be called without holding f.mu to avoid circular lock + // ordering. + if f.stopNotifyPressure != nil { + f.stopNotifyPressure() + } } func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { @@ -1029,7 +1071,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { if f.reclaimable { break } - if f.opts.DelayedEviction == DelayedEvictionEnabled { + if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { // No work to do. Evict any pending evictable allocations to // get more reclaimable pages before going to sleep. f.startEvictionsLocked() @@ -1089,14 +1131,17 @@ func (f *MemoryFile) StartEvictions() { } // Preconditions: f.mu must be locked. -func (f *MemoryFile) startEvictionsLocked() { +func (f *MemoryFile) startEvictionsLocked() bool { + startedAny := false for user, info := range f.evictable { // Don't start multiple goroutines to evict the same user's // allocations. if !info.evicting { f.startEvictionGoroutineLocked(user, info) + startedAny = true } } + return startedAny } // Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a997776f8..ef4ccd0bd 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -424,6 +424,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() |