summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJamie Liu <jamieliu@google.com>2019-06-06 16:26:00 -0700
committerShentubot <shentubot@google.com>2019-06-06 16:27:55 -0700
commita26043ee53a2f38b81c9eaa098d115025e87f4c3 (patch)
tree666132f75a64348ca56628d635a7fc427ebc80d9
parent93aa7d11673392ca51ba69122ff5fe1aad7331b9 (diff)
Implement reclaim-driven MemoryFile eviction.
PiperOrigin-RevId: 251950660
-rw-r--r--pkg/sentry/hostmm/BUILD18
-rw-r--r--pkg/sentry/hostmm/cgroup.go111
-rw-r--r--pkg/sentry/hostmm/hostmm.go130
-rw-r--r--pkg/sentry/pgalloc/BUILD1
-rw-r--r--pkg/sentry/pgalloc/pgalloc.go63
-rw-r--r--runsc/boot/loader.go3
6 files changed, 317 insertions, 9 deletions
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
new file mode 100644
index 000000000..1a4632a54
--- /dev/null
+++ b/pkg/sentry/hostmm/BUILD
@@ -0,0 +1,18 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "hostmm",
+ srcs = [
+ "cgroup.go",
+ "hostmm.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/fd",
+ "//pkg/log",
+ "//pkg/sentry/usermem",
+ ],
+)
diff --git a/pkg/sentry/hostmm/cgroup.go b/pkg/sentry/hostmm/cgroup.go
new file mode 100644
index 000000000..e5cc26ab2
--- /dev/null
+++ b/pkg/sentry/hostmm/cgroup.go
@@ -0,0 +1,111 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmm
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path"
+ "strings"
+)
+
+// currentCgroupDirectory returns the directory for the cgroup for the given
+// controller in which the calling process resides.
+func currentCgroupDirectory(ctrl string) (string, error) {
+ root, err := cgroupRootDirectory(ctrl)
+ if err != nil {
+ return "", err
+ }
+ cg, err := currentCgroup(ctrl)
+ if err != nil {
+ return "", err
+ }
+ return path.Join(root, cg), nil
+}
+
+// cgroupRootDirectory returns the root directory for the cgroup hierarchy in
+// which the given cgroup controller is mounted in the calling process' mount
+// namespace.
+func cgroupRootDirectory(ctrl string) (string, error) {
+ const path = "/proc/self/mounts"
+ file, err := os.Open(path)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ // Per proc(5) -> fstab(5):
+ // Each line of /proc/self/mounts describes a mount.
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ // Each line consists of 6 space-separated fields. Find the line for
+ // which the third field (fs_vfstype) is cgroup, and the fourth field
+ // (fs_mntops, a comma-separated list of mount options) contains
+ // ctrl.
+ var spec, file, vfstype, mntopts, freq, passno string
+ const nrfields = 6
+ line := scanner.Text()
+ n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno)
+ if err != nil {
+ return "", fmt.Errorf("failed to parse %s: %v", path, err)
+ }
+ if n != nrfields {
+ return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields)
+ }
+ if vfstype != "cgroup" {
+ continue
+ }
+ for _, mntopt := range strings.Split(mntopts, ",") {
+ if mntopt == ctrl {
+ return file, nil
+ }
+ }
+ }
+ return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl)
+}
+
+// currentCgroup returns the cgroup for the given controller in which the
+// calling process resides. The returned string is a path that should be
+// interpreted as relative to cgroupRootDirectory(ctrl).
+func currentCgroup(ctrl string) (string, error) {
+ const path = "/proc/self/cgroup"
+ file, err := os.Open(path)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ // Per proc(5) -> cgroups(7):
+ // Each line of /proc/self/cgroups describes a cgroup hierarchy.
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ // Each line consists of 3 colon-separated fields. Find the line for
+ // which the second field (controller-list, a comma-separated list of
+ // cgroup controllers) contains ctrl.
+ line := scanner.Text()
+ const nrfields = 3
+ fields := strings.Split(line, ":")
+ if len(fields) != nrfields {
+ return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields)
+ }
+ for _, controller := range strings.Split(fields[1], ",") {
+ if controller == ctrl {
+ return fields[2], nil
+ }
+ }
+ }
+ return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl)
+}
diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go
new file mode 100644
index 000000000..5432cada9
--- /dev/null
+++ b/pkg/sentry/hostmm/hostmm.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hostmm provides tools for interacting with the host Linux kernel's
+// virtual memory management subsystem.
+package hostmm
+
+import (
+ "fmt"
+ "os"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// NotifyCurrentMemcgPressureCallback requests that f is called whenever the
+// calling process' memory cgroup indicates memory pressure of the given level,
+// as specified by Linux's Documentation/cgroup-v1/memory.txt.
+//
+// If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that
+// terminates the requested memory pressure notifications. This function may be
+// called at most once.
+func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) {
+ cgdir, err := currentCgroupDirectory("memory")
+ if err != nil {
+ return nil, err
+ }
+
+ pressurePath := path.Join(cgdir, "memory.pressure_level")
+ pressureFile, err := os.Open(pressurePath)
+ if err != nil {
+ return nil, err
+ }
+ defer pressureFile.Close()
+
+ eventControlPath := path.Join(cgdir, "cgroup.event_control")
+ eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ defer eventControlFile.Close()
+
+ eventFD, err := newEventFD()
+ if err != nil {
+ return nil, err
+ }
+
+ // Don't use fmt.Fprintf since the whole string needs to be written in a
+ // single syscall.
+ eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level)
+ if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil {
+ eventFD.Close()
+ return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr))
+ }
+
+ log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level)
+ const sizeofUint64 = 8
+ // The most significant bit of the eventfd value is set by the stop
+ // function, which is practically unambiguous since it's not plausible for
+ // 2**63 pressure events to occur between eventfd reads.
+ const stopVal = 1 << 63
+ stopCh := make(chan struct{})
+ go func() { // S/R-SAFE: f provides synchronization if necessary
+ rw := fd.NewReadWriter(eventFD.FD())
+ var buf [sizeofUint64]byte
+ for {
+ n, err := rw.Read(buf[:])
+ if err != nil {
+ if err == syscall.EINTR {
+ continue
+ }
+ panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err))
+ }
+ if n != sizeofUint64 {
+ panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64))
+ }
+ val := usermem.ByteOrder.Uint64(buf[:])
+ if val >= stopVal {
+ // Assume this was due to the notifier's "destructor" (the
+ // function returned by NotifyCurrentMemcgPressureCallback
+ // below) being called.
+ eventFD.Close()
+ close(stopCh)
+ return
+ }
+ f()
+ }
+ }()
+ return func() {
+ rw := fd.NewReadWriter(eventFD.FD())
+ var buf [sizeofUint64]byte
+ usermem.ByteOrder.PutUint64(buf[:], stopVal)
+ for {
+ n, err := rw.Write(buf[:])
+ if err != nil {
+ if err == syscall.EINTR {
+ continue
+ }
+ panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err))
+ }
+ if n != sizeofUint64 {
+ panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64))
+ }
+ break
+ }
+ <-stopCh
+ }, nil
+}
+
+func newEventFD() (*fd.FD, error) {
+ f, _, e := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0)
+ if e != 0 {
+ return nil, fmt.Errorf("failed to create eventfd: %v", e)
+ }
+ return fd.New(int(f)), nil
+}
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 8a8a0e4e4..bbdb1f922 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -65,6 +65,7 @@ go_library(
"//pkg/log",
"//pkg/sentry/arch",
"//pkg/sentry/context",
+ "//pkg/sentry/hostmm",
"//pkg/sentry/memutil",
"//pkg/sentry/platform",
"//pkg/sentry/safemem",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b9924ad7..6d91f1a7b 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -32,6 +32,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
@@ -162,6 +163,11 @@ type MemoryFile struct {
// evictionWG counts the number of goroutines currently performing evictions.
evictionWG sync.WaitGroup
+
+ // stopNotifyPressure stops memory cgroup pressure level
+ // notifications used to drive eviction. stopNotifyPressure is
+ // immutable.
+ stopNotifyPressure func()
}
// MemoryFileOpts provides options to NewMemoryFile.
@@ -169,6 +175,11 @@ type MemoryFileOpts struct {
// DelayedEviction controls the extent to which the MemoryFile may delay
// eviction of evictable allocations.
DelayedEviction DelayedEvictionType
+
+ // If UseHostMemcgPressure is true, use host memory cgroup pressure level
+ // notifications to determine when eviction is necessary. This option has
+ // no effect unless DelayedEviction is DelayedEvictionEnabled.
+ UseHostMemcgPressure bool
}
// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
@@ -186,9 +197,14 @@ const (
// evictable allocations until doing so is considered necessary to avoid
// performance degradation due to host memory pressure, or OOM kills.
//
- // As of this writing, DelayedEvictionEnabled delays evictions until the
- // reclaimer goroutine is out of work (pages to reclaim), then evicts all
- // pending evictable allocations immediately.
+ // As of this writing, the behavior of DelayedEvictionEnabled depends on
+ // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
+ //
+ // - If UseHostMemcgPressure is true, evictions are delayed until memory
+ // pressure is indicated.
+ //
+ // - Otherwise, evictions are only delayed until the reclaimer goroutine
+ // is out of work (pages to reclaim).
DelayedEvictionEnabled
// DelayedEvictionManual requires that evictable allocations are only
@@ -292,6 +308,22 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
}
f.mappings.Store(make([]uintptr, initialSize/chunkSize))
f.reclaimCond.L = &f.mu
+
+ if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
+ stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
+ f.mu.Lock()
+ startedAny := f.startEvictionsLocked()
+ f.mu.Unlock()
+ if startedAny {
+ log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
+ }
+ }, "low")
+ if err != nil {
+ return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
+ }
+ f.stopNotifyPressure = stop
+ }
+
go f.runReclaim() // S/R-SAFE: f.mu
// The Linux kernel contains an optional feature called "Integrity
@@ -692,9 +724,11 @@ func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange)
// Kick off eviction immediately.
f.startEvictionGoroutineLocked(user, info)
case DelayedEvictionEnabled:
- // Ensure that the reclaimer goroutine is running, so that it can
- // start eviction when necessary.
- f.reclaimCond.Signal()
+ if !f.opts.UseHostMemcgPressure {
+ // Ensure that the reclaimer goroutine is running, so that it
+ // can start eviction when necessary.
+ f.reclaimCond.Signal()
+ }
}
}
}
@@ -992,11 +1026,12 @@ func (f *MemoryFile) runReclaim() {
}
f.markReclaimed(fr)
}
+
// We only get here if findReclaimable finds f.destroyed set and returns
// false.
f.mu.Lock()
- defer f.mu.Unlock()
if !f.destroyed {
+ f.mu.Unlock()
panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
}
f.file.Close()
@@ -1016,6 +1051,13 @@ func (f *MemoryFile) runReclaim() {
}
// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
f.mappings.Store([]uintptr{})
+ f.mu.Unlock()
+
+ // This must be called without holding f.mu to avoid circular lock
+ // ordering.
+ if f.stopNotifyPressure != nil {
+ f.stopNotifyPressure()
+ }
}
func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
@@ -1029,7 +1071,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
if f.reclaimable {
break
}
- if f.opts.DelayedEviction == DelayedEvictionEnabled {
+ if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
// No work to do. Evict any pending evictable allocations to
// get more reclaimable pages before going to sleep.
f.startEvictionsLocked()
@@ -1089,14 +1131,17 @@ func (f *MemoryFile) StartEvictions() {
}
// Preconditions: f.mu must be locked.
-func (f *MemoryFile) startEvictionsLocked() {
+func (f *MemoryFile) startEvictionsLocked() bool {
+ startedAny := false
for user, info := range f.evictable {
// Don't start multiple goroutines to evict the same user's
// allocations.
if !info.evicting {
f.startEvictionGoroutineLocked(user, info)
+ startedAny = true
}
}
+ return startedAny
}
// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index a997776f8..ef4ccd0bd 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -424,6 +424,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return nil, fmt.Errorf("error creating memfd: %v", err)
}
memfile := os.NewFile(uintptr(memfd), memfileName)
+ // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
+ // there are memory cgroups specified, because at this point we're already
+ // in a mount namespace in which the relevant cgroupfs is not visible.
mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
if err != nil {
memfile.Close()