summary | refs | log | tree | commit | diff | homepage
path: root/pkg/sentry/syscalls
diff options
context:
space:
mode:
author Jamie Liu <jamieliu@google.com> 2018-12-17 11:37:38 -0800
committer Shentubot <shentubot@google.com> 2018-12-17 11:38:59 -0800
commit 2421006426445a1827422c2dbdd6fc6a47087147 (patch)
tree 49aa2bc113c208fc117aff8a036866a7260090e5 /pkg/sentry/syscalls
parent 54694086dfb02a6f8453f043a44ffd10bb5a7070 (diff)
Implement mlock(), kind of.
Currently mlock() and friends do nothing whatsoever. However, mlocking is
directly application-visible in a number of ways; for example,
madvise(MADV_DONTNEED) and msync(MS_INVALIDATE) both fail on mlocked regions.
We handle this inconsistently: MADV_DONTNEED is too important to not work, but
MS_INVALIDATE is rejected.

Change MM to track mlocked regions in a manner consistent with Linux. It still
will not actually pin pages into host physical memory, but:

- mlock() will now cause sentry memory management to precommit mlocked pages.
- MADV_DONTNEED and MS_INVALIDATE will interact with mlocked pages as described
  above.

PiperOrigin-RevId: 225861605
Change-Id: Iee187204979ac9a4d15d0e037c152c0902c8d0ee
Diffstat (limited to 'pkg/sentry/syscalls')
-rw-r--r-- pkg/sentry/syscalls/linux/linux64.go 15
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mmap.go 106
-rw-r--r-- pkg/sentry/syscalls/linux/sys_rlimit.go 1
3 files changed, 75 insertions, 47 deletions
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 2aab948da..cc5ebb955 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{
145: SchedGetscheduler,
146: SchedGetPriorityMax,
147: SchedGetPriorityMin,
- 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval,
- 149: syscalls.Error(nil), // Mlock, TODO
- 150: syscalls.Error(nil), // Munlock, TODO
- 151: syscalls.Error(nil), // Mlockall, TODO
- 152: syscalls.Error(nil), // Munlockall, TODO
+ 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval,
+ 149: Mlock,
+ 150: Munlock,
+ 151: Mlockall,
+ 152: Munlockall,
153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup,
154: syscalls.Error(syscall.EPERM), // ModifyLdt,
155: syscalls.Error(syscall.EPERM), // PivotRoot,
@@ -373,8 +373,9 @@ var AMD64 = &kernel.SyscallTable{
// 322: Execveat, TODO
// 323: Userfaultfd, TODO
// 324: Membarrier, TODO
- // Syscalls after 325 are backports from 4.6.
- 325: syscalls.Error(nil), // Mlock2, TODO
+ 325: Mlock2,
+ // Syscalls after 325 are "backports" from versions of Linux after 4.4.
+ // 326: CopyFileRange,
327: Preadv2,
// 328: Pwritev2, // Pwritev2, TODO
},
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 145f7846c..8732861e0 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -69,6 +69,9 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
Precommit: linux.MAP_POPULATE&flags != 0,
}
+ if linux.MAP_LOCKED&flags != 0 {
+ opts.MLockMode = memmap.MLockEager
+ }
defer func() {
if opts.MappingIdentity != nil {
opts.MappingIdentity.DecRef()
@@ -384,16 +387,6 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
length := args[1].SizeT()
flags := args[2].Int()
- if addr != addr.RoundDown() {
- return 0, nil, syserror.EINVAL
- }
- if length == 0 {
- return 0, nil, nil
- }
- la, ok := usermem.Addr(length).RoundUp()
- if !ok {
- return 0, nil, syserror.ENOMEM
- }
// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
// and may additionally include the MS_INVALIDATE bit. ... However, Linux
// permits a call to msync() that specifies neither of these flags, with
@@ -406,39 +399,72 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if sync && flags&linux.MS_ASYNC != 0 {
return 0, nil, syserror.EINVAL
}
+ err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
+ Sync: sync,
+ Invalidate: flags&linux.MS_INVALIDATE != 0,
+ })
+ // MSync calls fsync, the same interrupt conversion rules apply, see
+ // mm/msync.c, fsync POSIX.1-2008.
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// Mlock implements linux syscall mlock(2).
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+
+ return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
+}
- // MS_INVALIDATE "asks to invalidate other mappings of the same file (so
- // that they can be updated with the fresh values just written)". This is a
- // no-op given that shared memory exists. However, MS_INVALIDATE can also
- // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags,
- // and a memory lock exists for the specified address range." Given that
- // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since
- // some user program could be using it for synchronization.
- if flags&linux.MS_INVALIDATE != 0 {
+// Mlock2 implements linux syscall mlock2(2).
+func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+ flags := args[2].Int()
+
+ if flags&^(linux.MLOCK_ONFAULT) != 0 {
return 0, nil, syserror.EINVAL
}
- // MS_SYNC "requests an update and waits for it to complete."
- if sync {
- err := t.MemoryManager().Sync(t, addr, uint64(la))
- // Sync calls fsync, the same interrupt conversion rules apply, see
- // mm/msync.c, fsync POSIX.1-2008.
- return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
- }
- // MS_ASYNC "specifies that an update be scheduled, but the call returns
- // immediately". As long as dirty pages are tracked and eventually written
- // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC
- // is in fact a no-op, since the kernel properly tracks dirty pages and
- // flushes them to storage as necessary.")
- //
- // However: "ENOMEM: The indicated memory (or part of it) was not mapped."
- // This applies even for MS_ASYNC.
- ar, ok := addr.ToRange(uint64(la))
- if !ok {
- return 0, nil, syserror.ENOMEM
+
+ mode := memmap.MLockEager
+ if flags&linux.MLOCK_ONFAULT != 0 {
+ mode = memmap.MLockLazy
}
- mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
- if mapped != uint64(la) {
- return 0, nil, syserror.ENOMEM
+ return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
+}
+
+// Munlock implements linux syscall munlock(2).
+func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+
+ return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
+}
+
+// Mlockall implements linux syscall mlockall(2).
+func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+
+ if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
+ return 0, nil, syserror.EINVAL
}
- return 0, nil, nil
+
+ mode := memmap.MLockEager
+ if flags&linux.MCL_ONFAULT != 0 {
+ mode = memmap.MLockLazy
+ }
+ return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
+ Current: flags&linux.MCL_CURRENT != 0,
+ Future: flags&linux.MCL_FUTURE != 0,
+ Mode: mode,
+ })
+}
+
+// Munlockall implements linux syscall munlockall(2).
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
+ Current: true,
+ Future: true,
+ Mode: memmap.MLockNone,
+ })
}
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index 2f16e1791..b0b216045 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -90,6 +90,7 @@ var setableLimits = map[limits.LimitType]struct{}{
limits.CPU: {},
limits.Data: {},
limits.FileSize: {},
+ limits.MemoryLocked: {},
limits.Stack: {},
// These are not enforced, but we include them here to avoid returning
// EPERM, since some apps expect them to succeed.