diff options
Diffstat (limited to 'pkg/sentry/syscalls/linux/sys_mmap.go')
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_mmap.go | 470 |
1 files changed, 470 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go new file mode 100644 index 000000000..64a6e639c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -0,0 +1,470 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Brk implements linux syscall brk(2). +func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) + // "However, the actual Linux system call returns the new program break on + // success. On failure, the system call returns the current break." - + // brk(2) + return uintptr(addr), nil, nil +} + +// Mmap implements linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := kdefs.FD(args[4].Int()) + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + flags := file.Flags() + // mmap unconditionally requires that the FD is readable. + if !flags.Read { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !flags.Write { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} + +// Munmap implements linux syscall munmap(2). +func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) +} + +// Mremap implements linux syscall mremap(2). +func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + oldSize := args[1].Uint64() + newSize := args[2].Uint64() + flags := args[3].Uint64() + newAddr := args[4].Pointer() + + if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { + return 0, nil, syserror.EINVAL + } + mayMove := flags&linux.MREMAP_MAYMOVE != 0 + fixed := flags&linux.MREMAP_FIXED != 0 + var moveMode mm.MRemapMoveMode + switch { + case !mayMove && !fixed: + moveMode = mm.MRemapNoMove + case mayMove && !fixed: + moveMode = mm.MRemapMayMove + case mayMove && fixed: + moveMode = mm.MRemapMustMove + case !mayMove && fixed: + // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be + // specified." - mremap(2) + return 0, nil, syserror.EINVAL + } + + rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ + Move: moveMode, + NewAddr: newAddr, + }) + return uintptr(rv), nil, err +} + +// Mprotect implements linux syscall mprotect(2). +func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + length := args[1].Uint64() + prot := args[2].Int() + err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, linux.PROT_GROWSDOWN&prot != 0) + return 0, nil, err +} + +// Madvise implements linux syscall madvise(2). +func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := uint64(args[1].SizeT()) + adv := args[2].Int() + + // "The Linux implementation requires that the address addr be + // page-aligned, and allows length to be zero." - madvise(2) + if addr.RoundDown() != addr { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + // Not explicitly stated: length need not be page-aligned. + lenAddr, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.EINVAL + } + length = uint64(lenAddr) + + switch adv { + case linux.MADV_DONTNEED: + return 0, nil, t.MemoryManager().Decommit(addr, length) + case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: + fallthrough + case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: + fallthrough + case linux.MADV_DONTDUMP, linux.MADV_DODUMP: + // TODO(b/72045799): Core dumping isn't implemented, so these are + // no-ops. + fallthrough + case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: + // Do nothing, we totally ignore the suggestions above. + return 0, nil, nil + case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK: + // These "suggestions" have application-visible side effects, so we + // have to indicate that we don't support them. + return 0, nil, syserror.ENOSYS + case linux.MADV_HWPOISON: + // Only privileged processes are allowed to poison pages. + return 0, nil, syserror.EPERM + default: + // If adv is not a valid value tell the caller. + return 0, nil, syserror.EINVAL + } +} + +func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) { + if ptr != 0 { + return t.CopyOut(ptr, val) + } + return 0, nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + + // TODO(rahat): Once sysfs is implemented, report a single numa node in + // /sys/devices/system/node. + if nodemask != 0 && maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + // 'addr' provided iff 'addrFlag' set. + if addrFlag == (addr == 0) { + return 0, nil, syserror.EINVAL + } + + // Default policy for the thread. + if flags == 0 { + policy, nodemaskVal := t.NumaPolicy() + if _, err := copyOutIfNotNull(t, mode, policy); err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + // Report all nodes available to caller. + if memsAllowed { + // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED. + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + + // Report a single numa node. + if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + if addrFlag { + if nodeFlag { + // Return the id for the node where 'addr' resides, via 'mode'. + // + // The real get_mempolicy(2) allocates the page referenced by 'addr' + // by simulating a read, if it is unallocated before the call. It + // then returns the node the page is allocated on through the mode + // pointer. + b := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, b) + if err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + } else { + storedPolicy, _ := t.NumaPolicy() + // Return the policy governing the memory referenced by 'addr'. + if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil { + return 0, nil, syserror.EFAULT + } + } + return 0, nil, nil + } + + storedPolicy, _ := t.NumaPolicy() + if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) { + // Policy for current thread is to interleave memory between + // nodes. Return the next node we'll allocate on. Since we only have a + // single node, this is always node 0. + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + return 0, nil, syserror.EINVAL +} + +func allowedNodesMask() uint32 { + const maxNodes = 1 + return ^uint32((1 << maxNodes) - 1) +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := args[0].Int() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + if nodemask != 0 && maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { + // Can't specify multiple modes simultaneously. + return 0, nil, syserror.EINVAL + } + + mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS + if mode < 0 || mode >= linux.MPOL_MAX { + // Must specify a valid mode. + return 0, nil, syserror.EINVAL + } + + var nodemaskVal uint32 + // Nodemask may be empty for some policy modes. + if nodemask != 0 && maxnode > 0 { + if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + } + + if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 { + // Mode requires a non-empty nodemask, but got an empty nodemask. + return 0, nil, syserror.EINVAL + } + + if nodemaskVal&allowedNodesMask() != 0 { + // Invalid node specified. + return 0, nil, syserror.EINVAL + } + + t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal) + + return 0, nil, nil +} + +// Mincore implements the syscall mincore(2). +func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + vec := args[2].Pointer() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + // "The length argument need not be a multiple of the page size, but since + // residency information is returned for whole pages, length is effectively + // rounded up to the next multiple of the page size." - mincore(2) + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + + // Pretend that all mapped pages are "resident in core". + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + // "ENOMEM: addr to addr + length contained unmapped memory." + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) + _, err := t.CopyOut(vec, resident) + return 0, nil, err +} + +// Msync implements Linux syscall msync(2). +func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, + // and may additionally include the MS_INVALIDATE bit. ... However, Linux + // permits a call to msync() that specifies neither of these flags, with + // semantics that are (currently) equivalent to specifying MS_ASYNC." - + // msync(2) + if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { + return 0, nil, syserror.EINVAL + } + sync := flags&linux.MS_SYNC != 0 + if sync && flags&linux.MS_ASYNC != 0 { + return 0, nil, syserror.EINVAL + } + err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ + Sync: sync, + Invalidate: flags&linux.MS_INVALIDATE != 0, + }) + // MSync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Mlock implements linux syscall mlock(2). +func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) +} + +// Mlock2 implements linux syscall mlock2(2). +func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if flags&^(linux.MLOCK_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL + } + + mode := memmap.MLockEager + if flags&linux.MLOCK_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) +} + +// Munlock implements linux syscall munlock(2). +func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) +} + +// Mlockall implements linux syscall mlockall(2). +func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + + if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL + } + + mode := memmap.MLockEager + if flags&linux.MCL_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: flags&linux.MCL_CURRENT != 0, + Future: flags&linux.MCL_FUTURE != 0, + Mode: mode, + }) +} + +// Munlockall implements linux syscall munlockall(2). +func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: true, + Future: true, + Mode: memmap.MLockNone, + }) +} |