From d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 Mon Sep 17 00:00:00 2001 From: Googler Date: Fri, 27 Apr 2018 10:37:02 -0700 Subject: Check in gVisor. PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463 --- pkg/sentry/syscalls/linux/sys_mmap.go | 435 ++++++++++++++++++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 pkg/sentry/syscalls/linux/sys_mmap.go (limited to 'pkg/sentry/syscalls/linux/sys_mmap.go') diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go new file mode 100644 index 000000000..2c7d41de0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -0,0 +1,435 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Brk implements linux syscall brk(2). +func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) + // "However, the actual Linux system call returns the new program break on + // success. On failure, the system call returns the current break." - + // brk(2) + return uintptr(addr), nil, nil +} + +// Mmap implements linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := kdefs.FD(args[4].Int()) + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + flags := file.Flags() + // mmap unconditionally requires that the FD is readable. + if !flags.Read { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !flags.Write { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} + +// Munmap implements linux syscall munmap(2). +func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) +} + +// Mremap implements linux syscall mremap(2). +func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + oldSize := args[1].Uint64() + newSize := args[2].Uint64() + flags := args[3].Uint64() + newAddr := args[4].Pointer() + + if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { + return 0, nil, syserror.EINVAL + } + mayMove := flags&linux.MREMAP_MAYMOVE != 0 + fixed := flags&linux.MREMAP_FIXED != 0 + var moveMode mm.MRemapMoveMode + switch { + case !mayMove && !fixed: + moveMode = mm.MRemapNoMove + case mayMove && !fixed: + moveMode = mm.MRemapMayMove + case mayMove && fixed: + moveMode = mm.MRemapMustMove + case !mayMove && fixed: + // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be + // specified." - mremap(2) + return 0, nil, syserror.EINVAL + } + + rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ + Move: moveMode, + NewAddr: newAddr, + }) + return uintptr(rv), nil, err +} + +// Mprotect implements linux syscall mprotect(2). +func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + length := args[1].Uint64() + prot := args[2].Int() + err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, linux.PROT_GROWSDOWN&prot != 0) + return 0, nil, err +} + +// Madvise implements linux syscall madvise(2). +func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := uint64(args[1].SizeT()) + adv := args[2].Int() + + // "The Linux implementation requires that the address addr be + // page-aligned, and allows length to be zero." - madvise(2) + if addr.RoundDown() != addr { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + // Not explicitly stated: length need not be page-aligned. + lenAddr, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.EINVAL + } + length = uint64(lenAddr) + + switch adv { + case linux.MADV_DONTNEED: + return 0, nil, t.MemoryManager().Decommit(addr, length) + case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: + fallthrough + case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: + fallthrough + case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: + // Do nothing, we totally ignore the suggestions above. + return 0, nil, nil + case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK: + // These "suggestions" have application-visible side effects, so we + // have to indicate that we don't support them. + return 0, nil, syserror.ENOSYS + case linux.MADV_HWPOISON: + // Only privileged processes are allowed to poison pages. + return 0, nil, syserror.EPERM + default: + // If adv is not a valid value tell the caller. + return 0, nil, syserror.EINVAL + } +} + +func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) { + if ptr != 0 { + return t.CopyOut(ptr, val) + } + return 0, nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + + // TODO: Once sysfs is implemented, report a single numa node in + // /sys/devices/system/node. + if nodemask != 0 && maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + // 'addr' provided iff 'addrFlag' set. + if addrFlag == (addr == 0) { + return 0, nil, syserror.EINVAL + } + + // Default policy for the thread. + if flags == 0 { + policy, nodemaskVal := t.NumaPolicy() + if _, err := copyOutIfNotNull(t, mode, policy); err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + // Report all nodes available to caller. + if memsAllowed { + // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED. + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + + // Report a single numa node. + if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + if addrFlag { + if nodeFlag { + // Return the id for the node where 'addr' resides, via 'mode'. + // + // The real get_mempolicy(2) allocates the page referenced by 'addr' + // by simulating a read, if it is unallocated before the call. It + // then returns the node the page is allocated on through the mode + // pointer. + b := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, b) + if err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + } else { + storedPolicy, _ := t.NumaPolicy() + // Return the policy governing the memory referenced by 'addr'. + if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil { + return 0, nil, syserror.EFAULT + } + } + return 0, nil, nil + } + + storedPolicy, _ := t.NumaPolicy() + if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) { + // Policy for current thread is to interleave memory between + // nodes. Return the next node we'll allocate on. Since we only have a + // single node, this is always node 0. + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + return 0, nil, syserror.EINVAL +} + +func allowedNodesMask() uint32 { + const maxNodes = 1 + return ^uint32((1 << maxNodes) - 1) +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := args[0].Int() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + if maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { + // Can't specify multiple modes simultaneously. Must also contain a + // valid mode, which we check below. + return 0, nil, syserror.EINVAL + } + + mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS + if mode < 0 || mode >= linux.MPOL_MAX { + return 0, nil, syserror.EINVAL + } + + var nodemaskVal uint32 + if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + + // When setting MPOL_INTERLEAVE, nodemask must not be empty. + if mode == linux.MPOL_INTERLEAVE && nodemaskVal == 0 { + return 0, nil, syserror.EINVAL + } + + if nodemaskVal&allowedNodesMask() != 0 { + // Invalid node specified. + return 0, nil, syserror.EINVAL + } + + t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal) + + return 0, nil, nil +} + +// Mincore implements the syscall mincore(2). +func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + vec := args[2].Pointer() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + // "The length argument need not be a multiple of the page size, but since + // residency information is returned for whole pages, length is effectively + // rounded up to the next multiple of the page size." - mincore(2) + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + + // Pretend that all mapped pages are "resident in core". + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + // "ENOMEM: addr to addr + length contained unmapped memory." + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) + _, err := t.CopyOut(vec, resident) + return 0, nil, err +} + +// Msync implements Linux syscall msync(2). +func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, + // and may additionally include the MS_INVALIDATE bit. ... However, Linux + // permits a call to msync() that specifies neither of these flags, with + // semantics that are (currently) equivalent to specifying MS_ASYNC." - + // msync(2) + if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { + return 0, nil, syserror.EINVAL + } + sync := flags&linux.MS_SYNC != 0 + if sync && flags&linux.MS_ASYNC != 0 { + return 0, nil, syserror.EINVAL + } + + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { + return 0, nil, syserror.EINVAL + } + // MS_SYNC "requests an update and waits for it to complete." + if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + return 0, nil, nil +} -- cgit v1.2.3