// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"bytes"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/mm"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// Brk implements linux syscall brk(2).
func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr, _ := t.MemoryManager().Brk(t, args[0].Pointer())
	// "However, the actual Linux system call returns the new program break on
	// success. On failure, the system call returns the current break." -
	// brk(2)
	return uintptr(addr), nil, nil
}

// Mmap implements linux syscall mmap(2).
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	prot := args[2].Int()
	flags := args[3].Int()
	fd := kdefs.FD(args[4].Int())
	fixed := flags&linux.MAP_FIXED != 0
	private := flags&linux.MAP_PRIVATE != 0
	shared := flags&linux.MAP_SHARED != 0
	anon := flags&linux.MAP_ANONYMOUS != 0
	map32bit := flags&linux.MAP_32BIT != 0

	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
	if private == shared {
		return 0, nil, syserror.EINVAL
	}

	opts := memmap.MMapOpts{
		Length:   args[1].Uint64(),
		Offset:   args[5].Uint64(),
		Addr:     args[0].Pointer(),
		Fixed:    fixed,
		Unmap:    fixed,
		Map32Bit: map32bit,
		Private:  private,
		Perms: usermem.AccessType{
			Read:    linux.PROT_READ&prot != 0,
			Write:   linux.PROT_WRITE&prot != 0,
			Execute: linux.PROT_EXEC&prot != 0,
		},
		MaxPerms:  usermem.AnyAccess,
		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
		Precommit: linux.MAP_POPULATE&flags != 0,
	}
	if linux.MAP_LOCKED&flags != 0 {
		opts.MLockMode = memmap.MLockEager
	}
	defer func() {
		if opts.MappingIdentity != nil {
			opts.MappingIdentity.DecRef()
		}
	}()

	if !anon {
		// Convert the passed FD to a file reference.
		file := t.FDMap().GetFile(fd)
		if file == nil {
			return 0, nil, syserror.EBADF
		}
		defer file.DecRef()

		flags := file.Flags()
		// mmap unconditionally requires that the FD is readable.
		if !flags.Read {
			return 0, nil, syserror.EACCES
		}
		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
		if shared && !flags.Write {
			opts.MaxPerms.Write = false
		}

		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	}

	rv, err := t.MemoryManager().MMap(t, opts)
	return uintptr(rv), nil, err
}

// Munmap implements linux syscall munmap(2).
func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
}

// Mremap implements linux syscall mremap(2).
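//
// MREMAP_MAYMOVE and MREMAP_FIXED translate to the mm package's move modes
// below; as in Linux, MREMAP_FIXED without MREMAP_MAYMOVE is rejected with
// EINVAL.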
func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	oldSize := args[1].Uint64()
	newSize := args[2].Uint64()
	flags := args[3].Uint64()
	newAddr := args[4].Pointer()

	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
		return 0, nil, syserror.EINVAL
	}
	mayMove := flags&linux.MREMAP_MAYMOVE != 0
	fixed := flags&linux.MREMAP_FIXED != 0
	var moveMode mm.MRemapMoveMode
	switch {
	case !mayMove && !fixed:
		moveMode = mm.MRemapNoMove
	case mayMove && !fixed:
		moveMode = mm.MRemapMayMove
	case mayMove && fixed:
		moveMode = mm.MRemapMustMove
	case !mayMove && fixed:
		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
		// specified." - mremap(2)
		return 0, nil, syserror.EINVAL
	}

	rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
		Move:    moveMode,
		NewAddr: newAddr,
	})
	return uintptr(rv), nil, err
}

// Mprotect implements linux syscall mprotect(2).
func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	length := args[1].Uint64()
	prot := args[2].Int()
	err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{
		Read:    linux.PROT_READ&prot != 0,
		Write:   linux.PROT_WRITE&prot != 0,
		Execute: linux.PROT_EXEC&prot != 0,
	}, linux.PROT_GROWSDOWN&prot != 0)
	return 0, nil, err
}

// Madvise implements linux syscall madvise(2).
func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := uint64(args[1].SizeT())
	adv := args[2].Int()

	// "The Linux implementation requires that the address addr be
	// page-aligned, and allows length to be zero." - madvise(2)
	if addr.RoundDown() != addr {
		return 0, nil, syserror.EINVAL
	}
	if length == 0 {
		return 0, nil, nil
	}
	// Not explicitly stated: length need not be page-aligned.
	lenAddr, ok := usermem.Addr(length).RoundUp()
	if !ok {
		return 0, nil, syserror.EINVAL
	}
	length = uint64(lenAddr)

	switch adv {
	case linux.MADV_DONTNEED:
		return 0, nil, t.MemoryManager().Decommit(addr, length)
	case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
		fallthrough
	case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
		fallthrough
	case linux.MADV_DONTDUMP, linux.MADV_DODUMP:
		// TODO(b/72045799): Core dumping isn't implemented, so these are
		// no-ops.
		fallthrough
	case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
		// Do nothing; we ignore the suggestions above.
		return 0, nil, nil
	case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK:
		// These "suggestions" have application-visible side effects, so we
		// have to indicate that we don't support them.
		return 0, nil, syserror.ENOSYS
	case linux.MADV_HWPOISON:
		// Only privileged processes are allowed to poison pages.
		return 0, nil, syserror.EPERM
	default:
		// If adv is not a valid value, tell the caller.
		return 0, nil, syserror.EINVAL
	}
}

func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) {
	if ptr != 0 {
		return t.CopyOut(ptr, val)
	}
	return 0, nil
}

// GetMempolicy implements the syscall get_mempolicy(2).
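//
// Note: the sentry models a single NUMA node, so MPOL_F_MEMS_ALLOWED reports
// a one-node mask and MPOL_F_NODE queries always report node 0.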
func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	mode := args[0].Pointer()
	nodemask := args[1].Pointer()
	maxnode := args[2].Uint()
	addr := args[3].Pointer()
	flags := args[4].Uint()

	memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
	nodeFlag := flags&linux.MPOL_F_NODE != 0
	addrFlag := flags&linux.MPOL_F_ADDR != 0

	// TODO(rahat): Once sysfs is implemented, report a single numa node in
	// /sys/devices/system/node.
	if nodemask != 0 && maxnode < 1 {
		return 0, nil, syserror.EINVAL
	}

	// 'addr' provided iff 'addrFlag' set.
	if addrFlag == (addr == 0) {
		return 0, nil, syserror.EINVAL
	}

	// Default policy for the thread.
	if flags == 0 {
		policy, nodemaskVal := t.NumaPolicy()
		if _, err := copyOutIfNotNull(t, mode, policy); err != nil {
			return 0, nil, syserror.EFAULT
		}
		if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil {
			return 0, nil, syserror.EFAULT
		}
		return 0, nil, nil
	}

	// Report all nodes available to caller.
	if memsAllowed {
		// MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED.
		if nodeFlag || addrFlag {
			return 0, nil, syserror.EINVAL
		}

		// Report a single numa node.
		if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil {
			return 0, nil, syserror.EFAULT
		}
		return 0, nil, nil
	}

	if addrFlag {
		if nodeFlag {
			// Return the id for the node where 'addr' resides, via 'mode'.
			//
			// The real get_mempolicy(2) allocates the page referenced by 'addr'
			// by simulating a read, if it is unallocated before the call. It
			// then returns the node the page is allocated on through the mode
			// pointer.
			b := t.CopyScratchBuffer(1)
			_, err := t.CopyInBytes(addr, b)
			if err != nil {
				return 0, nil, syserror.EFAULT
			}
			if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
				return 0, nil, syserror.EFAULT
			}
		} else {
			storedPolicy, _ := t.NumaPolicy()
			// Return the policy governing the memory referenced by 'addr'.
			if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil {
				return 0, nil, syserror.EFAULT
			}
		}
		return 0, nil, nil
	}

	storedPolicy, _ := t.NumaPolicy()
	if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) {
		// Policy for current thread is to interleave memory between
		// nodes. Return the next node we'll allocate on. Since we only have a
		// single node, this is always node 0.
		if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
			return 0, nil, syserror.EFAULT
		}
		return 0, nil, nil
	}

	return 0, nil, syserror.EINVAL
}

func allowedNodesMask() uint32 {
	const maxNodes = 1
	return ^uint32((1 << maxNodes) - 1)
}

// SetMempolicy implements the syscall set_mempolicy(2).
func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	modeWithFlags := args[0].Int()
	nodemask := args[1].Pointer()
	maxnode := args[2].Uint()

	if nodemask != 0 && maxnode < 1 {
		return 0, nil, syserror.EINVAL
	}

	if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS {
		// Can't specify multiple modes simultaneously.
		return 0, nil, syserror.EINVAL
	}

	mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
	if mode < 0 || mode >= linux.MPOL_MAX {
		// Must specify a valid mode.
		return 0, nil, syserror.EINVAL
	}

	var nodemaskVal uint32
	// Nodemask may be empty for some policy modes.
	if nodemask != 0 && maxnode > 0 {
		if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil {
			return 0, nil, syserror.EFAULT
		}
	}

	if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 {
		// Mode requires a non-empty nodemask, but got an empty nodemask.
		return 0, nil, syserror.EINVAL
	}

	if nodemaskVal&allowedNodesMask() != 0 {
		// Invalid node specified.
		return 0, nil, syserror.EINVAL
	}

	t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal)
	return 0, nil, nil
}

// Mincore implements the syscall mincore(2).
func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	vec := args[2].Pointer()

	if addr != addr.RoundDown() {
		return 0, nil, syserror.EINVAL
	}
	// "The length argument need not be a multiple of the page size, but since
	// residency information is returned for whole pages, length is effectively
	// rounded up to the next multiple of the page size." - mincore(2)
	la, ok := usermem.Addr(length).RoundUp()
	if !ok {
		return 0, nil, syserror.ENOMEM
	}
	ar, ok := addr.ToRange(uint64(la))
	if !ok {
		return 0, nil, syserror.ENOMEM
	}

	// Pretend that all mapped pages are "resident in core".
	mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
	// "ENOMEM: addr to addr + length contained unmapped memory."
	if mapped != uint64(la) {
		return 0, nil, syserror.ENOMEM
	}
	resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize))
	_, err := t.CopyOut(vec, resident)
	return 0, nil, err
}

// Msync implements Linux syscall msync(2).
func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
	// permits a call to msync() that specifies neither of these flags, with
	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
	// msync(2)
	if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
		return 0, nil, syserror.EINVAL
	}
	sync := flags&linux.MS_SYNC != 0
	if sync && flags&linux.MS_ASYNC != 0 {
		return 0, nil, syserror.EINVAL
	}
	err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
		Sync:       sync,
		Invalidate: flags&linux.MS_INVALIDATE != 0,
	})
	// MSync calls fsync, so the same interrupt conversion rules apply; see
	// mm/msync.c and fsync in POSIX.1-2008.
	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}

// Mlock implements linux syscall mlock(2).
func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
}

// Mlock2 implements linux syscall mlock2(2).
func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	if flags&^(linux.MLOCK_ONFAULT) != 0 {
		return 0, nil, syserror.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MLOCK_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
}

// Munlock implements linux syscall munlock(2).
func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
}

// Mlockall implements linux syscall mlockall(2).
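//
// MCL_ONFAULT selects lazy (on-fault) locking; otherwise pages covered by
// MCL_CURRENT/MCL_FUTURE are locked eagerly, mirroring mlock2(2) above.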
func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()

	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
		return 0, nil, syserror.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MCL_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: flags&linux.MCL_CURRENT != 0,
		Future:  flags&linux.MCL_FUTURE != 0,
		Mode:    mode,
	})
}

// Munlockall implements linux syscall munlockall(2).
func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: true,
		Future:  true,
		Mode:    memmap.MLockNone,
	})
}