From d3ed9baac0dc967eaf6d3e3f986cafe60604121a Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 5 Jun 2019 13:59:01 -0700 Subject: Implement dumpability tracking and checks We don't actually support core dumps, but some applications want to get/set dumpability, which still has an effect in procfs. Lack of support for set-uid binaries or fs creds simplifies things a bit. As-is, processes started via CreateProcess (i.e., init and sentryctl exec) have normal dumpability. I'm a bit torn on whether sentryctl exec tasks should be dumpable, but at least since they have no parent normal UID/GID checks should protect them. PiperOrigin-RevId: 251712714 --- pkg/sentry/syscalls/linux/sys_prctl.go | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/syscalls') diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 117ae1a0e..1b7e5616b 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -15,6 +15,7 @@ package linux import ( + "fmt" "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -23,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" ) // Prctl implements linux syscall prctl(2). @@ -44,6 +46,33 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) return 0, nil, err + case linux.PR_GET_DUMPABLE: + d := t.MemoryManager().Dumpability() + switch d { + case mm.NotDumpable: + return linux.SUID_DUMP_DISABLE, nil, nil + case mm.UserDumpable: + return linux.SUID_DUMP_USER, nil, nil + case mm.RootDumpable: + return linux.SUID_DUMP_ROOT, nil, nil + default: + panic(fmt.Sprintf("Unknown dumpability %v", d)) + } + + case linux.PR_SET_DUMPABLE: + var d mm.Dumpability + switch args[1].Int() { + case linux.SUID_DUMP_DISABLE: + d = mm.NotDumpable + case linux.SUID_DUMP_USER: + d = mm.UserDumpable + default: + // N.B. Userspace may not pass SUID_DUMP_ROOT. + return 0, nil, syscall.EINVAL + } + t.MemoryManager().SetDumpability(d) + return 0, nil, nil + case linux.PR_GET_KEEPCAPS: if t.Credentials().KeepCaps { return 1, nil, nil @@ -171,9 +200,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return 0, nil, t.DropBoundingCapability(cp) - case linux.PR_GET_DUMPABLE, - linux.PR_SET_DUMPABLE, - linux.PR_GET_TIMING, + case linux.PR_GET_TIMING, linux.PR_SET_TIMING, linux.PR_GET_TSC, linux.PR_SET_TSC, -- cgit v1.2.3 From b3f104507d7a04c0ca058cbcacc5ff78d853f4ba Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 6 Jun 2019 16:27:09 -0700 Subject: "Implement" mbind(2). We still only advertise a single NUMA node, and ignore mempolicy accordingly, but mbind() at least now succeeds and has effects reflected by get_mempolicy(). Also fix handling of nodemasks: round sizes to unsigned long (as documented and done by Linux), and zero trailing bits when copying them out. PiperOrigin-RevId: 251950859 --- pkg/abi/linux/mm.go | 9 + pkg/sentry/kernel/task.go | 7 +- pkg/sentry/kernel/task_sched.go | 4 +- pkg/sentry/mm/mm.go | 6 + pkg/sentry/mm/syscalls.go | 53 +++++ pkg/sentry/mm/vma.go | 3 + pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64.go | 3 +- pkg/sentry/syscalls/linux/sys_mempolicy.go | 312 +++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_mmap.go | 145 -------------- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/mempolicy.cc | 37 +++- 12 files changed, 426 insertions(+), 155 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/sys_mempolicy.go (limited to 'pkg/sentry/syscalls') diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index 0b02f938a..cd043dac3 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -114,3 +114,12 @@ const ( MPOL_MODE_FLAGS = (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) ) + +// Flags for mbind(2). +const ( + MPOL_MF_STRICT = 1 << 0 + MPOL_MF_MOVE = 1 << 1 + MPOL_MF_MOVE_ALL = 1 << 2 + + MPOL_MF_VALID = MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL +) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f9378c2de..4d889422f 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -455,12 +455,13 @@ type Task struct { // single numa node, all policies are no-ops. We only track this information // so that we can return reasonable values if the application calls // get_mempolicy(2) after setting a non-default policy. Note that in the - // real syscall, nodemask can be longer than 4 bytes, but we always report a - // single node so never need to save more than a single bit. + // real syscall, nodemask can be longer than a single unsigned long, but we + // always report a single node so never need to save more than a single + // bit. // // numaPolicy and numaNodeMask are protected by mu. numaPolicy int32 - numaNodeMask uint32 + numaNodeMask uint64 // If netns is true, the task is in a non-root network namespace. Network // namespaces aren't currently implemented in full; being in a network diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 5455f6ea9..1c94ab11b 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -622,14 +622,14 @@ func (t *Task) SetNiceness(n int) { } // NumaPolicy returns t's current numa policy. -func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) { +func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() return t.numaPolicy, t.numaNodeMask } // SetNumaPolicy sets t's numa policy. -func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) { +func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() t.numaPolicy = policy diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 0a026ff8c..604866d04 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -276,6 +276,12 @@ type vma struct { mlockMode memmap.MLockMode + // numaPolicy is the NUMA policy for this vma set by mbind(). + numaPolicy int32 + + // numaNodemask is the NUMA nodemask for this vma set by mbind(). + numaNodemask uint64 + // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index af1e53f5d..9cf136532 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -973,6 +973,59 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error return nil } +// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR). +func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + vseg := mm.vmas.FindSegment(addr) + if !vseg.Ok() { + return 0, 0, syserror.EFAULT + } + vma := vseg.ValuePtr() + return vma.numaPolicy, vma.numaNodemask, nil +} + +// SetNumaPolicy implements the semantics of Linux's mbind(). +func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error { + if !addr.IsPageAligned() { + return syserror.EINVAL + } + // Linux allows this to overflow. + la, _ := usermem.Addr(length).RoundUp() + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + if ar.Length() == 0 { + return nil + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + }() + vseg := mm.vmas.LowerBoundSegment(ar.Start) + lastEnd := ar.Start + for { + if !vseg.Ok() || lastEnd < vseg.Start() { + // "EFAULT: ... there was an unmapped hole in the specified memory + // range specified [sic] by addr and len." - mbind(2) + return syserror.EFAULT + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + vma.numaPolicy = policy + vma.numaNodemask = nodemask + lastEnd = vseg.End() + if ar.End <= lastEnd { + return nil + } + vseg, _ = vseg.NextNonEmpty() + } +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 02203f79f..0af8de5b0 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -107,6 +107,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp private: opts.Private, growsDown: opts.GrowsDown, mlockMode: opts.MLockMode, + numaPolicy: linux.MPOL_DEFAULT, id: opts.MappingIdentity, hint: opts.Hint, } @@ -436,6 +437,8 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || vma1.mlockMode != vma2.mlockMode || + vma1.numaPolicy != vma2.numaPolicy || + vma1.numaNodemask != vma2.numaNodemask || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index f76989ae2..1c057526b 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -19,6 +19,7 @@ go_library( "sys_identity.go", "sys_inotify.go", "sys_lseek.go", + "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", "sys_pipe.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 3e4d312af..ad88b1391 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -360,8 +360,7 @@ var AMD64 = &kernel.SyscallTable{ 235: Utimes, // @Syscall(Vserver, note:Not implemented by Linux) 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux - // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295) - 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice + 237: Mbind, 238: SetMempolicy, 239: GetMempolicy, // 240: @Syscall(MqOpen), TODO(b/29354921) diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go new file mode 100644 index 000000000..652b2c206 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -0,0 +1,312 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// We unconditionally report a single NUMA node. This also means that our +// "nodemask_t" is a single unsigned long (uint64). +const ( + maxNodes = 1 + allowedNodemask = (1 << maxNodes) - 1 +) + +func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) { + // "nodemask points to a bit mask of node IDs that contains up to maxnode + // bits. The bit mask size is rounded to the next multiple of + // sizeof(unsigned long), but the kernel will use bits only up to maxnode. + // A NULL value of nodemask or a maxnode value of zero specifies the empty + // set of nodes. If the value of maxnode is zero, the nodemask argument is + // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate + // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses + // maxnode-1, not maxnode, as the number of bits. + bits := maxnode - 1 + if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0 + return 0, syserror.EINVAL + } + if bits == 0 { + return 0, nil + } + // Copy in the whole nodemask. + numUint64 := (bits + 63) / 64 + buf := t.CopyScratchBuffer(int(numUint64) * 8) + if _, err := t.CopyInBytes(addr, buf); err != nil { + return 0, err + } + val := usermem.ByteOrder.Uint64(buf) + // Check that only allowed bits in the first unsigned long in the nodemask + // are set. + if val&^allowedNodemask != 0 { + return 0, syserror.EINVAL + } + // Check that all remaining bits in the nodemask are 0. + for i := 8; i < len(buf); i++ { + if buf[i] != 0 { + return 0, syserror.EINVAL + } + } + return val, nil +} + +func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error { + // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of + // bits. + bits := maxnode - 1 + if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0 + return syserror.EINVAL + } + if bits == 0 { + return nil + } + // Copy out the first unsigned long in the nodemask. + buf := t.CopyScratchBuffer(8) + usermem.ByteOrder.PutUint64(buf, val) + if _, err := t.CopyOutBytes(addr, buf); err != nil { + return err + } + // Zero out remaining unsigned longs in the nodemask. + if bits > 64 { + remAddr, ok := addr.AddLength(8) + if !ok { + return syserror.EFAULT + } + remUint64 := (bits - 1) / 64 + if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return err + } + } + return nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { + return 0, nil, syserror.EINVAL + } + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + + // "EINVAL: The value specified by maxnode is less than the number of node + // IDs supported by the system." - get_mempolicy(2) + if nodemask != 0 && maxnode < maxNodes { + return 0, nil, syserror.EINVAL + } + + // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is + // ignored and the set of nodes (memories) that the thread is allowed to + // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the + // absence of any mode flags) is returned in nodemask." + if memsAllowed { + // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either + // MPOL_F_ADDR or MPOL_F_NODE." + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { + return 0, nil, err + } + return 0, nil, nil + } + + // "If flags specifies MPOL_F_ADDR, then information is returned about the + // policy governing the memory address given in addr. ... If the mode + // argument is not NULL, then get_mempolicy() will store the policy mode + // and any optional mode flags of the requested NUMA policy in the location + // pointed to by this argument. If nodemask is not NULL, then the nodemask + // associated with the policy will be stored in the location pointed to by + // this argument." + if addrFlag { + policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr) + if err != nil { + return 0, nil, err + } + if nodeFlag { + // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR, + // get_mempolicy() will return the node ID of the node on which the + // address addr is allocated into the location pointed to by mode. + // If no page has yet been allocated for the specified address, + // get_mempolicy() will allocate a page as if the thread had + // performed a read (load) access to that address, and return the + // ID of the node where that page was allocated." + buf := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, buf) + if err != nil { + return 0, nil, err + } + policy = 0 // maxNodes == 1 + } + if mode != 0 { + if _, err := t.CopyOut(mode, policy); err != nil { + return 0, nil, err + } + } + if nodemask != 0 { + if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil + } + + // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did + // not specify MPOL_F_ADDR and addr is not NULL." This is partially + // inaccurate: if flags specifies MPOL_F_ADDR, + // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will + // just (usually) fail to find a VMA at address 0 and return EFAULT. + if addr != 0 { + return 0, nil, syserror.EINVAL + } + + // "If flags is specified as 0, then information about the calling thread's + // default policy (as set by set_mempolicy(2)) is returned, in the buffers + // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but + // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE, + // then get_mempolicy() will return in the location pointed to by a + // non-NULL mode argument, the node ID of the next node that will be used + // for interleaving of internal kernel pages allocated on behalf of the + // thread." + policy, nodemaskVal := t.NumaPolicy() + if nodeFlag { + if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { + return 0, nil, syserror.EINVAL + } + policy = 0 // maxNodes == 1 + } + if mode != 0 { + if _, err := t.CopyOut(mode, policy); err != nil { + return 0, nil, err + } + } + if nodemask != 0 { + if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := args[0].Int() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode) + if err != nil { + return 0, nil, err + } + + t.SetNumaPolicy(modeWithFlags, nodemaskVal) + return 0, nil, nil +} + +// Mbind implements the syscall mbind(2). +func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Uint64() + mode := args[2].Int() + nodemask := args[3].Pointer() + maxnode := args[4].Uint() + flags := args[5].Uint() + + if flags&^linux.MPOL_MF_VALID != 0 { + return 0, nil, syserror.EINVAL + } + // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be + // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) + if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { + return 0, nil, syserror.EPERM + } + + mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) + if err != nil { + return 0, nil, err + } + + // Since we claim to have only a single node, all flags can be ignored + // (since all pages must already be on that single node). + err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal) + return 0, nil, err +} + +func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) { + flags := modeWithFlags & linux.MPOL_MODE_FLAGS + mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS + if flags == linux.MPOL_MODE_FLAGS { + // Can't specify both mode flags simultaneously. + return 0, 0, syserror.EINVAL + } + if mode < 0 || mode >= linux.MPOL_MAX { + // Must specify a valid mode. + return 0, 0, syserror.EINVAL + } + + var nodemaskVal uint64 + if nodemask != 0 { + var err error + nodemaskVal, err = copyInNodemask(t, nodemask, maxnode) + if err != nil { + return 0, 0, err + } + } + + switch mode { + case linux.MPOL_DEFAULT: + // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; + // Linux allows a nodemask to be specified, as long as it is empty. + if nodemaskVal != 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: + // These require a non-empty nodemask. + if nodemaskVal == 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_PREFERRED: + // This permits an empty nodemask, as long as no flags are set. + if nodemaskVal == 0 && flags != 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_LOCAL: + // This requires an empty nodemask and no flags set ... + if nodemaskVal != 0 || flags != 0 { + return 0, 0, syserror.EINVAL + } + // ... and is implemented as MPOL_PREFERRED. + mode = linux.MPOL_PREFERRED + default: + // Unknown mode, which we should have rejected above. + panic(fmt.Sprintf("unknown mode: %v", mode)) + } + + return mode | flags, nodemaskVal, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 64a6e639c..9926f0ac5 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -204,151 +204,6 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } } -func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) { - if ptr != 0 { - return t.CopyOut(ptr, val) - } - return 0, nil -} - -// GetMempolicy implements the syscall get_mempolicy(2). -func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - mode := args[0].Pointer() - nodemask := args[1].Pointer() - maxnode := args[2].Uint() - addr := args[3].Pointer() - flags := args[4].Uint() - - memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 - nodeFlag := flags&linux.MPOL_F_NODE != 0 - addrFlag := flags&linux.MPOL_F_ADDR != 0 - - // TODO(rahat): Once sysfs is implemented, report a single numa node in - // /sys/devices/system/node. - if nodemask != 0 && maxnode < 1 { - return 0, nil, syserror.EINVAL - } - - // 'addr' provided iff 'addrFlag' set. - if addrFlag == (addr == 0) { - return 0, nil, syserror.EINVAL - } - - // Default policy for the thread. - if flags == 0 { - policy, nodemaskVal := t.NumaPolicy() - if _, err := copyOutIfNotNull(t, mode, policy); err != nil { - return 0, nil, syserror.EFAULT - } - if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil { - return 0, nil, syserror.EFAULT - } - return 0, nil, nil - } - - // Report all nodes available to caller. - if memsAllowed { - // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED. - if nodeFlag || addrFlag { - return 0, nil, syserror.EINVAL - } - - // Report a single numa node. - if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil { - return 0, nil, syserror.EFAULT - } - return 0, nil, nil - } - - if addrFlag { - if nodeFlag { - // Return the id for the node where 'addr' resides, via 'mode'. - // - // The real get_mempolicy(2) allocates the page referenced by 'addr' - // by simulating a read, if it is unallocated before the call. It - // then returns the node the page is allocated on through the mode - // pointer. - b := t.CopyScratchBuffer(1) - _, err := t.CopyInBytes(addr, b) - if err != nil { - return 0, nil, syserror.EFAULT - } - if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { - return 0, nil, syserror.EFAULT - } - } else { - storedPolicy, _ := t.NumaPolicy() - // Return the policy governing the memory referenced by 'addr'. - if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil { - return 0, nil, syserror.EFAULT - } - } - return 0, nil, nil - } - - storedPolicy, _ := t.NumaPolicy() - if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) { - // Policy for current thread is to interleave memory between - // nodes. Return the next node we'll allocate on. Since we only have a - // single node, this is always node 0. - if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { - return 0, nil, syserror.EFAULT - } - return 0, nil, nil - } - - return 0, nil, syserror.EINVAL -} - -func allowedNodesMask() uint32 { - const maxNodes = 1 - return ^uint32((1 << maxNodes) - 1) -} - -// SetMempolicy implements the syscall set_mempolicy(2). -func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - modeWithFlags := args[0].Int() - nodemask := args[1].Pointer() - maxnode := args[2].Uint() - - if nodemask != 0 && maxnode < 1 { - return 0, nil, syserror.EINVAL - } - - if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { - // Can't specify multiple modes simultaneously. - return 0, nil, syserror.EINVAL - } - - mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS - if mode < 0 || mode >= linux.MPOL_MAX { - // Must specify a valid mode. - return 0, nil, syserror.EINVAL - } - - var nodemaskVal uint32 - // Nodemask may be empty for some policy modes. - if nodemask != 0 && maxnode > 0 { - if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { - return 0, nil, syserror.EFAULT - } - } - - if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 { - // Mode requires a non-empty nodemask, but got an empty nodemask. - return 0, nil, syserror.EINVAL - } - - if nodemaskVal&allowedNodesMask() != 0 { - // Invalid node specified. - return 0, nil, syserror.EINVAL - } - - t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal) - - return 0, nil, nil -} - // Mincore implements the syscall mincore(2). func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 0cb7b47b6..9bafc6e4f 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -999,6 +999,7 @@ cc_binary( linkstatic = 1, deps = [ "//test/util:cleanup", + "//test/util:memory_util", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc index 4ac4cb88f..9d5f47651 100644 --- a/test/syscalls/linux/mempolicy.cc +++ b/test/syscalls/linux/mempolicy.cc @@ -18,6 +18,7 @@ #include "gtest/gtest.h" #include "absl/memory/memory.h" #include "test/util/cleanup.h" +#include "test/util/memory_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -34,7 +35,7 @@ namespace { #define MPOL_PREFERRED 1 #define MPOL_BIND 2 #define MPOL_INTERLEAVE 3 -#define MPOL_MAX MPOL_INTERLEAVE +#define MPOL_LOCAL 4 #define MPOL_F_NODE (1 << 0) #define MPOL_F_ADDR (1 << 1) #define MPOL_F_MEMS_ALLOWED (1 << 2) @@ -44,11 +45,17 @@ namespace { int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr, int flags) { - return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags); + return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags); } int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) { - return syscall(__NR_set_mempolicy, mode, nmask, maxnode); + return syscall(SYS_set_mempolicy, mode, nmask, maxnode); +} + +int mbind(void *addr, unsigned long len, int mode, + const unsigned long *nodemask, unsigned long maxnode, + unsigned flags) { + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); } // Creates a cleanup object that resets the calling thread's mempolicy to the @@ -252,6 +259,30 @@ TEST(MempolicyTest, GetMempolicyNextInterleaveNode) { EXPECT_EQ(0, mode); } +TEST(MempolicyTest, Mbind) { + // Temporarily set the thread policy to MPOL_PREFERRED. + const auto cleanup_thread_policy = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(MPOL_PREFERRED, nullptr, 0)); + + const auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS)); + + // vmas default to MPOL_DEFAULT irrespective of the thread policy (currently + // MPOL_PREFERRED). + int mode; + ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR), + SyscallSucceeds()); + EXPECT_EQ(mode, MPOL_DEFAULT); + + // Set MPOL_PREFERRED for the vma and read it back. + ASSERT_THAT( + mbind(mapping.ptr(), mapping.len(), MPOL_PREFERRED, nullptr, 0, 0), + SyscallSucceeds()); + ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR), + SyscallSucceeds()); + EXPECT_EQ(mode, MPOL_PREFERRED); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 315cf9a523d409dc6ddd5ce25f8f0315068ccc67 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 6 Jun 2019 16:59:21 -0700 Subject: Use common definition of SockType. SockType isn't specific to unix domain sockets, and the current definition basically mirrors the linux ABI's definition. PiperOrigin-RevId: 251956740 --- pkg/abi/linux/socket.go | 18 +++++++++++------- pkg/sentry/fs/gofer/socket.go | 11 ++++++----- pkg/sentry/fs/host/socket.go | 11 ++++++----- pkg/sentry/socket/epsocket/BUILD | 1 - pkg/sentry/socket/epsocket/epsocket.go | 11 +++++------ pkg/sentry/socket/epsocket/provider.go | 7 +++---- pkg/sentry/socket/hostinet/BUILD | 1 - pkg/sentry/socket/hostinet/socket.go | 5 ++--- pkg/sentry/socket/netlink/provider.go | 7 +++---- pkg/sentry/socket/rpcinet/BUILD | 1 - pkg/sentry/socket/rpcinet/socket.go | 9 ++++----- pkg/sentry/socket/socket.go | 8 ++++---- pkg/sentry/socket/unix/transport/connectioned.go | 20 ++++++++++---------- pkg/sentry/socket/unix/transport/connectionless.go | 4 ++-- pkg/sentry/socket/unix/transport/unix.go | 22 ++++------------------ pkg/sentry/socket/unix/unix.go | 4 ++-- pkg/sentry/strace/socket.go | 14 +++++++------- pkg/sentry/syscalls/linux/sys_socket.go | 4 ++-- 18 files changed, 71 insertions(+), 87 deletions(-) (limited to 'pkg/sentry/syscalls') diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 44bd69df6..a714ac86d 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -102,15 +102,19 @@ const ( SOL_NETLINK = 270 ) +// A SockType is a type (as opposed to family) of sockets. These are enumerated +// below as SOCK_* constants. +type SockType int + // Socket types, from linux/net.h. const ( - SOCK_STREAM = 1 - SOCK_DGRAM = 2 - SOCK_RAW = 3 - SOCK_RDM = 4 - SOCK_SEQPACKET = 5 - SOCK_DCCP = 6 - SOCK_PACKET = 10 + SOCK_STREAM SockType = 1 + SOCK_DGRAM = 2 + SOCK_RAW = 3 + SOCK_RDM = 4 + SOCK_SEQPACKET = 5 + SOCK_DCCP = 6 + SOCK_PACKET = 10 ) // SOCK_TYPE_MASK covers all of the above socket types. The remaining bits are diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 7376fd76f..7ac0a421f 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -15,6 +15,7 @@ package gofer import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -61,13 +62,13 @@ type endpoint struct { path string } -func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) { +func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { switch t { - case transport.SockStream: + case linux.SOCK_STREAM: return p9.StreamSocket, true - case transport.SockSeqpacket: + case linux.SOCK_SEQPACKET: return p9.SeqpacketSocket, true - case transport.SockDgram: + case linux.SOCK_DGRAM: return p9.DgramSocket, true } return 0, false @@ -75,7 +76,7 @@ func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) { // BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { - cf, ok := unixSockToP9(ce.Type()) + cf, ok := sockTypeToP9(ce.Type()) if !ok { return syserr.ErrConnectionRefused } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index e4ec0f62c..6423ad938 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -19,6 +19,7 @@ import ( "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/log" @@ -56,7 +57,7 @@ type ConnectedEndpoint struct { srfd int `state:"wait"` // stype is the type of Unix socket. - stype transport.SockType + stype linux.SockType // sndbuf is the size of the send buffer. // @@ -105,7 +106,7 @@ func (c *ConnectedEndpoint) init() *syserr.Error { return syserr.ErrInvalidEndpointState } - c.stype = transport.SockType(stype) + c.stype = linux.SockType(stype) c.sndbuf = sndbuf return nil @@ -163,7 +164,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil + return unixsocket.NewWithDirent(ctx, d, ep, e.stype != linux.SOCK_STREAM, flags), nil } // newSocket allocates a new unix socket with host endpoint. @@ -195,7 +196,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil + return unixsocket.New(ctx, ep, e.stype != linux.SOCK_STREAM), nil } // Send implements transport.ConnectedEndpoint.Send. @@ -209,7 +210,7 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.Contro // Since stream sockets don't preserve message boundaries, we can write // only as much of the message as fits in the send buffer. - truncate := c.stype == transport.SockStream + truncate := c.stype == linux.SOCK_STREAM n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate) if n < totalLen && err == nil { diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 44bb97b5b..7e2679ea0 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -32,7 +32,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", - "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f91c5127a..e1e29de35 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -44,7 +44,6 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" @@ -228,7 +227,7 @@ type SocketOperations struct { family int Endpoint tcpip.Endpoint - skType transport.SockType + skType linux.SockType // readMu protects access to the below fields. readMu sync.Mutex `state:"nosave"` @@ -253,8 +252,8 @@ type SocketOperations struct { } // New creates a new endpoint socket. -func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { - if skType == transport.SockStream { +func New(t *kernel.Task, family int, skType linux.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { + if skType == linux.SOCK_STREAM { if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -638,7 +637,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -664,7 +663,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_TYPE: diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index ec930d8d5..e48a106ea 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -23,7 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -42,7 +41,7 @@ type provider struct { // getTransportProtocol figures out transport protocol. Currently only TCP, // UDP, and ICMP are supported. -func getTransportProtocol(ctx context.Context, stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { +func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != syscall.IPPROTO_TCP { @@ -80,7 +79,7 @@ func getTransportProtocol(ctx context.Context, stype transport.SockType, protoco } // Socket creates a new socket object for the AF_INET or AF_INET6 family. -func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Fail right away if we don't have a stack. stack := t.NetworkContext() if stack == nil { @@ -116,7 +115,7 @@ func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int } // Pair just returns nil sockets (not supported). -func (*provider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { +func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index a469af7ac..975f47bc3 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", - "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 0d75580a3..4517951a0 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -30,7 +30,6 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -548,7 +547,7 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Check that we are using the host network stack. stack := t.NetworkContext() if stack == nil { @@ -590,7 +589,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 76cf12fd4..863edc241 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" ) @@ -66,10 +65,10 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. -func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Netlink sockets must be specified as datagram or raw, but they // behave the same regardless of type. - if stype != transport.SockDgram && stype != transport.SockRaw { + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { return nil, syserr.ErrSocketNotSupported } @@ -94,7 +93,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol } // Pair implements socket.Provider.Pair by returning an error. -func (*socketProvider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { +func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { // Netlink sockets never supports creating socket pairs. return nil, nil, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 4da14a1e0..33ba20de7 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -31,7 +31,6 @@ go_library( "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/rpcinet/conn", "//pkg/sentry/socket/rpcinet/notifier", - "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index bf42bdf69..2d5b5b58f 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -32,7 +32,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" @@ -70,7 +69,7 @@ type socketOperations struct { var _ = socket.Socket(&socketOperations{}) // New creates a new RPC socket. -func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) { +func newSocketFile(ctx context.Context, stack *Stack, family int, skType linux.SockType, protocol int) (*fs.File, *syserr.Error) { id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) <-c @@ -841,7 +840,7 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Check that we are using the RPC network stack. stack := t.NetworkContext() if stack == nil { @@ -857,7 +856,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p // // Try to restrict the flags we will accept to minimize backwards // incompatibility with netstack. - stype := int(stypeflags) & linux.SOCK_TYPE_MASK + stype := stypeflags & linux.SOCK_TYPE_MASK switch stype { case syscall.SOCK_STREAM: switch protocol { @@ -881,7 +880,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index a99423365..f1021ec67 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -130,12 +130,12 @@ type Provider interface { // If a nil Socket _and_ a nil error is returned, it means that the // protocol is not supported. A non-nil error should only be returned // if the protocol is supported, but an error occurs during creation. - Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) + Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) // Pair creates a pair of connected sockets. // // See Socket for error information. - Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) + Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) } // families holds a map of all known address families and their providers. @@ -149,7 +149,7 @@ func RegisterProvider(family int, provider Provider) { } // New creates a new socket with the given family, type and protocol. -func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { for _, p := range families[family] { s, err := p.Socket(t, stype, protocol) if err != nil { @@ -166,7 +166,7 @@ func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*f // Pair creates a new connected socket pair with the given family, type and // protocol. -func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { providers, ok := families[family] if !ok { return nil, nil, syserr.ErrAddressFamilyNotSupported diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 9c8ec0365..db79ac904 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -45,7 +45,7 @@ type ConnectingEndpoint interface { // Type returns the socket type, typically either SockStream or // SockSeqpacket. The connection attempt must be aborted if this // value doesn't match the ConnectableEndpoint's type. - Type() SockType + Type() linux.SockType // GetLocalAddress returns the bound path. GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) @@ -101,7 +101,7 @@ type connectionedEndpoint struct { // stype is used by connecting sockets to ensure that they are the // same type. The value is typically either tcpip.SockSeqpacket or // tcpip.SockStream. - stype SockType + stype linux.SockType // acceptedChan is per the TCP endpoint implementation. Note that the // sockets in this channel are _already in the connected state_, and @@ -112,7 +112,7 @@ type connectionedEndpoint struct { } // NewConnectioned creates a new unbound connectionedEndpoint. -func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint { +func NewConnectioned(stype linux.SockType, uid UniqueIDProvider) Endpoint { return &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, id: uid.UniqueID(), @@ -122,7 +122,7 @@ func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint { } // NewPair allocates a new pair of connected unix-domain connectionedEndpoints. -func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { +func NewPair(stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { a := &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, id: uid.UniqueID(), @@ -139,7 +139,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} - if stype == SockStream { + if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}} } else { @@ -163,7 +163,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { // NewExternal creates a new externally backed Endpoint. It behaves like a // socketpair. -func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { +func NewExternal(stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { return &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, id: uid.UniqueID(), @@ -178,7 +178,7 @@ func (e *connectionedEndpoint) ID() uint64 { } // Type implements ConnectingEndpoint.Type and Endpoint.Type. -func (e *connectionedEndpoint) Type() SockType { +func (e *connectionedEndpoint) Type() linux.SockType { return e.stype } @@ -294,7 +294,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} - if e.stype == SockStream { + if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { ne.receiver = &queueReceiver{readQueue: writeQueue} @@ -309,7 +309,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur writeQueue: writeQueue, } readQueue.IncRef() - if e.stype == SockStream { + if e.stype == linux.SOCK_STREAM { returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) } else { returnConnect(&queueReceiver{readQueue: readQueue}, connected) @@ -429,7 +429,7 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syser func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) { // Stream sockets do not support specifying the endpoint. Seqpacket // sockets ignore the passed endpoint. - if e.stype == SockStream && to != nil { + if e.stype == linux.SOCK_STREAM && to != nil { return 0, syserr.ErrNotSupported } return e.baseEndpoint.SendMsg(data, c, to) diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index c034cf984..81ebfba10 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -119,8 +119,8 @@ func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to Bo } // Type implements Endpoint.Type. -func (e *connectionlessEndpoint) Type() SockType { - return SockDgram +func (e *connectionlessEndpoint) Type() linux.SockType { + return linux.SOCK_DGRAM } // Connect attempts to connect directly to server. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 5fc09af55..5c55c529e 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -19,6 +19,7 @@ import ( "sync" "sync/atomic" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -28,21 +29,6 @@ import ( // initialLimit is the starting limit for the socket buffers. const initialLimit = 16 * 1024 -// A SockType is a type (as opposed to family) of sockets. These are enumerated -// in the syscall package as syscall.SOCK_* constants. -type SockType int - -const ( - // SockStream corresponds to syscall.SOCK_STREAM. - SockStream SockType = 1 - // SockDgram corresponds to syscall.SOCK_DGRAM. - SockDgram SockType = 2 - // SockRaw corresponds to syscall.SOCK_RAW. - SockRaw SockType = 3 - // SockSeqpacket corresponds to syscall.SOCK_SEQPACKET. - SockSeqpacket SockType = 5 -) - // A RightsControlMessage is a control message containing FDs. type RightsControlMessage interface { // Clone returns a copy of the RightsControlMessage. @@ -175,7 +161,7 @@ type Endpoint interface { // Type return the socket type, typically either SockStream, SockDgram // or SockSeqpacket. - Type() SockType + Type() linux.SockType // GetLocalAddress returns the address to which the endpoint is bound. GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) @@ -629,7 +615,7 @@ type connectedEndpoint struct { GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) // Type implements Endpoint.Type. - Type() SockType + Type() linux.SockType } writeQueue *queue @@ -653,7 +639,7 @@ func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, } truncate := false - if e.endpoint.Type() == SockStream { + if e.endpoint.Type() == linux.SOCK_STREAM { // Since stream sockets don't preserve message boundaries, we // can write only as much of the message as fits in the queue. truncate = true diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 375542350..56ed63e21 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -605,7 +605,7 @@ func (s *SocketOperations) State() uint32 { type provider struct{} // Socket returns a new unix domain socket. -func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, syserr.ErrProtocolNotSupported @@ -631,7 +631,7 @@ func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) } // Pair creates a new pair of AF_UNIX connected sockets. -func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, nil, syserr.ErrProtocolNotSupported diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index dbe53b9a2..0b5ef84c4 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -76,13 +76,13 @@ var SocketFamily = abi.ValueSet{ // SocketType are the possible socket(2) types. var SocketType = abi.ValueSet{ - linux.SOCK_STREAM: "SOCK_STREAM", - linux.SOCK_DGRAM: "SOCK_DGRAM", - linux.SOCK_RAW: "SOCK_RAW", - linux.SOCK_RDM: "SOCK_RDM", - linux.SOCK_SEQPACKET: "SOCK_SEQPACKET", - linux.SOCK_DCCP: "SOCK_DCCP", - linux.SOCK_PACKET: "SOCK_PACKET", + uint64(linux.SOCK_STREAM): "SOCK_STREAM", + uint64(linux.SOCK_DGRAM): "SOCK_DGRAM", + uint64(linux.SOCK_RAW): "SOCK_RAW", + uint64(linux.SOCK_RDM): "SOCK_RDM", + uint64(linux.SOCK_SEQPACKET): "SOCK_SEQPACKET", + uint64(linux.SOCK_DCCP): "SOCK_DCCP", + uint64(linux.SOCK_PACKET): "SOCK_PACKET", } // SocketFlagSet are the possible socket(2) flags. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 8f4dbf3bc..31295a6a9 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -188,7 +188,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Create the new socket. - s, e := socket.New(t, domain, transport.SockType(stype&0xf), protocol) + s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } @@ -227,7 +227,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Create the socket pair. - s1, s2, e := socket.Pair(t, domain, transport.SockType(stype&0xf), protocol) + s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } -- cgit v1.2.3 From 74e397e39accbeb9fcb5382ce963df55014ae7ce Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Mon, 10 Jun 2019 23:37:32 -0700 Subject: Add introspection for Linux/AMD64 syscalls Adds simple introspection for syscall compatibility information to Linux/AMD64. Syscalls registered in the syscall table now have associated metadata like name, support level, notes, and URLs to relevant issues. Syscall information can be exported as a table, JSON, or CSV using the new 'runsc help syscalls' command. Users can use this info to debug and get info on the compatibility of the version of runsc they are running or to generate documentation. PiperOrigin-RevId: 252558304 --- WORKSPACE | 2 +- go.mod | 8 +- pkg/abi/linux/capability.go | 84 ++++ pkg/sentry/kernel/syscalls.go | 60 ++- pkg/sentry/kernel/table_test.go | 8 +- pkg/sentry/syscalls/linux/linux64.go | 755 +++++++++++++++-------------------- pkg/sentry/syscalls/syscalls.go | 88 +++- runsc/cmd/BUILD | 3 + runsc/cmd/help.go | 126 ++++++ runsc/cmd/syscalls.go | 347 ++++++++++++++++ runsc/main.go | 4 +- 11 files changed, 1032 insertions(+), 453 deletions(-) create mode 100644 runsc/cmd/help.go create mode 100644 runsc/cmd/syscalls.go (limited to 'pkg/sentry/syscalls') diff --git a/WORKSPACE b/WORKSPACE index 421453a87..5155dc527 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -62,7 +62,7 @@ go_repository( go_repository( name = "com_github_google_subcommands", - commit = "ce3d4cfc062f", + commit = "636abe8753b8", importpath = "github.com/google/subcommands", ) diff --git a/go.mod b/go.mod index 9a6b27c6b..e58b84cfb 100644 --- a/go.mod +++ b/go.mod @@ -3,19 +3,19 @@ module gvisor.googlesource.com/gvisor go 1.12 require ( - github.com/cenkalti/backoff v2.2.0 + github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 github.com/golang/mock v1.3.1 github.com/golang/protobuf v1.3.1 github.com/google/btree v1.0.0 github.com/google/go-cmp v0.2.0 - github.com/google/subcommands v0.0.0-20170224175846-ce3d4cfc062f + github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8 github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d github.com/kr/pty v1.1.1 github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 - golang.org/x/net v0.0.0-20180404174746-b3c676e531a6 - golang.org/x/sys v0.0.0-20171117071000-0dd5e194bbf5 + golang.org/x/net v0.0.0-20190311183353-d8887717615a + golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a ) diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index c120cac64..65dd77e6e 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -69,6 +69,90 @@ func (cp Capability) Ok() bool { return cp >= 0 && cp <= MaxCapability } +// String returns the capability name. +func (cp Capability) String() string { + switch cp { + case CAP_CHOWN: + return "CAP_CHOWN" + case CAP_DAC_OVERRIDE: + return "CAP_DAC_OVERRIDE" + case CAP_DAC_READ_SEARCH: + return "CAP_DAC_READ_SEARCH" + case CAP_FOWNER: + return "CAP_FOWNER" + case CAP_FSETID: + return "CAP_FSETID" + case CAP_KILL: + return "CAP_KILL" + case CAP_SETGID: + return "CAP_SETGID" + case CAP_SETUID: + return "CAP_SETUID" + case CAP_SETPCAP: + return "CAP_SETPCAP" + case CAP_LINUX_IMMUTABLE: + return "CAP_LINUX_IMMUTABLE" + case CAP_NET_BIND_SERVICE: + return "CAP_NET_BIND_SERVICE" + case CAP_NET_BROADCAST: + return "CAP_NET_BROADCAST" + case CAP_NET_ADMIN: + return "CAP_NET_ADMIN" + case CAP_NET_RAW: + return "CAP_NET_RAW" + case CAP_IPC_LOCK: + return "CAP_IPC_LOCK" + case CAP_IPC_OWNER: + return "CAP_IPC_OWNER" + case CAP_SYS_MODULE: + return "CAP_SYS_MODULE" + case CAP_SYS_RAWIO: + return "CAP_SYS_RAWIO" + case CAP_SYS_CHROOT: + return "CAP_SYS_CHROOT" + case CAP_SYS_PTRACE: + return "CAP_SYS_PTRACE" + case CAP_SYS_PACCT: + return "CAP_SYS_PACCT" + case CAP_SYS_ADMIN: + return "CAP_SYS_ADMIN" + case CAP_SYS_BOOT: + return "CAP_SYS_BOOT" + case CAP_SYS_NICE: + return "CAP_SYS_NICE" + case CAP_SYS_RESOURCE: + return "CAP_SYS_RESOURCE" + case CAP_SYS_TIME: + return "CAP_SYS_TIME" + case CAP_SYS_TTY_CONFIG: + return "CAP_SYS_TTY_CONFIG" + case CAP_MKNOD: + return "CAP_MKNOD" + case CAP_LEASE: + return "CAP_LEASE" + case CAP_AUDIT_WRITE: + return "CAP_AUDIT_WRITE" + case CAP_AUDIT_CONTROL: + return "CAP_AUDIT_CONTROL" + case CAP_SETFCAP: + return "CAP_SETFCAP" + case CAP_MAC_OVERRIDE: + return "CAP_MAC_OVERRIDE" + case CAP_MAC_ADMIN: + return "CAP_MAC_ADMIN" + case CAP_SYSLOG: + return "CAP_SYSLOG" + case CAP_WAKE_ALARM: + return "CAP_WAKE_ALARM" + case CAP_BLOCK_SUSPEND: + return "CAP_BLOCK_SUSPEND" + case CAP_AUDIT_READ: + return "CAP_AUDIT_READ" + default: + return "UNKNOWN" + } +} + // Version numbers used by the capget/capset syscalls, defined in Linux's // include/uapi/linux/capability.h. const ( diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 0572053db..27cd3728b 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -32,6 +32,51 @@ import ( // syscall. const maxSyscallNum = 2000 +// SyscallSupportLevel is a syscall support levels. +type SyscallSupportLevel int + +// String returns a human readable represetation of the support level. +func (l SyscallSupportLevel) String() string { + switch l { + case SupportUnimplemented: + return "Unimplemented" + case SupportPartial: + return "Partial Support" + case SupportFull: + return "Full Support" + default: + return "Undocumented" + } +} + +const ( + // SupportUndocumented indicates the syscall is not documented yet. + SupportUndocumented = iota + + // SupportUnimplemented indicates the syscall is unimplemented. + SupportUnimplemented + + // SupportPartial indicates the syscall is partially supported. + SupportPartial + + // SupportFull indicates the syscall is fully supported. + SupportFull +) + +// Syscall includes the syscall implementation and compatibility information. +type Syscall struct { + // Name is the syscall name. + Name string + // Fn is the implementation of the syscall. + Fn SyscallFn + // SupportLevel is the level of support implemented in gVisor. + SupportLevel SyscallSupportLevel + // Note describes the compatibility of the syscall. + Note string + // URLs is set of URLs to any relevant bugs or issues. + URLs []string +} + // SyscallFn is a syscall implementation. type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) @@ -83,7 +128,7 @@ type SyscallFlagsTable struct { // Init initializes the struct, with all syscalls in table set to enable. // // max is the largest syscall number in table. -func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { +func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { e.enable = make([]uint32, max+1) for num := range table { e.enable[num] = syscallPresent @@ -194,7 +239,7 @@ type SyscallTable struct { AuditNumber uint32 `state:"manual"` // Table is the collection of functions. - Table map[uintptr]SyscallFn `state:"manual"` + Table map[uintptr]Syscall `state:"manual"` // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). It is used for fast look ups. @@ -247,7 +292,7 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { func RegisterSyscallTable(s *SyscallTable) { if s.Table == nil { // Ensure non-nil lookup table. - s.Table = make(map[uintptr]SyscallFn) + s.Table = make(map[uintptr]Syscall) } if s.Emulate == nil { // Ensure non-nil emulate table. @@ -268,8 +313,8 @@ func RegisterSyscallTable(s *SyscallTable) { s.lookup = make([]SyscallFn, max+1) // Initialize the fast-lookup table. - for num, fn := range s.Table { - s.lookup[num] = fn + for num, sc := range s.Table { + s.lookup[num] = sc.Fn } s.FeatureEnable.init(s.Table, max) @@ -303,5 +348,8 @@ func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { // mapLookup is similar to Lookup, except that it only uses the syscall table, // that is, it skips the fast look array. This is available for benchmarking. func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { - return s.Table[sysno] + if sc, ok := s.Table[sysno]; ok { + return sc.Fn + } + return nil } diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go index 8f7cdb9f3..3f2b042c8 100644 --- a/pkg/sentry/kernel/table_test.go +++ b/pkg/sentry/kernel/table_test.go @@ -26,11 +26,13 @@ const ( ) func createSyscallTable() *SyscallTable { - m := make(map[uintptr]SyscallFn) + m := make(map[uintptr]Syscall) for i := uintptr(0); i <= maxTestSyscall; i++ { j := i - m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { - return j, nil, nil + m[i] = Syscall{ + Fn: func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + }, } } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index ad88b1391..5251c2463 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -34,33 +34,6 @@ const _AUDIT_ARCH_X86_64 = 0xc000003e // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall // numbers from Linux 4.4. The entries commented out are those syscalls we // don't currently support. -// -// Syscall support is documented as annotations in Go comments of the form: -// @Syscall(, , ...) -// -// Supported args and values are: -// -// - arg: A syscall option. This entry only applies to the syscall when given -// this option. -// - support: Indicates support level -// - UNIMPLEMENTED: Unimplemented (default, implies returns:ENOSYS) -// - PARTIAL: Partial support. Details should be provided in note. -// - FULL: Full support -// - returns: Indicates a known return value. Values are syscall errors. This -// is treated as a string so you can use something like -// "returns:EPERM or ENOSYS". -// - issue: A Github issue number. -// - note: A note -// -// Example: -// // @Syscall(mmap, arg:MAP_PRIVATE, support:FULL, note:Private memory fully supported) -// // @Syscall(mmap, arg:MAP_SHARED, issue:123, note:Shared memory not supported) -// // @Syscall(setxattr, returns:ENOTSUP, note:Requires file system support) -// -// Annotations should be placed as close to their implementation as possible -// (preferrably as part of a supporting function's Godoc) and should be -// updated as syscall support changes. Unimplemented syscalls are documented -// here due to their lack of a supporting function or method. var AMD64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.AMD64, @@ -74,404 +47,338 @@ var AMD64 = &kernel.SyscallTable{ Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016", }, AuditNumber: _AUDIT_ARCH_X86_64, - Table: map[uintptr]kernel.SyscallFn{ - 0: Read, - 1: Write, - 2: Open, - 3: Close, - 4: Stat, - 5: Fstat, - 6: Lstat, - 7: Poll, - 8: Lseek, - 9: Mmap, - 10: Mprotect, - 11: Munmap, - 12: Brk, - 13: RtSigaction, - 14: RtSigprocmask, - 15: RtSigreturn, - 16: Ioctl, - 17: Pread64, - 18: Pwrite64, - 19: Readv, - 20: Writev, - 21: Access, - 22: Pipe, - 23: Select, - 24: SchedYield, - 25: Mremap, - 26: Msync, - 27: Mincore, - 28: Madvise, - 29: Shmget, - 30: Shmat, - 31: Shmctl, - 32: Dup, - 33: Dup2, - 34: Pause, - 35: Nanosleep, - 36: Getitimer, - 37: Alarm, - 38: Setitimer, - 39: Getpid, - 40: Sendfile, - 41: Socket, - 42: Connect, - 43: Accept, - 44: SendTo, - 45: RecvFrom, - 46: SendMsg, - 47: RecvMsg, - 48: Shutdown, - 49: Bind, - 50: Listen, - 51: GetSockName, - 52: GetPeerName, - 53: SocketPair, - 54: SetSockOpt, - 55: GetSockOpt, - 56: Clone, - 57: Fork, - 58: Vfork, - 59: Execve, - 60: Exit, - 61: Wait4, - 62: Kill, - 63: Uname, - 64: Semget, - 65: Semop, - 66: Semctl, - 67: Shmdt, - // 68: @Syscall(Msgget), TODO(b/29354921) - // 69: @Syscall(Msgsnd), TODO(b/29354921) - // 70: @Syscall(Msgrcv), TODO(b/29354921) - // 71: @Syscall(Msgctl), TODO(b/29354921) - 72: Fcntl, - 73: Flock, - 74: Fsync, - 75: Fdatasync, - 76: Truncate, - 77: Ftruncate, - 78: Getdents, - 79: Getcwd, - 80: Chdir, - 81: Fchdir, - 82: Rename, - 83: Mkdir, - 84: Rmdir, - 85: Creat, - 86: Link, - 87: Unlink, - 88: Symlink, - 89: Readlink, - 90: Chmod, - 91: Fchmod, - 92: Chown, - 93: Fchown, - 94: Lchown, - 95: Umask, - 96: Gettimeofday, - 97: Getrlimit, - 98: Getrusage, - 99: Sysinfo, - 100: Times, - 101: Ptrace, - 102: Getuid, - 103: Syslog, - 104: Getgid, - 105: Setuid, - 106: Setgid, - 107: Geteuid, - 108: Getegid, - 109: Setpgid, - 110: Getppid, - 111: Getpgrp, - 112: Setsid, - 113: Setreuid, - 114: Setregid, - 115: Getgroups, - 116: Setgroups, - 117: Setresuid, - 118: Getresuid, - 119: Setresgid, - 120: Getresgid, - 121: Getpgid, - // 122: @Syscall(Setfsuid), TODO(b/112851702) - // 123: @Syscall(Setfsgid), TODO(b/112851702) - 124: Getsid, - 125: Capget, - 126: Capset, - 127: RtSigpending, - 128: RtSigtimedwait, - 129: RtSigqueueinfo, - 130: RtSigsuspend, - 131: Sigaltstack, - 132: Utime, - 133: Mknod, - // @Syscall(Uselib, note:Obsolete) - 134: syscalls.Error(syscall.ENOSYS), - // @Syscall(SetPersonality, returns:EINVAL, note:Unable to change personality) - 135: syscalls.ErrorWithEvent(syscall.EINVAL), - // @Syscall(Ustat, note:Needs filesystem support) - 136: syscalls.ErrorWithEvent(syscall.ENOSYS), - 137: Statfs, - 138: Fstatfs, - // 139: @Syscall(Sysfs), TODO(gvisor.dev/issue/165) - 140: Getpriority, - 141: Setpriority, - // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) - 142: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice - 143: SchedGetparam, - 144: SchedSetscheduler, - 145: SchedGetscheduler, - 146: SchedGetPriorityMax, - 147: SchedGetPriorityMin, - // @Syscall(SchedRrGetInterval, returns:EPERM) - 148: syscalls.ErrorWithEvent(syscall.EPERM), - 149: Mlock, - 150: Munlock, - 151: Mlockall, - 152: Munlockall, - // @Syscall(Vhangup, returns:EPERM) - 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), - // @Syscall(ModifyLdt, returns:EPERM) - 154: syscalls.Error(syscall.EPERM), - // @Syscall(PivotRoot, returns:EPERM) - 155: syscalls.Error(syscall.EPERM), - // @Syscall(Sysctl, returns:EPERM) - 156: syscalls.Error(syscall.EPERM), // syscall is "worthless" - 157: Prctl, - 158: ArchPrctl, - // @Syscall(Adjtimex, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise) - 159: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time - 160: Setrlimit, - 161: Chroot, - 162: Sync, - // @Syscall(Acct, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_pacct; ENOSYS otherwise) - 163: syscalls.CapError(linux.CAP_SYS_PACCT), // requires cap_sys_pacct - // @Syscall(Settimeofday, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise) - 164: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time - 165: Mount, - 166: Umount2, - // @Syscall(Swapon, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin - // @Syscall(Swapoff, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin - // @Syscall(Reboot, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) - 169: syscalls.CapError(linux.CAP_SYS_BOOT), // requires cap_sys_boot - 170: Sethostname, - 171: Setdomainname, - // @Syscall(Iopl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise) - 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio - // @Syscall(Ioperm, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise) - 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio - // @Syscall(CreateModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module - // @Syscall(InitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 175: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module - // @Syscall(DeleteModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 176: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module - // @Syscall(GetKernelSyms, note:Not supported in > 2.6) - 177: syscalls.Error(syscall.ENOSYS), - // @Syscall(QueryModule, note:Not supported in > 2.6) - 178: syscalls.Error(syscall.ENOSYS), - // @Syscall(Quotactl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin (most operations) - // @Syscall(Nfsservctl, note:Does not exist > 3.1) - 180: syscalls.Error(syscall.ENOSYS), - // @Syscall(Getpmsg, note:Not implemented in Linux) - 181: syscalls.Error(syscall.ENOSYS), - // @Syscall(Putpmsg, note:Not implemented in Linux) - 182: syscalls.Error(syscall.ENOSYS), - // @Syscall(AfsSyscall, note:Not implemented in Linux) - 183: syscalls.Error(syscall.ENOSYS), - // @Syscall(Tuxcall, note:Not implemented in Linux) - 184: syscalls.Error(syscall.ENOSYS), - // @Syscall(Security, note:Not implemented in Linux) - 185: syscalls.Error(syscall.ENOSYS), - 186: Gettid, - 187: nil, // @Syscall(Readahead), TODO(b/29351341) - // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support) - 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support) - 189: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Fsetxattr, returns:ENOTSUP, note:Requires filesystem support) - 190: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Getxattr, returns:ENOTSUP, note:Requires filesystem support) - 191: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Lgetxattr, returns:ENOTSUP, note:Requires filesystem support) - 192: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Fgetxattr, returns:ENOTSUP, note:Requires filesystem support) - 193: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Listxattr, returns:ENOTSUP, note:Requires filesystem support) - 194: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Llistxattr, returns:ENOTSUP, note:Requires filesystem support) - 195: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Flistxattr, returns:ENOTSUP, note:Requires filesystem support) - 196: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Removexattr, returns:ENOTSUP, note:Requires filesystem support) - 197: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Lremovexattr, returns:ENOTSUP, note:Requires filesystem support) - 198: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Fremovexattr, returns:ENOTSUP, note:Requires filesystem support) - 199: syscalls.ErrorWithEvent(syscall.ENOTSUP), - 200: Tkill, - 201: Time, - 202: Futex, - 203: SchedSetaffinity, - 204: SchedGetaffinity, - // @Syscall(SetThreadArea, note:Expected to return ENOSYS on 64-bit) - 205: syscalls.Error(syscall.ENOSYS), - 206: IoSetup, - 207: IoDestroy, - 208: IoGetevents, - 209: IoSubmit, - 210: IoCancel, - // @Syscall(GetThreadArea, note:Expected to return ENOSYS on 64-bit) - 211: syscalls.Error(syscall.ENOSYS), - // @Syscall(LookupDcookie, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin - 213: EpollCreate, - // @Syscall(EpollCtlOld, note:Deprecated) - 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused) - // @Syscall(EpollWaitOld, note:Deprecated) - 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused) - // @Syscall(RemapFilePages, note:Deprecated) - 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated since 3.16 - 217: Getdents64, - 218: SetTidAddress, - 219: RestartSyscall, - // 220: @Syscall(Semtimedop), TODO(b/29354920) - 221: Fadvise64, - 222: TimerCreate, - 223: TimerSettime, - 224: TimerGettime, - 225: TimerGetoverrun, - 226: TimerDelete, - 227: ClockSettime, - 228: ClockGettime, - 229: ClockGetres, - 230: ClockNanosleep, - 231: ExitGroup, - 232: EpollWait, - 233: EpollCtl, - 234: Tgkill, - 235: Utimes, - // @Syscall(Vserver, note:Not implemented by Linux) - 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux - 237: Mbind, - 238: SetMempolicy, - 239: GetMempolicy, - // 240: @Syscall(MqOpen), TODO(b/29354921) - // 241: @Syscall(MqUnlink), TODO(b/29354921) - // 242: @Syscall(MqTimedsend), TODO(b/29354921) - // 243: @Syscall(MqTimedreceive), TODO(b/29354921) - // 244: @Syscall(MqNotify), TODO(b/29354921) - // 245: @Syscall(MqGetsetattr), TODO(b/29354921) - 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot - 247: Waitid, - // @Syscall(AddKey, returns:EACCES, note:Not available to user) - 248: syscalls.Error(syscall.EACCES), - // @Syscall(RequestKey, returns:EACCES, note:Not available to user) - 249: syscalls.Error(syscall.EACCES), - // @Syscall(Keyctl, returns:EACCES, note:Not available to user) - 250: syscalls.Error(syscall.EACCES), - // @Syscall(IoprioSet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending) - // @Syscall(IoprioGet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending) - 253: InotifyInit, - 254: InotifyAddWatch, - 255: InotifyRmWatch, - // @Syscall(MigratePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) - 256: syscalls.CapError(linux.CAP_SYS_NICE), - 257: Openat, - 258: Mkdirat, - 259: Mknodat, - 260: Fchownat, - 261: Futimesat, - 262: Fstatat, - 263: Unlinkat, - 264: Renameat, - 265: Linkat, - 266: Symlinkat, - 267: Readlinkat, - 268: Fchmodat, - 269: Faccessat, - 270: Pselect, - 271: Ppoll, - 272: Unshare, - // @Syscall(SetRobustList, note:Obsolete) - 273: syscalls.Error(syscall.ENOSYS), - // @Syscall(GetRobustList, note:Obsolete) - 274: syscalls.Error(syscall.ENOSYS), - 275: Splice, - // 276: @Syscall(Tee), TODO(b/29354098) - 277: SyncFileRange, - // 278: @Syscall(Vmsplice), TODO(b/29354098) - // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) - 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly) - 280: Utimensat, - 281: EpollPwait, - // 282: @Syscall(Signalfd), TODO(b/19846426) - 283: TimerfdCreate, - 284: Eventfd, - 285: Fallocate, - 286: TimerfdSettime, - 287: TimerfdGettime, - 288: Accept4, - // 289: @Syscall(Signalfd4), TODO(b/19846426) - 290: Eventfd2, - 291: EpollCreate1, - 292: Dup3, - 293: Pipe2, - 294: InotifyInit1, - 295: Preadv, - 296: Pwritev, - 297: RtTgsigqueueinfo, - // @Syscall(PerfEventOpen, returns:ENODEV, note:No support for perf counters) - 298: syscalls.ErrorWithEvent(syscall.ENODEV), - 299: RecvMMsg, - // @Syscall(FanotifyInit, note:Needs CONFIG_FANOTIFY) - 300: syscalls.ErrorWithEvent(syscall.ENOSYS), - // @Syscall(FanotifyMark, note:Needs CONFIG_FANOTIFY) - 301: syscalls.ErrorWithEvent(syscall.ENOSYS), - 302: Prlimit64, - // @Syscall(NameToHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support) - 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), - // @Syscall(OpenByHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support) - 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), - // @Syscall(ClockAdjtime, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time - 306: Syncfs, - 307: SendMMsg, - // 308: @Syscall(Setns), TODO(b/29354995) - 309: Getcpu, - // 310: @Syscall(ProcessVmReadv), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace - // 311: @Syscall(ProcessVmWritev), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace - // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace) - 312: syscalls.CapError(linux.CAP_SYS_PTRACE), - // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 313: syscalls.CapError(linux.CAP_SYS_MODULE), - // 314: @Syscall(SchedSetattr), TODO(b/118902272), we have no scheduler - // 315: @Syscall(SchedGetattr), TODO(b/118902272), we have no scheduler - // 316: @Syscall(Renameat2), TODO(b/118902772) - 317: Seccomp, - 318: GetRandom, - 319: MemfdCreate, - // @Syscall(KexecFileLoad, EPERM or ENOSYS, note:Infeasible to support. Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) - 320: syscalls.CapError(linux.CAP_SYS_BOOT), - // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) - 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands - // 322: @Syscall(Execveat), TODO(b/118901836) - // 323: @Syscall(Userfaultfd), TODO(b/118906345) - // 324: @Syscall(Membarrier), TODO(b/118904897) - 325: Mlock2, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.Supported("read", Read), + 1: syscalls.Supported("write", Write), + 2: syscalls.Supported("open", Open), + 3: syscalls.Supported("close", Close), + 4: syscalls.Undocumented("stat", Stat), + 5: syscalls.Undocumented("fstat", Fstat), + 6: syscalls.Undocumented("lstat", Lstat), + 7: syscalls.Undocumented("poll", Poll), + 8: syscalls.Undocumented("lseek", Lseek), + 9: syscalls.Undocumented("mmap", Mmap), + 10: syscalls.Undocumented("mprotect", Mprotect), + 11: syscalls.Undocumented("munmap", Munmap), + 12: syscalls.Undocumented("brk", Brk), + 13: syscalls.Undocumented("rt_sigaction", RtSigaction), + 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn), + 16: syscalls.Undocumented("ioctl", Ioctl), + 17: syscalls.Undocumented("pread64", Pread64), + 18: syscalls.Undocumented("pwrite64", Pwrite64), + 19: syscalls.Undocumented("readv", Readv), + 20: syscalls.Undocumented("writev", Writev), + 21: syscalls.Undocumented("access", Access), + 22: syscalls.Undocumented("pipe", Pipe), + 23: syscalls.Undocumented("select", Select), + 24: syscalls.Undocumented("sched_yield", SchedYield), + 25: syscalls.Undocumented("mremap", Mremap), + 26: syscalls.Undocumented("msync", Msync), + 27: syscalls.Undocumented("mincore", Mincore), + 28: syscalls.Undocumented("madvise", Madvise), + 29: syscalls.Undocumented("shmget", Shmget), + 30: syscalls.Undocumented("shmat", Shmat), + 31: syscalls.Undocumented("shmctl", Shmctl), + 32: syscalls.Undocumented("dup", Dup), + 33: syscalls.Undocumented("dup2", Dup2), + 34: syscalls.Undocumented("pause", Pause), + 35: syscalls.Undocumented("nanosleep", Nanosleep), + 36: syscalls.Undocumented("getitimer", Getitimer), + 37: syscalls.Undocumented("alarm", Alarm), + 38: syscalls.Undocumented("setitimer", Setitimer), + 39: syscalls.Undocumented("getpid", Getpid), + 40: syscalls.Undocumented("sendfile", Sendfile), + 41: syscalls.Undocumented("socket", Socket), + 42: syscalls.Undocumented("connect", Connect), + 43: syscalls.Undocumented("accept", Accept), + 44: syscalls.Undocumented("sendto", SendTo), + 45: syscalls.Undocumented("recvfrom", RecvFrom), + 46: syscalls.Undocumented("sendmsg", SendMsg), + 47: syscalls.Undocumented("recvmsg", RecvMsg), + 48: syscalls.Undocumented("shutdown", Shutdown), + 49: syscalls.Undocumented("bind", Bind), + 50: syscalls.Undocumented("listen", Listen), + 51: syscalls.Undocumented("getsockname", GetSockName), + 52: syscalls.Undocumented("getpeername", GetPeerName), + 53: syscalls.Undocumented("socketpair", SocketPair), + 54: syscalls.Undocumented("setsockopt", SetSockOpt), + 55: syscalls.Undocumented("getsockopt", GetSockOpt), + 56: syscalls.Undocumented("clone", Clone), + 57: syscalls.Undocumented("fork", Fork), + 58: syscalls.Undocumented("vfork", Vfork), + 59: syscalls.Undocumented("execve", Execve), + 60: syscalls.Undocumented("exit", Exit), + 61: syscalls.Undocumented("wait4", Wait4), + 62: syscalls.Undocumented("kill", Kill), + 63: syscalls.Undocumented("uname", Uname), + 64: syscalls.Undocumented("semget", Semget), + 65: syscalls.Undocumented("semop", Semop), + 66: syscalls.Undocumented("semctl", Semctl), + 67: syscalls.Undocumented("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.Undocumented("fcntl", Fcntl), + 73: syscalls.Undocumented("flock", Flock), + 74: syscalls.Undocumented("fsync", Fsync), + 75: syscalls.Undocumented("fdatasync", Fdatasync), + 76: syscalls.Undocumented("truncate", Truncate), + 77: syscalls.Undocumented("ftruncate", Ftruncate), + 78: syscalls.Undocumented("getdents", Getdents), + 79: syscalls.Undocumented("getcwd", Getcwd), + 80: syscalls.Undocumented("chdir", Chdir), + 81: syscalls.Undocumented("fchdir", Fchdir), + 82: syscalls.Undocumented("rename", Rename), + 83: syscalls.Undocumented("mkdir", Mkdir), + 84: syscalls.Undocumented("rmdir", Rmdir), + 85: syscalls.Undocumented("creat", Creat), + 86: syscalls.Undocumented("link", Link), + 87: syscalls.Undocumented("link", Unlink), + 88: syscalls.Undocumented("symlink", Symlink), + 89: syscalls.Undocumented("readlink", Readlink), + 90: syscalls.Undocumented("chmod", Chmod), + 91: syscalls.Undocumented("fchmod", Fchmod), + 92: syscalls.Undocumented("chown", Chown), + 93: syscalls.Undocumented("fchown", Fchown), + 94: syscalls.Undocumented("lchown", Lchown), + 95: syscalls.Undocumented("umask", Umask), + 96: syscalls.Undocumented("gettimeofday", Gettimeofday), + 97: syscalls.Undocumented("getrlimit", Getrlimit), + 98: syscalls.Undocumented("getrusage", Getrusage), + 99: syscalls.Undocumented("sysinfo", Sysinfo), + 100: syscalls.Undocumented("times", Times), + 101: syscalls.Undocumented("ptrace", Ptrace), + 102: syscalls.Undocumented("getuid", Getuid), + 103: syscalls.Undocumented("syslog", Syslog), + 104: syscalls.Undocumented("getgid", Getgid), + 105: syscalls.Undocumented("setuid", Setuid), + 106: syscalls.Undocumented("setgid", Setgid), + 107: syscalls.Undocumented("geteuid", Geteuid), + 108: syscalls.Undocumented("getegid", Getegid), + 109: syscalls.Undocumented("setpgid", Setpgid), + 110: syscalls.Undocumented("getppid", Getppid), + 111: syscalls.Undocumented("getpgrp", Getpgrp), + 112: syscalls.Undocumented("setsid", Setsid), + 113: syscalls.Undocumented("setreuid", Setreuid), + 114: syscalls.Undocumented("setregid", Setregid), + 115: syscalls.Undocumented("getgroups", Getgroups), + 116: syscalls.Undocumented("setgroups", Setgroups), + 117: syscalls.Undocumented("setresuid", Setresuid), + 118: syscalls.Undocumented("getresuid", Getresuid), + 119: syscalls.Undocumented("setresgid", Setresgid), + 120: syscalls.Undocumented("setresgid", Getresgid), + 121: syscalls.Undocumented("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Undocumented("getsid", Getsid), + 125: syscalls.Undocumented("capget", Capget), + 126: syscalls.Undocumented("capset", Capset), + 127: syscalls.Undocumented("rt_sigpending", RtSigpending), + 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Undocumented("sigaltstack", Sigaltstack), + 132: syscalls.Undocumented("utime", Utime), + 133: syscalls.Undocumented("mknod", Mknod), + 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.Undocumented("statfs", Statfs), + 138: syscalls.Undocumented("fstatfs", Fstatfs), + 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.Undocumented("getpriority", Getpriority), + 141: syscalls.Undocumented("setpriority", Setpriority), + 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 143: syscalls.Undocumented("sched_getparam", SchedGetparam), + 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler), + 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler), + 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax), + 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil), + 149: syscalls.Undocumented("mlock", Mlock), + 150: syscalls.Undocumented("munlock", Munlock), + 151: syscalls.Undocumented("mlockall", Mlockall), + 152: syscalls.Undocumented("munlockall", Munlockall), + 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil), + 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil), + 157: syscalls.Undocumented("prctl", Prctl), + 158: syscalls.Undocumented("arch_prctl", ArchPrctl), + 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 160: syscalls.Undocumented("setrlimit", Setrlimit), + 161: syscalls.Undocumented("chroot", Chroot), + 162: syscalls.Undocumented("sync", Sync), + 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 165: syscalls.Undocumented("mount", Mount), + 166: syscalls.Undocumented("umount2", Umount2), + 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 170: syscalls.Undocumented("sethostname", Sethostname), + 171: syscalls.Undocumented("setdomainname", Setdomainname), + 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), + 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), + 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), + 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil), + 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil), + 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil), + 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), + 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), + 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil), + 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil), + 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil), + 186: syscalls.Undocumented("gettid", Gettid), + 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) + 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 200: syscalls.Undocumented("tkill", Tkill), + 201: syscalls.Undocumented("time", Time), + 202: syscalls.Undocumented("futex", Futex), + 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity), + 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity), + 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.Undocumented("io_setup", IoSetup), + 207: syscalls.Undocumented("io_destroy", IoDestroy), + 208: syscalls.Undocumented("io_getevents", IoGetevents), + 209: syscalls.Undocumented("io_submit", IoSubmit), + 210: syscalls.Undocumented("io_cancel", IoCancel), + 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 213: syscalls.Undocumented("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil), + 217: syscalls.Undocumented("getdents64", Getdents64), + 218: syscalls.Undocumented("set_tid_address", SetTidAddress), + 219: syscalls.Undocumented("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 221: syscalls.Undocumented("fadvise64", Fadvise64), + 222: syscalls.Undocumented("timer_create", TimerCreate), + 223: syscalls.Undocumented("timer_settime", TimerSettime), + 224: syscalls.Undocumented("timer_gettime", TimerGettime), + 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Undocumented("timer_delete", TimerDelete), + 227: syscalls.Undocumented("clock_settime", ClockSettime), + 228: syscalls.Undocumented("clock_gettime", ClockGettime), + 229: syscalls.Undocumented("clock_getres", ClockGetres), + 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep), + 231: syscalls.Undocumented("exit_group", ExitGroup), + 232: syscalls.Undocumented("epoll_wait", EpollWait), + 233: syscalls.Undocumented("epoll_ctl", EpollCtl), + 234: syscalls.Undocumented("tgkill", Tgkill), + 235: syscalls.Undocumented("utimes", Utimes), + 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil), + 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 238: syscalls.Undocumented("set_mempolicy", SetMempolicy), + 239: syscalls.Undocumented("get_mempolicy", GetMempolicy), + 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 247: syscalls.Undocumented("waitid", Waitid), + 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil), + 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil), + 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil), + 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 253: syscalls.Undocumented("inotify_init", InotifyInit), + 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch), + 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch), + 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 257: syscalls.Undocumented("openat", Openat), + 258: syscalls.Undocumented("mkdirat", Mkdirat), + 259: syscalls.Undocumented("mknodat", Mknodat), + 260: syscalls.Undocumented("fchownat", Fchownat), + 261: syscalls.Undocumented("futimesat", Futimesat), + 262: syscalls.Undocumented("fstatat", Fstatat), + 263: syscalls.Undocumented("unlinkat", Unlinkat), + 264: syscalls.Undocumented("renameat", Renameat), + 265: syscalls.Undocumented("linkat", Linkat), + 266: syscalls.Undocumented("symlinkat", Symlinkat), + 267: syscalls.Undocumented("readlinkat", Readlinkat), + 268: syscalls.Undocumented("fchmodat", Fchmodat), + 269: syscalls.Undocumented("faccessat", Faccessat), + 270: syscalls.Undocumented("pselect", Pselect), + 271: syscalls.Undocumented("ppoll", Ppoll), + 272: syscalls.Undocumented("unshare", Unshare), + 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil), + 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil), + 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 277: syscalls.Undocumented("sync_file_range", SyncFileRange), + 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Undocumented("utimensat", Utimensat), + 281: syscalls.Undocumented("epoll_pwait", EpollPwait), + 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 283: syscalls.Undocumented("timerfd_create", TimerfdCreate), + 284: syscalls.Undocumented("eventfd", Eventfd), + 285: syscalls.Undocumented("fallocate", Fallocate), + 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime), + 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime), + 288: syscalls.Undocumented("accept4", Accept4), + 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 290: syscalls.Undocumented("eventfd2", Eventfd2), + 291: syscalls.Undocumented("epoll_create1", EpollCreate1), + 292: syscalls.Undocumented("dup3", Dup3), + 293: syscalls.Undocumented("pipe2", Pipe2), + 294: syscalls.Undocumented("inotify_init1", InotifyInit1), + 295: syscalls.Undocumented("preadv", Preadv), + 296: syscalls.Undocumented("pwritev", Pwritev), + 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil), + 299: syscalls.Undocumented("recvmmsg", RecvMMsg), + 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Undocumented("prlimit64", Prlimit64), + 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), + 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), + 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 306: syscalls.Undocumented("syncfs", Syncfs), + 307: syscalls.Undocumented("sendmmsg", SendMMsg), + 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Undocumented("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Undocumented("seccomp", Seccomp), + 318: syscalls.Undocumented("getrandom", GetRandom), + 319: syscalls.Undocumented("memfd_create", MemfdCreate), + 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), + 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) + 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) + 325: syscalls.Undocumented("mlock2", Mlock2), + // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: @Syscall(CopyFileRange), - 327: Preadv2, - 328: Pwritev2, + 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), + 327: syscalls.Undocumented("preadv2", Preadv2), + 328: syscalls.Undocumented("pwritev2", Pwritev2), }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 5d10b3824..48c114232 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -25,37 +25,97 @@ package syscalls import ( + "fmt" + "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// Supported returns a syscall that is fully supported. +func Supported(name string, fn kernel.SyscallFn) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportFull, + Note: "Full Support", + } +} + +// Undocumented returns a syscall that is undocumented. +func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportUndocumented, + } +} + +// PartiallySupported returns a syscall that has a partial implementation. +func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportPartial, + Note: note, + URLs: urls, + } +} + // Error returns a syscall handler that will always give the passed error. -func Error(err error) kernel.SyscallFn { - return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, err +func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, err + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + URLs: urls, } } // ErrorWithEvent gives a syscall function that sends an unimplemented // syscall event via the event channel and returns the passed error. -func ErrorWithEvent(err error) kernel.SyscallFn { - return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, err +func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, err + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + URLs: urls, } } // CapError gives a syscall function that checks for capability c. If the task // has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged // tasks, it will seem like there is an implementation. -func CapError(c linux.Capability) kernel.SyscallFn { - return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - if !t.HasCapability(c) { - return 0, nil, syserror.EPERM - } - t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS +func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if !t.HasCapability(c) { + return 0, nil, syserror.EPERM + } + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS), + URLs: urls, } } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 173b7671e..df6af0ced 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -18,6 +18,7 @@ go_library( "events.go", "exec.go", "gofer.go", + "help.go", "kill.go", "list.go", "path.go", @@ -29,6 +30,7 @@ go_library( "spec.go", "start.go", "state.go", + "syscalls.go", "wait.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/cmd", @@ -39,6 +41,7 @@ go_library( "//pkg/log", "//pkg/p9", "//pkg/sentry/control", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/unet", "//pkg/urpc", diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go new file mode 100644 index 000000000..ff4f901cb --- /dev/null +++ b/runsc/cmd/help.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + + "flag" + "github.com/google/subcommands" +) + +// NewHelp returns a help command for the given commander. +func NewHelp(cdr *subcommands.Commander) *Help { + return &Help{ + cdr: cdr, + } +} + +// Help implements subcommands.Command for the "help" command. The 'help' +// command prints help for commands registered to a Commander but also allows for +// registering additional help commands that print other documentation. +type Help struct { + cdr *subcommands.Commander + commands []subcommands.Command + help bool +} + +// Name implements subcommands.Command.Name. +func (*Help) Name() string { + return "help" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Help) Synopsis() string { + return "Print help documentation." +} + +// Usage implements subcommands.Command.Usage. +func (*Help) Usage() string { + return `help []: + With an argument, prints detailed information on the use of + the specified topic or subcommand. With no argument, print a list of + all commands and a brief description of each. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (h *Help) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + switch f.NArg() { + case 0: + fmt.Fprintf(h.cdr.Output, "Usage: %s \n\n", h.cdr.Name()) + fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open +Container Initiative (OCI) format. Applications run by runsc are run in an +isolated gVisor sandbox that emulates a Linux environment. + +gVisor is a user-space kernel, written in Go, that implements a substantial +portion of the Linux system call interface. It provides an additional layer +of isolation between running applications and the host operating system. + +Functionality is provided by subcommands. For additonal help on individual +subcommands use "%s %s ". + +`, h.cdr.Name(), h.Name()) + h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { + h.cdr.ExplainGroup(h.cdr.Output, g) + }) + + fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s \" to see help on the topic):\n", h.cdr.Name(), h.Name()) + for _, cmd := range h.commands { + fmt.Fprintf(h.cdr.Output, "\t%-15s %s\n", cmd.Name(), cmd.Synopsis()) + } + fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name()) + return subcommands.ExitSuccess + default: + // Look for commands registered to the commander and print help explanation if found. + found := false + h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) { + if f.Arg(0) == cmd.Name() { + h.cdr.ExplainCommand(h.cdr.Output, cmd) + found = true + } + }) + if found { + return subcommands.ExitSuccess + } + + // Next check commands registered to the help command. + for _, cmd := range h.commands { + if f.Arg(0) == cmd.Name() { + fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError) + fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) } + cmd.SetFlags(fs) + if fs.Parse(f.Args()[1:]) != nil { + return subcommands.ExitUsageError + } + return cmd.Execute(ctx, f, args...) + } + } + + fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0)) + } + + f.Usage() + return subcommands.ExitUsageError +} + +// Register registers a new help command. +func (h *Help) Register(cmd subcommands.Command) { + h.commands = append(h.commands, cmd) +} diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go new file mode 100644 index 000000000..9c8a66490 --- /dev/null +++ b/runsc/cmd/syscalls.go @@ -0,0 +1,347 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/csv" + "encoding/json" + "fmt" + "io" + "os" + "sort" + "strconv" + "text/tabwriter" + + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// Syscalls implements subcommands.Command for the "syscalls" command. +type Syscalls struct { + output string + os string + arch string +} + +// CompatibilityInfo is a map of system and architecture to compatibility doc. +// Maps operating system to architecture to ArchInfo. +type CompatibilityInfo map[string]map[string]ArchInfo + +// ArchInfo is compatbility doc for an architecture. +type ArchInfo struct { + // Syscalls maps syscall number for the architecture to the doc. + Syscalls map[uintptr]SyscallDoc `json:"syscalls"` +} + +// SyscallDoc represents a single item of syscall documentation. +type SyscallDoc struct { + Name string `json:"name"` + num uintptr + + Support string `json:"support"` + Note string `json:"note,omitempty"` + URLs []string `json:"urls,omitempty"` +} + +type outputFunc func(io.Writer, CompatibilityInfo) error + +var ( + // The string name to use for printing compatibility for all OSes. + osAll = "all" + + // The string name to use for printing compatibility for all architectures. + archAll = "all" + + // A map of OS name to map of architecture name to syscall table. + syscallTableMap = make(map[string]map[string]*kernel.SyscallTable) + + // A map of output type names to output functions. + outputMap = map[string]outputFunc{ + "table": outputTable, + "json": outputJSON, + "csv": outputCSV, + } +) + +// Name implements subcommands.Command.Name. +func (*Syscalls) Name() string { + return "syscalls" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Syscalls) Synopsis() string { + return "Print compatibility information for syscalls." +} + +// Usage implements subcommands.Command.Usage. +func (*Syscalls) Usage() string { + return `syscalls [options] - Print compatibility information for syscalls. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (s *Syscalls) SetFlags(f *flag.FlagSet) { + f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).") + f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)") + f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).") +} + +// Execute implements subcommands.Command.Execute. +func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + out, ok := outputMap[s.output] + if !ok { + Fatalf("Unsupported output format %q", s.output) + } + + // Build map of all supported architectures. + tables := kernel.SyscallTables() + for _, t := range tables { + osMap, ok := syscallTableMap[t.OS.String()] + if !ok { + osMap = make(map[string]*kernel.SyscallTable) + syscallTableMap[t.OS.String()] = osMap + } + osMap[t.Arch.String()] = t + } + + // Build a map of the architectures we want to output. + info, err := getCompatibilityInfo(s.os, s.arch) + if err != nil { + Fatalf("%v", err) + } + + if err := out(os.Stdout, info); err != nil { + Fatalf("Error writing output: %v", err) + } + + return subcommands.ExitSuccess +} + +// getCompatibilityInfo returns compatibility info for the given OS name and +// architecture name. Supports the special name 'all' for OS and architecture that +// specifies that all supported OSes or architectures should be included. +func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) { + info := CompatibilityInfo(make(map[string]map[string]ArchInfo)) + if osName == osAll { + // Special processing for the 'all' OS name. + for osName, _ := range syscallTableMap { + info[osName] = make(map[string]ArchInfo) + // osName is a specific OS name. + if err := addToCompatibilityInfo(info, osName, archName); err != nil { + return info, err + } + } + } else { + // osName is a specific OS name. + info[osName] = make(map[string]ArchInfo) + if err := addToCompatibilityInfo(info, osName, archName); err != nil { + return info, err + } + } + + return info, nil +} + +// addToCompatibilityInfo adds ArchInfo for the given specific OS name and +// architecture name. Supports the special architecture name 'all' to specify +// that all supported architectures for the OS should be included. +func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error { + if archName == archAll { + // Special processing for the 'all' architecture name. + for archName, _ := range syscallTableMap[osName] { + archInfo, err := getArchInfo(osName, archName) + if err != nil { + return err + } + info[osName][archName] = archInfo + } + } else { + // archName is a specific architecture name. + archInfo, err := getArchInfo(osName, archName) + if err != nil { + return err + } + info[osName][archName] = archInfo + } + + return nil +} + +// getArchInfo returns compatibility info for a specific OS and architecture. +func getArchInfo(osName string, archName string) (ArchInfo, error) { + info := ArchInfo{} + info.Syscalls = make(map[uintptr]SyscallDoc) + + t, ok := syscallTableMap[osName][archName] + if !ok { + return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName) + } + + for num, sc := range t.Table { + info.Syscalls[num] = SyscallDoc{ + Name: sc.Name, + num: num, + Support: sc.SupportLevel.String(), + Note: sc.Note, + URLs: sc.URLs, + } + } + + return info, nil +} + +// outputTable outputs the syscall info in tabular format. +func outputTable(w io.Writer, info CompatibilityInfo) error { + tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + + // Linux + for osName, osInfo := range info { + for archName, archInfo := range osInfo { + // Print the OS/arch + fmt.Fprintf(w, "%s/%s:\n\n", osName, archName) + + // Sort the syscalls for output in the table. + sortedCalls := []SyscallDoc{} + for _, sc := range archInfo.Syscalls { + sortedCalls = append(sortedCalls, sc) + } + sort.Slice(sortedCalls, func(i, j int) bool { + return sortedCalls[i].num < sortedCalls[j].num + }) + + // Write the header + _, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", + "NUM", + "NAME", + "SUPPORT", + "NOTE", + ) + if err != nil { + return err + } + + // Write each syscall entry + for _, sc := range sortedCalls { + _, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", + strconv.FormatInt(int64(sc.num), 10), + sc.Name, + sc.Support, + sc.Note, + ) + if err != nil { + return err + } + // Add issue urls to note. + for _, url := range sc.URLs { + _, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n", + "", + "", + "", + url, + ) + if err != nil { + return err + } + } + } + + err = tw.Flush() + if err != nil { + return err + } + } + } + + return nil +} + +// outputJSON outputs the syscall info in JSON format. +func outputJSON(w io.Writer, info CompatibilityInfo) error { + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(info) +} + +// numberedRow is aCSV row annotated by syscall number (used for sorting) +type numberedRow struct { + num uintptr + row []string +} + +// outputCSV outputs the syscall info in tabular format. +func outputCSV(w io.Writer, info CompatibilityInfo) error { + csvWriter := csv.NewWriter(w) + + // Linux + for osName, osInfo := range info { + for archName, archInfo := range osInfo { + // Sort the syscalls for output in the table. + sortedCalls := []numberedRow{} + for _, sc := range archInfo.Syscalls { + // Add issue urls to note. + note := sc.Note + for _, url := range sc.URLs { + note = fmt.Sprintf("%s\nSee: %s", note, url) + } + + sortedCalls = append(sortedCalls, numberedRow{ + num: sc.num, + row: []string{ + osName, + archName, + strconv.FormatInt(int64(sc.num), 10), + sc.Name, + sc.Support, + note, + }, + }) + } + sort.Slice(sortedCalls, func(i, j int) bool { + return sortedCalls[i].num < sortedCalls[j].num + }) + + // Write the header + err := csvWriter.Write([]string{ + "OS", + "Arch", + "Num", + "Name", + "Support", + "Note", + }) + if err != nil { + return err + } + + // Write each syscall entry + for _, sc := range sortedCalls { + err = csvWriter.Write(sc.row) + if err != nil { + return err + } + } + + csvWriter.Flush() + err = csvWriter.Error() + if err != nil { + return err + } + } + } + + return nil +} diff --git a/runsc/main.go b/runsc/main.go index 6f8e6e378..a214f6ba0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -76,7 +76,9 @@ var ( func main() { // Help and flags commands are generated automatically. - subcommands.Register(subcommands.HelpCommand(), "") + help := cmd.NewHelp(subcommands.DefaultCommander) + help.Register(new(cmd.Syscalls)) + subcommands.Register(help, "") subcommands.Register(subcommands.FlagsCommand(), "") // Register user-facing runsc commands. -- cgit v1.2.3 From df110ad4fe571721a7eb4a5a1f9ce92584ef7809 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 11 Jun 2019 19:23:27 -0700 Subject: Eat sendfile partial error For sendfile(2), we propagate a TCP error through the system call layer. This should be eaten if there is a partial result. This change also adds a test to ensure that there is no panic in this case, for both TCP sockets and unix domain sockets. PiperOrigin-RevId: 252746192 --- pkg/sentry/syscalls/linux/error.go | 4 + test/syscalls/linux/sendfile_socket.cc | 170 ++++++++++++++++++++++++--------- 2 files changed, 131 insertions(+), 43 deletions(-) (limited to 'pkg/sentry/syscalls') diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 1ba3695fb..72146ea63 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -92,6 +92,10 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. return nil + case syserror.ECONNRESET: + // For TCP sendfile connections, we may have a reset. But we + // should just return n as the result. + return nil case syserror.ErrWouldBlock: // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc index 66adda515..1c56540bc 100644 --- a/test/syscalls/linux/sendfile_socket.cc +++ b/test/syscalls/linux/sendfile_socket.cc @@ -33,9 +33,69 @@ namespace gvisor { namespace testing { namespace { +class SendFileTest : public ::testing::TestWithParam { + protected: + PosixErrorOr> Sockets() { + // Bind a server socket. + int family = GetParam(); + struct sockaddr server_addr = {}; + switch (family) { + case AF_INET: { + struct sockaddr_in *server_addr_in = + reinterpret_cast(&server_addr); + server_addr_in->sin_family = family; + server_addr_in->sin_addr.s_addr = INADDR_ANY; + break; + } + case AF_UNIX: { + struct sockaddr_un *server_addr_un = + reinterpret_cast(&server_addr); + server_addr_un->sun_family = family; + server_addr_un->sun_path[0] = '\0'; + break; + } + default: + return PosixError(EINVAL); + } + int server = socket(family, SOCK_STREAM, 0); + if (bind(server, &server_addr, sizeof(server_addr)) < 0) { + return PosixError(errno); + } + if (listen(server, 1) < 0) { + close(server); + return PosixError(errno); + } + + // Fetch the address; both are anonymous. + socklen_t length = sizeof(server_addr); + if (getsockname(server, &server_addr, &length) < 0) { + close(server); + return PosixError(errno); + } + + // Connect the client. + int client = socket(family, SOCK_STREAM, 0); + if (connect(client, &server_addr, length) < 0) { + close(server); + close(client); + return PosixError(errno); + } + + // Accept on the server. + int server_client = accept(server, nullptr, 0); + if (server_client < 0) { + close(server); + close(client); + return PosixError(errno); + } + close(server); + return std::make_tuple(client, server_client); + } +}; + // Sends large file to exercise the path that read and writes data multiple // times, esp. when more data is read than can be written. -TEST(SendFileTest, SendMultiple) { +TEST_P(SendFileTest, SendMultiple) { std::vector data(5 * 1024 * 1024); RandomizeBuffer(data.data(), data.size()); @@ -45,34 +105,20 @@ TEST(SendFileTest, SendMultiple) { TempPath::kDefaultFileMode)); const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - // Use a socket for target file to make the write window small. - const FileDescriptor server(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(server.get(), SyscallSucceeds()); - - struct sockaddr_in server_addr = {}; - server_addr.sin_family = AF_INET; - server_addr.sin_addr.s_addr = INADDR_ANY; - ASSERT_THAT( - bind(server.get(), reinterpret_cast(&server_addr), - sizeof(server_addr)), - SyscallSucceeds()); - ASSERT_THAT(listen(server.get(), 1), SyscallSucceeds()); + // Create sockets. + std::tuple fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets()); + const FileDescriptor server(std::get<0>(fds)); + FileDescriptor client(std::get<1>(fds)); // non-const, reset is used. // Thread that reads data from socket and dumps to a file. - ScopedThread th([&server, &out_file, &server_addr] { - socklen_t addrlen = sizeof(server_addr); - const FileDescriptor fd(RetryEINTR(accept)( - server.get(), reinterpret_cast(&server_addr), - &addrlen)); - ASSERT_THAT(fd.get(), SyscallSucceeds()); - + ScopedThread th([&] { FileDescriptor outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); // Read until socket is closed. char buf[10240]; for (int cnt = 0;; cnt++) { - int r = RetryEINTR(read)(fd.get(), buf, sizeof(buf)); + int r = RetryEINTR(read)(server.get(), buf, sizeof(buf)); // We cannot afford to save on every read() call. if (cnt % 1000 == 0) { ASSERT_THAT(r, SyscallSucceeds()); @@ -99,25 +145,6 @@ TEST(SendFileTest, SendMultiple) { const FileDescriptor inf = ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); - FileDescriptor outf(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(outf.get(), SyscallSucceeds()); - - // Get the port bound by the listening socket. - socklen_t addrlen = sizeof(server_addr); - ASSERT_THAT(getsockname(server.get(), - reinterpret_cast(&server_addr), &addrlen), - SyscallSucceeds()); - - struct sockaddr_in addr = {}; - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = inet_addr("127.0.0.1"); - addr.sin_port = server_addr.sin_port; - std::cout << "Connecting on port=" << server_addr.sin_port; - ASSERT_THAT( - RetryEINTR(connect)( - outf.get(), reinterpret_cast(&addr), sizeof(addr)), - SyscallSucceeds()); - int cnt = 0; for (size_t sent = 0; sent < data.size(); cnt++) { const size_t remain = data.size() - sent; @@ -125,7 +152,7 @@ TEST(SendFileTest, SendMultiple) { << ", remain=" << remain; // Send data and verify that sendfile returns the correct value. - int res = sendfile(outf.get(), inf.get(), nullptr, remain); + int res = sendfile(client.get(), inf.get(), nullptr, remain); // We cannot afford to save on every sendfile() call. if (cnt % 120 == 0) { MaybeSave(); @@ -142,17 +169,74 @@ TEST(SendFileTest, SendMultiple) { } // Close socket to stop thread. - outf.reset(); + client.reset(); th.Join(); // Verify that the output file has the correct data. - outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); std::vector actual(data.size(), '\0'); ASSERT_THAT(RetryEINTR(read)(outf.get(), actual.data(), actual.size()), SyscallSucceedsWithValue(actual.size())); ASSERT_EQ(memcmp(data.data(), actual.data(), data.size()), 0); } +TEST_P(SendFileTest, Shutdown) { + // Create a socket. + std::tuple fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets()); + const FileDescriptor client(std::get<0>(fds)); + FileDescriptor server(std::get<1>(fds)); // non-const, released below. + + // If this is a TCP socket, then turn off linger. + if (GetParam() == AF_INET) { + struct linger sl; + sl.l_onoff = 1; + sl.l_linger = 0; + ASSERT_THAT( + setsockopt(server.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)), + SyscallSucceeds()); + } + + // Create a 1m file with random data. + std::vector data(1024 * 1024); + RandomizeBuffer(data.data(), data.size()); + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::string_view(data.data(), data.size()), + TempPath::kDefaultFileMode)); + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Read some data, then shutdown the socket. We don't actually care about + // checking the contents (other tests do that), so we just re-use the same + // buffer as above. + ScopedThread t([&]() { + int done = 0; + while (done < data.size()) { + int n = read(server.get(), data.data(), data.size()); + ASSERT_THAT(n, SyscallSucceeds()); + done += n; + } + // Close the server side socket. + ASSERT_THAT(close(server.release()), SyscallSucceeds()); + }); + + // Continuously stream from the file to the socket. Note we do not assert + // that a specific amount of data has been written at any time, just that some + // data is written. Eventually, we should get a connection reset error. + while (1) { + off_t offset = 0; // Always read from the start. + int n = sendfile(client.get(), inf.get(), &offset, data.size()); + EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET), + SyscallFailsWithErrno(EPIPE), SyscallSucceeds())); + if (n <= 0) { + break; + } + } +} + +INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest, + ::testing::Values(AF_UNIX, AF_INET)); + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3