diff options
Diffstat (limited to 'pkg/sentry/syscalls')
84 files changed, 20630 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD new file mode 100644 index 000000000..b8d1bd415 --- /dev/null +++ b/pkg/sentry/syscalls/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "syscalls", + srcs = [ + "epoll.go", + "syscalls.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/arch", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/time", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go new file mode 100644 index 000000000..d9fb808c0 --- /dev/null +++ b/pkg/sentry/syscalls/epoll.go @@ -0,0 +1,173 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package syscalls + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// CreateEpoll implements the epoll_create(2) linux syscall. +func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { + file := epoll.NewEventPoll(t) + defer file.DecRef() + + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: closeOnExec, + }) + if err != nil { + return 0, err + } + + return fd, nil +} + +// AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD. +func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { + // Get epoll from the file descriptor. + epollfile := t.GetFile(epfd) + if epollfile == nil { + return syserror.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.GetFile(fd) + if file == nil { + return syserror.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syserror.EBADF + } + + // Try to add the entry. + return e.AddEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData) +} + +// UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD. +func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { + // Get epoll from the file descriptor. + epollfile := t.GetFile(epfd) + if epollfile == nil { + return syserror.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.GetFile(fd) + if file == nil { + return syserror.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syserror.EBADF + } + + // Try to update the entry. + return e.UpdateEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData) +} + +// RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL. +func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { + // Get epoll from the file descriptor. + epollfile := t.GetFile(epfd) + if epollfile == nil { + return syserror.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.GetFile(fd) + if file == nil { + return syserror.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syserror.EBADF + } + + // Try to remove the entry. + return e.RemoveEntry(epoll.FileIdentifier{file, fd}) +} + +// WaitEpoll implements the epoll_wait(2) linux syscall. +func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) { + // Get epoll from the file descriptor. + epollfile := t.GetFile(fd) + if epollfile == nil { + return nil, syserror.EBADF + } + defer epollfile.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return nil, syserror.EBADF + } + + // Try to read events and return right away if we got them or if the + // caller requested a non-blocking "wait". + r := e.ReadEvents(max) + if len(r) != 0 || timeout == 0 { + return r, nil + } + + // We'll have to wait. Set up the timer if a timeout was specified and + // and register with the epoll object for readability events. + var haveDeadline bool + var deadline ktime.Time + if timeout > 0 { + timeoutDur := time.Duration(timeout) * time.Millisecond + deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) + haveDeadline = true + } + + w, ch := waiter.NewChannelEntry(nil) + e.EventRegister(&w, waiter.EventIn) + defer e.EventUnregister(&w) + + // Try to read the events again until we succeed, timeout or get + // interrupted. + for { + r = e.ReadEvents(max) + if len(r) != 0 { + return r, nil + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return nil, nil + } + + return nil, err + } + } +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD new file mode 100644 index 000000000..217fcfef2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/BUILD @@ -0,0 +1,103 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "linux", + srcs = [ + "error.go", + "flags.go", + "linux64.go", + "sigset.go", + "sys_aio.go", + "sys_capability.go", + "sys_clone_amd64.go", + "sys_clone_arm64.go", + "sys_epoll.go", + "sys_eventfd.go", + "sys_file.go", + "sys_futex.go", + "sys_getdents.go", + "sys_identity.go", + "sys_inotify.go", + "sys_lseek.go", + "sys_mempolicy.go", + "sys_mmap.go", + "sys_mount.go", + "sys_pipe.go", + "sys_poll.go", + "sys_prctl.go", + "sys_random.go", + "sys_read.go", + "sys_rlimit.go", + "sys_rseq.go", + "sys_rusage.go", + "sys_sched.go", + "sys_seccomp.go", + "sys_sem.go", + "sys_shm.go", + "sys_signal.go", + "sys_socket.go", + "sys_splice.go", + "sys_stat.go", + "sys_stat_amd64.go", + "sys_stat_arm64.go", + "sys_sync.go", + "sys_sysinfo.go", + "sys_syslog.go", + "sys_thread.go", + "sys_time.go", + "sys_timer.go", + "sys_timerfd.go", + "sys_tls_amd64.go", + "sys_tls_arm64.go", + "sys_utsname.go", + "sys_write.go", + "sys_xattr.go", + "timespec.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/bpf", + "//pkg/context", + "//pkg/log", + "//pkg/metric", + "//pkg/rand", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fsbridge", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/eventfd", + "//pkg/sentry/kernel/fasync", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/shm", + "//pkg/sentry/kernel/signalfd", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/syscalls", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go new file mode 100644 index 000000000..64de56ac5 --- /dev/null +++ b/pkg/sentry/syscalls/linux/error.go @@ -0,0 +1,157 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +var ( + partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.") + partialResultOnce sync.Once +) + +// HandleIOErrorVFS2 handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + fs := f.Mount().Filesystem().VirtualFilesystem() + root := vfs.RootFromContext(t) + name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry()) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + name, _ := f.Dirent.FullName(nil /* ignore chroot */) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// Returns false if error is unknown. +func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) { + switch err { + case nil: + // Typical successful syscall. + return true, nil + case io.EOF: + // EOF is always consumed. If this is a partial read/write + // (result != 0), the application will see that, otherwise + // they will see 0. + return true, nil + case syserror.ErrExceedsFileSizeLimit: + // Ignore partialResult because this error only applies to + // normal files, and for those files we cannot accumulate + // write results. + // + // Do not consume the error and return it as EFBIG. + // Simultaneously send a SIGXFSZ per setrlimit(2). + t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) + return true, syserror.EFBIG + case syserror.ErrInterrupted: + // The syscall was interrupted. Return nil if it completed + // partially, otherwise return the error code that the syscall + // needs (to indicate to the kernel what it should do). + if partialResult { + return true, nil + } + return true, intr + } + + if !partialResult { + // Typical syscall error. + return true, err + } + + switch err { + case syserror.EINTR: + // Syscall interrupted, but completed a partial + // read/write. Like ErrWouldBlock, since we have a + // partial read/write, we consume the error and return + // the partial result. + return true, nil + case syserror.EFAULT: + // EFAULT is only shown the user if nothing was + // read/written. If we read something (this case), they see + // a partial read/write. They will then presumably try again + // with an incremented buffer, which will EFAULT with + // result == 0. + return true, nil + case syserror.EPIPE: + // Writes to a pipe or socket will return EPIPE if the other + // side is gone. The partial write is returned. EPIPE will be + // returned on the next call. + // + // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should + // also be sent to the application. + return true, nil + case syserror.ENOSPC: + // Similar to EPIPE. Return what we wrote this time, and let + // ENOSPC be returned on the next call. + return true, nil + case syserror.ECONNRESET: + // For TCP sendfile connections, we may have a reset. But we + // should just return n as the result. + return true, nil + case syserror.ErrWouldBlock: + // Syscall would block, but completed a partial read/write. + // This case should only be returned by IssueIO for nonblocking + // files. Since we have a partial read/write, we consume + // ErrWouldBlock, returning the partial result. + return true, nil + } + + switch err.(type) { + case kernel.SyscallRestartErrno: + // Identical to the EINTR case. + return true, nil + } + + // Error is unknown and cannot be properly handled. + return false, nil +} diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go new file mode 100644 index 000000000..07961dad9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/flags.go @@ -0,0 +1,55 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// flagsToPermissions returns a Permissions object from Linux flags. +// This includes truncate permission if O_TRUNC is set in the mask. +func flagsToPermissions(mask uint) (p fs.PermMask) { + if mask&linux.O_TRUNC != 0 { + p.Write = true + } + switch mask & linux.O_ACCMODE { + case linux.O_WRONLY: + p.Write = true + case linux.O_RDWR: + p.Write = true + p.Read = true + case linux.O_RDONLY: + p.Read = true + } + return +} + +// linuxToFlags converts Linux file flags to a FileFlags object. +func linuxToFlags(mask uint) fs.FileFlags { + return fs.FileFlags{ + Direct: mask&linux.O_DIRECT != 0, + DSync: mask&(linux.O_DSYNC|linux.O_SYNC) != 0, + Sync: mask&linux.O_SYNC != 0, + NonBlocking: mask&linux.O_NONBLOCK != 0, + Read: (mask & linux.O_ACCMODE) != linux.O_WRONLY, + Write: (mask & linux.O_ACCMODE) != linux.O_RDONLY, + Append: mask&linux.O_APPEND != 0, + Directory: mask&linux.O_DIRECTORY != 0, + Async: mask&linux.O_ASYNC != 0, + LargeFile: mask&linux.O_LARGEFILE != 0, + Truncate: mask&linux.O_TRUNC != 0, + } +} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go new file mode 100644 index 000000000..ea4f9b1a7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -0,0 +1,736 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package linux provides syscall tables for amd64 Linux. +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + // LinuxSysname is the OS name advertised by gVisor. + LinuxSysname = "Linux" + + // LinuxRelease is the Linux release version number advertised by gVisor. + LinuxRelease = "4.4.0" + + // LinuxVersion is the version info advertised by gVisor. + LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016" +) + +// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var AMD64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Version: kernel.Version{ + // Version 4.4 is chosen as a stable, longterm version of Linux, which + // guides the interface provided by this syscall table. The build + // version is that for a clean build with default kernel config, at 5 + // minutes after v4.4 was tagged. + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_X86_64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.Supported("read", Read), + 1: syscalls.Supported("write", Write), + 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), + 3: syscalls.Supported("close", Close), + 4: syscalls.Supported("stat", Stat), + 5: syscalls.Supported("fstat", Fstat), + 6: syscalls.Supported("lstat", Lstat), + 7: syscalls.Supported("poll", Poll), + 8: syscalls.Supported("lseek", Lseek), + 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 10: syscalls.Supported("mprotect", Mprotect), + 11: syscalls.Supported("munmap", Munmap), + 12: syscalls.Supported("brk", Brk), + 13: syscalls.Supported("rt_sigaction", RtSigaction), + 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Supported("rt_sigreturn", RtSigreturn), + 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 17: syscalls.Supported("pread64", Pread64), + 18: syscalls.Supported("pwrite64", Pwrite64), + 19: syscalls.Supported("readv", Readv), + 20: syscalls.Supported("writev", Writev), + 21: syscalls.Supported("access", Access), + 22: syscalls.Supported("pipe", Pipe), + 23: syscalls.Supported("select", Select), + 24: syscalls.Supported("sched_yield", SchedYield), + 25: syscalls.Supported("mremap", Mremap), + 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 32: syscalls.Supported("dup", Dup), + 33: syscalls.Supported("dup2", Dup2), + 34: syscalls.Supported("pause", Pause), + 35: syscalls.Supported("nanosleep", Nanosleep), + 36: syscalls.Supported("getitimer", Getitimer), + 37: syscalls.Supported("alarm", Alarm), + 38: syscalls.Supported("setitimer", Setitimer), + 39: syscalls.Supported("getpid", Getpid), + 40: syscalls.Supported("sendfile", Sendfile), + 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 42: syscalls.Supported("connect", Connect), + 43: syscalls.Supported("accept", Accept), + 44: syscalls.Supported("sendto", SendTo), + 45: syscalls.Supported("recvfrom", RecvFrom), + 46: syscalls.Supported("sendmsg", SendMsg), + 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 50: syscalls.Supported("listen", Listen), + 51: syscalls.Supported("getsockname", GetSockName), + 52: syscalls.Supported("getpeername", GetPeerName), + 53: syscalls.Supported("socketpair", SocketPair), + 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 57: syscalls.Supported("fork", Fork), + 58: syscalls.Supported("vfork", Vfork), + 59: syscalls.Supported("execve", Execve), + 60: syscalls.Supported("exit", Exit), + 61: syscalls.Supported("wait4", Wait4), + 62: syscalls.Supported("kill", Kill), + 63: syscalls.Supported("uname", Uname), + 64: syscalls.Supported("semget", Semget), + 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 67: syscalls.Supported("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 76: syscalls.Supported("truncate", Truncate), + 77: syscalls.Supported("ftruncate", Ftruncate), + 78: syscalls.Supported("getdents", Getdents), + 79: syscalls.Supported("getcwd", Getcwd), + 80: syscalls.Supported("chdir", Chdir), + 81: syscalls.Supported("fchdir", Fchdir), + 82: syscalls.Supported("rename", Rename), + 83: syscalls.Supported("mkdir", Mkdir), + 84: syscalls.Supported("rmdir", Rmdir), + 85: syscalls.Supported("creat", Creat), + 86: syscalls.Supported("link", Link), + 87: syscalls.Supported("unlink", Unlink), + 88: syscalls.Supported("symlink", Symlink), + 89: syscalls.Supported("readlink", Readlink), + 90: syscalls.Supported("chmod", Chmod), + 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 92: syscalls.Supported("chown", Chown), + 93: syscalls.Supported("fchown", Fchown), + 94: syscalls.Supported("lchown", Lchown), + 95: syscalls.Supported("umask", Umask), + 96: syscalls.Supported("gettimeofday", Gettimeofday), + 97: syscalls.Supported("getrlimit", Getrlimit), + 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 100: syscalls.Supported("times", Times), + 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 102: syscalls.Supported("getuid", Getuid), + 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 104: syscalls.Supported("getgid", Getgid), + 105: syscalls.Supported("setuid", Setuid), + 106: syscalls.Supported("setgid", Setgid), + 107: syscalls.Supported("geteuid", Geteuid), + 108: syscalls.Supported("getegid", Getegid), + 109: syscalls.Supported("setpgid", Setpgid), + 110: syscalls.Supported("getppid", Getppid), + 111: syscalls.Supported("getpgrp", Getpgrp), + 112: syscalls.Supported("setsid", Setsid), + 113: syscalls.Supported("setreuid", Setreuid), + 114: syscalls.Supported("setregid", Setregid), + 115: syscalls.Supported("getgroups", Getgroups), + 116: syscalls.Supported("setgroups", Setgroups), + 117: syscalls.Supported("setresuid", Setresuid), + 118: syscalls.Supported("getresuid", Getresuid), + 119: syscalls.Supported("setresgid", Setresgid), + 120: syscalls.Supported("getresgid", Getresgid), + 121: syscalls.Supported("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Supported("getsid", Getsid), + 125: syscalls.Supported("capget", Capget), + 126: syscalls.Supported("capset", Capset), + 127: syscalls.Supported("rt_sigpending", RtSigpending), + 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Supported("sigaltstack", Sigaltstack), + 132: syscalls.Supported("utime", Utime), + 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), + 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 161: syscalls.Supported("chroot", Chroot), + 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 170: syscalls.Supported("sethostname", Sethostname), + 171: syscalls.Supported("setdomainname", Setdomainname), + 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), + 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), + 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), + 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 186: syscalls.Supported("gettid", Gettid), + 187: syscalls.Supported("readahead", Readahead), + 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), + 200: syscalls.Supported("tkill", Tkill), + 201: syscalls.Supported("time", Time), + 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 213: syscalls.Supported("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 217: syscalls.Supported("getdents64", Getdents64), + 218: syscalls.Supported("set_tid_address", SetTidAddress), + 219: syscalls.Supported("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 222: syscalls.Supported("timer_create", TimerCreate), + 223: syscalls.Supported("timer_settime", TimerSettime), + 224: syscalls.Supported("timer_gettime", TimerGettime), + 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Supported("timer_delete", TimerDelete), + 227: syscalls.Supported("clock_settime", ClockSettime), + 228: syscalls.Supported("clock_gettime", ClockGettime), + 229: syscalls.Supported("clock_getres", ClockGetres), + 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 231: syscalls.Supported("exit_group", ExitGroup), + 232: syscalls.Supported("epoll_wait", EpollWait), + 233: syscalls.Supported("epoll_ctl", EpollCtl), + 234: syscalls.Supported("tgkill", Tgkill), + 235: syscalls.Supported("utimes", Utimes), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), + 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 247: syscalls.Supported("waitid", Waitid), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 257: syscalls.Supported("openat", Openat), + 258: syscalls.Supported("mkdirat", Mkdirat), + 259: syscalls.Supported("mknodat", Mknodat), + 260: syscalls.Supported("fchownat", Fchownat), + 261: syscalls.Supported("futimesat", Futimesat), + 262: syscalls.Supported("fstatat", Fstatat), + 263: syscalls.Supported("unlinkat", Unlinkat), + 264: syscalls.Supported("renameat", Renameat), + 265: syscalls.Supported("linkat", Linkat), + 266: syscalls.Supported("symlinkat", Symlinkat), + 267: syscalls.Supported("readlinkat", Readlinkat), + 268: syscalls.Supported("fchmodat", Fchmodat), + 269: syscalls.Supported("faccessat", Faccessat), + 270: syscalls.Supported("pselect", Pselect), + 271: syscalls.Supported("ppoll", Ppoll), + 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 275: syscalls.Supported("splice", Splice), + 276: syscalls.Supported("tee", Tee), + 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Supported("utimensat", Utimensat), + 281: syscalls.Supported("epoll_pwait", EpollPwait), + 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 283: syscalls.Supported("timerfd_create", TimerfdCreate), + 284: syscalls.Supported("eventfd", Eventfd), + 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 286: syscalls.Supported("timerfd_settime", TimerfdSettime), + 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 288: syscalls.Supported("accept4", Accept4), + 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 290: syscalls.Supported("eventfd2", Eventfd2), + 291: syscalls.Supported("epoll_create1", EpollCreate1), + 292: syscalls.Supported("dup3", Dup3), + 293: syscalls.Supported("pipe2", Pipe2), + 294: syscalls.Supported("inotify_init1", InotifyInit1), + 295: syscalls.Supported("preadv", Preadv), + 296: syscalls.Supported("pwritev", Pwritev), + 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Supported("prlimit64", Prlimit64), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Supported("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Supported("seccomp", Seccomp), + 318: syscalls.Supported("getrandom", GetRandom), + 319: syscalls.Supported("memfd_create", MemfdCreate), + 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), + 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 322: syscalls.Supported("execveat", Execveat), + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls implemented after 325 are "backports" from versions + // of Linux after 4.4. + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 327: syscalls.Supported("preadv2", Preadv2), + 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 332: syscalls.Supported("statx", Statx), + 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{ + 0xffffffffff600000: 96, // vsyscall gettimeofday(2) + 0xffffffffff600400: 201, // vsyscall time(2) + 0xffffffffff600800: 309, // vsyscall getcpu(2) + }, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} + +// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var ARM64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.ARM64, + Version: kernel.Version{ + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_AARCH64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), + 17: syscalls.Supported("getcwd", Getcwd), + 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 19: syscalls.Supported("eventfd2", Eventfd2), + 20: syscalls.Supported("epoll_create1", EpollCreate1), + 21: syscalls.Supported("epoll_ctl", EpollCtl), + 22: syscalls.Supported("epoll_pwait", EpollPwait), + 23: syscalls.Supported("dup", Dup), + 24: syscalls.Supported("dup3", Dup3), + 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 26: syscalls.Supported("inotify_init1", InotifyInit1), + 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 33: syscalls.Supported("mknodat", Mknodat), + 34: syscalls.Supported("mkdirat", Mkdirat), + 35: syscalls.Supported("unlinkat", Unlinkat), + 36: syscalls.Supported("symlinkat", Symlinkat), + 37: syscalls.Supported("linkat", Linkat), + 38: syscalls.Supported("renameat", Renameat), + 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 45: syscalls.Supported("truncate", Truncate), + 46: syscalls.Supported("ftruncate", Ftruncate), + 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 48: syscalls.Supported("faccessat", Faccessat), + 49: syscalls.Supported("chdir", Chdir), + 50: syscalls.Supported("fchdir", Fchdir), + 51: syscalls.Supported("chroot", Chroot), + 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 53: syscalls.Supported("fchmodat", Fchmodat), + 54: syscalls.Supported("fchownat", Fchownat), + 55: syscalls.Supported("fchown", Fchown), + 56: syscalls.Supported("openat", Openat), + 57: syscalls.Supported("close", Close), + 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 59: syscalls.Supported("pipe2", Pipe2), + 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 61: syscalls.Supported("getdents64", Getdents64), + 62: syscalls.Supported("lseek", Lseek), + 63: syscalls.Supported("read", Read), + 64: syscalls.Supported("write", Write), + 65: syscalls.Supported("readv", Readv), + 66: syscalls.Supported("writev", Writev), + 67: syscalls.Supported("pread64", Pread64), + 68: syscalls.Supported("pwrite64", Pwrite64), + 69: syscalls.Supported("preadv", Preadv), + 70: syscalls.Supported("pwritev", Pwritev), + 71: syscalls.Supported("sendfile", Sendfile), + 72: syscalls.Supported("pselect", Pselect), + 73: syscalls.Supported("ppoll", Ppoll), + 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 77: syscalls.Supported("tee", Tee), + 78: syscalls.Supported("readlinkat", Readlinkat), + 79: syscalls.Supported("fstatat", Fstatat), + 80: syscalls.Supported("fstat", Fstat), + 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 85: syscalls.Supported("timerfd_create", TimerfdCreate), + 86: syscalls.Supported("timerfd_settime", TimerfdSettime), + 87: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 88: syscalls.Supported("utimensat", Utimensat), + 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 90: syscalls.Supported("capget", Capget), + 91: syscalls.Supported("capset", Capset), + 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 93: syscalls.Supported("exit", Exit), + 94: syscalls.Supported("exit_group", ExitGroup), + 95: syscalls.Supported("waitid", Waitid), + 96: syscalls.Supported("set_tid_address", SetTidAddress), + 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 101: syscalls.Supported("nanosleep", Nanosleep), + 102: syscalls.Supported("getitimer", Getitimer), + 103: syscalls.Supported("setitimer", Setitimer), + 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 107: syscalls.Supported("timer_create", TimerCreate), + 108: syscalls.Supported("timer_gettime", TimerGettime), + 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 110: syscalls.Supported("timer_settime", TimerSettime), + 111: syscalls.Supported("timer_delete", TimerDelete), + 112: syscalls.Supported("clock_settime", ClockSettime), + 113: syscalls.Supported("clock_gettime", ClockGettime), + 114: syscalls.Supported("clock_getres", ClockGetres), + 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 124: syscalls.Supported("sched_yield", SchedYield), + 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 128: syscalls.Supported("restart_syscall", RestartSyscall), + 129: syscalls.Supported("kill", Kill), + 130: syscalls.Supported("tkill", Tkill), + 131: syscalls.Supported("tgkill", Tgkill), + 132: syscalls.Supported("sigaltstack", Sigaltstack), + 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 134: syscalls.Supported("rt_sigaction", RtSigaction), + 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 136: syscalls.Supported("rt_sigpending", RtSigpending), + 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 139: syscalls.Supported("rt_sigreturn", RtSigreturn), + 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 143: syscalls.Supported("setregid", Setregid), + 144: syscalls.Supported("setgid", Setgid), + 145: syscalls.Supported("setreuid", Setreuid), + 146: syscalls.Supported("setuid", Setuid), + 147: syscalls.Supported("setresuid", Setresuid), + 148: syscalls.Supported("getresuid", Getresuid), + 149: syscalls.Supported("setresgid", Setresgid), + 150: syscalls.Supported("getresgid", Getresgid), + 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 153: syscalls.Supported("times", Times), + 154: syscalls.Supported("setpgid", Setpgid), + 155: syscalls.Supported("getpgid", Getpgid), + 156: syscalls.Supported("getsid", Getsid), + 157: syscalls.Supported("setsid", Setsid), + 158: syscalls.Supported("getgroups", Getgroups), + 159: syscalls.Supported("setgroups", Setgroups), + 160: syscalls.Supported("uname", Uname), + 161: syscalls.Supported("sethostname", Sethostname), + 162: syscalls.Supported("setdomainname", Setdomainname), + 163: syscalls.Supported("getrlimit", Getrlimit), + 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 166: syscalls.Supported("umask", Umask), + 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 168: syscalls.Supported("getcpu", Getcpu), + 169: syscalls.Supported("gettimeofday", Gettimeofday), + 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 172: syscalls.Supported("getpid", Getpid), + 173: syscalls.Supported("getppid", Getppid), + 174: syscalls.Supported("getuid", Getuid), + 175: syscalls.Supported("geteuid", Geteuid), + 176: syscalls.Supported("getgid", Getgid), + 177: syscalls.Supported("getegid", Getegid), + 178: syscalls.Supported("gettid", Gettid), + 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 190: syscalls.Supported("semget", Semget), + 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 197: syscalls.Supported("shmdt", Shmdt), + 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 199: syscalls.Supported("socketpair", SocketPair), + 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 201: syscalls.Supported("listen", Listen), + 202: syscalls.Supported("accept", Accept), + 203: syscalls.Supported("connect", Connect), + 204: syscalls.Supported("getsockname", GetSockName), + 205: syscalls.Supported("getpeername", GetPeerName), + 206: syscalls.Supported("sendto", SendTo), + 207: syscalls.Supported("recvfrom", RecvFrom), + 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 211: syscalls.Supported("sendmsg", SendMsg), + 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 213: syscalls.Supported("readahead", Readahead), + 214: syscalls.Supported("brk", Brk), + 215: syscalls.Supported("munmap", Munmap), + 216: syscalls.Supported("mremap", Mremap), + 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 221: syscalls.Supported("execve", Execve), + 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 226: syscalls.Supported("mprotect", Mprotect), + 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 242: syscalls.Supported("accept4", Accept4), + 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 260: syscalls.Supported("wait4", Wait4), + 261: syscalls.Supported("prlimit64", Prlimit64), + 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 277: syscalls.Supported("seccomp", Seccomp), + 278: syscalls.Supported("getrandom", GetRandom), + 279: syscalls.Supported("memfd_create", MemfdCreate), + 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 281: syscalls.Supported("execveat", Execveat), + 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls after 284 are "backports" from versions of Linux after 4.4. + 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 286: syscalls.Supported("preadv2", Preadv2), + 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 291: syscalls.Supported("statx", Statx), + 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{}, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} + +func init() { + kernel.RegisterSyscallTable(AMD64) + kernel.RegisterSyscallTable(ARM64) +} diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go new file mode 100644 index 000000000..434559b80 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -0,0 +1,71 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and +// STOP are clear. +// +// TODO(gvisor.dev/issue/1624): This is only exported because +// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2 +// syscalls are moved into this package, then they can be unexported. +func CopyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { + if size != linux.SignalSetSize { + return 0, syserror.EINVAL + } + b := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { + return 0, err + } + mask := usermem.ByteOrder.Uint64(b[:]) + return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil +} + +// copyOutSigSet copies out a sigset_t. +func copyOutSigSet(t *kernel.Task, sigSetAddr usermem.Addr, mask linux.SignalSet) error { + b := t.CopyScratchBuffer(8) + usermem.ByteOrder.PutUint64(b, uint64(mask)) + _, err := t.CopyOutBytes(sigSetAddr, b) + return err +} + +// copyInSigSetWithSize copies in a structure as below +// +// struct { +// sigset_t* sigset_addr; +// size_t sizeof_sigset; +// }; +// +// and returns sigset_addr and size. +func copyInSigSetWithSize(t *kernel.Task, addr usermem.Addr) (usermem.Addr, uint, error) { + switch t.Arch().Width() { + case 8: + in := t.CopyScratchBuffer(16) + if _, err := t.CopyInBytes(addr, in); err != nil { + return 0, 0, err + } + maskAddr := usermem.Addr(usermem.ByteOrder.Uint64(in[0:])) + maskSize := uint(usermem.ByteOrder.Uint64(in[8:])) + return maskAddr, maskSize, nil + default: + return 0, 0, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go new file mode 100644 index 000000000..ba2557c52 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -0,0 +1,382 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// IoSetup implements linux syscall io_setup(2). +func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nrEvents := args[0].Int() + idAddr := args[1].Pointer() + + // Linux uses the native long as the aio ID. + // + // The context pointer _must_ be zero initially. + var idIn uint64 + if _, err := t.CopyIn(idAddr, &idIn); err != nil { + return 0, nil, err + } + if idIn != 0 { + return 0, nil, syserror.EINVAL + } + + id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) + if err != nil { + return 0, nil, err + } + + // Copy out the new ID. + if _, err := t.CopyOut(idAddr, &id); err != nil { + t.MemoryManager().DestroyAIOContext(t, id) + return 0, nil, err + } + + return 0, nil, nil +} + +// IoDestroy implements linux syscall io_destroy(2). +func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + + ctx := t.MemoryManager().DestroyAIOContext(t, id) + if ctx == nil { + // Does not exist. + return 0, nil, syserror.EINVAL + } + + // Drain completed requests amd wait for pending requests until there are no + // more. + for { + ctx.Drain() + + ch := ctx.WaitChannel() + if ch == nil { + // No more requests, we're done. + return 0, nil, nil + } + // The task cannot be interrupted during the wait. Equivalent to + // TASK_UNINTERRUPTIBLE in Linux. + t.UninterruptibleSleepStart(true /* deactivate */) + <-ch + t.UninterruptibleSleepFinish(true /* activate */) + } +} + +// IoGetevents implements linux syscall io_getevents(2). +func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + minEvents := args[1].Int() + events := args[2].Int() + eventsAddr := args[3].Pointer() + timespecAddr := args[4].Pointer() + + // Sanity check arguments. + if minEvents < 0 || minEvents > events { + return 0, nil, syserror.EINVAL + } + + ctx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return 0, nil, syserror.EINVAL + } + + // Setup the timeout. + var haveDeadline bool + var deadline ktime.Time + if timespecAddr != 0 { + d, err := copyTimespecIn(t, timespecAddr) + if err != nil { + return 0, nil, err + } + if !d.Valid() { + return 0, nil, syserror.EINVAL + } + deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration()) + haveDeadline = true + } + + // Loop over all requests. + for count := int32(0); count < events; count++ { + // Get a request, per semantics. + var v interface{} + if count >= minEvents { + var ok bool + v, ok = ctx.PopRequest() + if !ok { + return uintptr(count), nil, nil + } + } else { + var err error + v, err = waitForRequest(ctx, t, haveDeadline, deadline) + if err != nil { + if count > 0 || err == syserror.ETIMEDOUT { + return uintptr(count), nil, nil + } + return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + } + } + + ev := v.(*linux.IOEvent) + + // Copy out the result. + if _, err := t.CopyOut(eventsAddr, ev); err != nil { + if count > 0 { + return uintptr(count), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Keep rolling. + eventsAddr += usermem.Addr(linux.IOEventSize) + } + + // Everything finished. + return uintptr(events), nil, nil +} + +func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) { + for { + if v, ok := ctx.PopRequest(); ok { + // Request was readily available. Just return it. + return v, nil + } + + // Need to wait for request completion. + done := ctx.WaitChannel() + if done == nil { + // Context has been destroyed. + return nil, syserror.EINVAL + } + if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { + return nil, err + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} + +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} + +// LINT.IfChange + +func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback { + return func(ctx context.Context) { + if actx.Dead() { + actx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_FSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + case linux.IOCB_CMD_FDSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData) + } + + // Update the result. + if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } + + file.DecRef() + + // Queue the result for delivery. + actx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef() + } + } +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { + file := t.GetFile(cb.FD) + if file == nil { + // File not found. + return syserror.EBADF + } + defer file.DecRef() + + // Was there an eventFD? Extract it. + var eventFile *fs.File + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { + eventFile = t.GetFile(cb.ResFD) + if eventFile == nil { + // Bad FD. + return syserror.EBADF + } + defer eventFile.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { + // Not an event FD. + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Check offset for reads/writes. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + + // Prepare the request. + ctx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := ctx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFile != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFile.IncRef() + } + + // Perform the request asynchronously. + file.IncRef() + t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile)) + + // All set. + return nil +} + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb linux.IOCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// LINT.ThenChange(vfs2/aio.go) diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go new file mode 100644 index 000000000..adf5ea5f2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -0,0 +1,149 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) { + if tid < 0 { + err = syserror.EINVAL + return + } + if tid > 0 { + t = t.PIDNamespace().TaskWithID(tid) + } + if t == nil { + err = syserror.ESRCH + return + } + creds := t.Credentials() + permitted, inheritable, effective = creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps + return +} + +// Capget implements Linux syscall capget. +func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + hdrAddr := args[0].Pointer() + dataAddr := args[1].Pointer() + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + // hdr.Pid doesn't need to be valid if this capget() is a "version probe" + // (hdr.Version is unrecognized and dataAddr is null), so we can't do the + // lookup yet. + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + if dataAddr == 0 { + return 0, nil, nil + } + p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) + if err != nil { + return 0, nil, err + } + data := linux.CapUserData{ + Effective: uint32(e), + Permitted: uint32(p), + Inheritable: uint32(i), + } + _, err = t.CopyOut(dataAddr, &data) + return 0, nil, err + + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + if dataAddr == 0 { + return 0, nil, nil + } + p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) + if err != nil { + return 0, nil, err + } + data := [2]linux.CapUserData{ + { + Effective: uint32(e), + Permitted: uint32(p), + Inheritable: uint32(i), + }, + { + Effective: uint32(e >> 32), + Permitted: uint32(p >> 32), + Inheritable: uint32(i >> 32), + }, + } + _, err = t.CopyOut(dataAddr, &data) + return 0, nil, err + + default: + hdr.Version = linux.HighestCapabilityVersion + if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + if dataAddr != 0 { + return 0, nil, syserror.EINVAL + } + return 0, nil, nil + } +} + +// Capset implements Linux syscall capset. +func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + hdrAddr := args[0].Pointer() + dataAddr := args[1].Pointer() + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { + return 0, nil, syserror.EPERM + } + var data linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return 0, nil, err + } + p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities + i := auth.CapabilitySet(data.Inheritable) & auth.AllCapabilities + e := auth.CapabilitySet(data.Effective) & auth.AllCapabilities + return 0, nil, t.SetCapabilitySets(p, i, e) + + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { + return 0, nil, syserror.EPERM + } + var data [2]linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return 0, nil, err + } + p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities + i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities + e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities + return 0, nil, t.SetCapabilitySets(p, i, e) + + default: + hdr.Version = linux.HighestCapabilityVersion + if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go new file mode 100644 index 000000000..dd43cf18d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// Clone implements linux syscall clone(2). +// sys_clone has so many flavors. We implement the default one in linux 3.11 +// x86_64: +// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) +func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + stack := args[1].Pointer() + parentTID := args[2].Pointer() + childTID := args[3].Pointer() + tls := args[4].Pointer() + return clone(t, flags, stack, parentTID, childTID, tls) +} diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go new file mode 100644 index 000000000..cf68a8949 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// Clone implements linux syscall clone(2). +// sys_clone has so many flavors, and we implement the default one in linux 3.11 +// arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file): +// sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr) +func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + stack := args[1].Pointer() + parentTID := args[2].Pointer() + tls := args[3].Pointer() + childTID := args[4].Pointer() + return clone(t, flags, stack, parentTID, childTID, tls) +} diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go new file mode 100644 index 000000000..7f460d30b --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -0,0 +1,147 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// EpollCreate1 implements the epoll_create1(2) linux syscall. +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags & ^linux.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + closeOnExec := flags&linux.EPOLL_CLOEXEC != 0 + fd, err := syscalls.CreateEpoll(t, closeOnExec) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// EpollCreate implements the epoll_create(2) linux syscall. +func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + fd, err := syscalls.CreateEpoll(t, false) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// EpollCtl implements the epoll_ctl(2) linux syscall. +func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + op := args[1].Int() + fd := args[2].Int() + eventAddr := args[3].Pointer() + + // Capture the event state if needed. + flags := epoll.EntryFlags(0) + mask := waiter.EventMask(0) + var data [2]int32 + if op != linux.EPOLL_CTL_DEL { + var e linux.EpollEvent + if _, err := e.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + + if e.Events&linux.EPOLLONESHOT != 0 { + flags |= epoll.OneShot + } + + if e.Events&linux.EPOLLET != 0 { + flags |= epoll.EdgeTriggered + } + + mask = waiter.EventMaskFromLinux(e.Events) + data = e.Data + } + + // Perform the requested operations. + switch op { + case linux.EPOLL_CTL_ADD: + // See fs/eventpoll.c. + mask |= waiter.EventHUp | waiter.EventErr + return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data) + case linux.EPOLL_CTL_DEL: + return 0, nil, syscalls.RemoveEpoll(t, epfd, fd) + case linux.EPOLL_CTL_MOD: + // Same as EPOLL_CTL_ADD. + mask |= waiter.EventHUp | waiter.EventErr + return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) + default: + return 0, nil, syserror.EINVAL + } +} + +// EpollWait implements the epoll_wait(2) linux syscall. +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + r, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout) + if err != nil { + return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + } + + if len(r) != 0 { + if _, err := linux.CopyEpollEventSliceOut(t, eventsAddr, r); err != nil { + return 0, nil, err + } + } + + return uintptr(len(r)), nil, nil +} + +// EpollPwait implements the epoll_pwait(2) linux syscall. +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if maskAddr != 0 { + mask, err := CopyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + return EpollWait(t, args) +} + +// LINT.ThenChange(vfs2/epoll.go) diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go new file mode 100644 index 000000000..ed3413ca6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -0,0 +1,56 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Eventfd2 implements linux syscall eventfd2(2). +func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + initVal := args[0].Int() + flags := uint(args[1].Uint()) + allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) + + if flags & ^allOps != 0 { + return 0, nil, syserror.EINVAL + } + + event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0) + event.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&linux.EFD_NONBLOCK != 0, + }) + defer event.DecRef() + + fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ + CloseOnExec: flags&linux.EFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// Eventfd implements linux syscall eventfd(2). +func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[1].Value = 0 + return Eventfd2(t, args) +} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go new file mode 100644 index 000000000..2797c6a72 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -0,0 +1,2238 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fileOpAt performs an operation on the second last component in the path. +func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { + // Extract the last component. + dir, name := fs.SplitLast(path) + if dir == "/" { + // Common case: we are accessing a file in the root. + root := t.FSContext().RootDirectory() + err := fn(root, root, name, linux.MaxSymlinkTraversals) + root.DecRef() + return err + } else if dir == "." && dirFD == linux.AT_FDCWD { + // Common case: we are accessing a file relative to the current + // working directory; skip the look-up. + wd := t.FSContext().WorkingDirectory() + root := t.FSContext().RootDirectory() + err := fn(root, wd, name, linux.MaxSymlinkTraversals) + wd.DecRef() + root.DecRef() + return err + } + + return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error { + return fn(root, d, name, remainingTraversals) + }) +} + +// fileOpOn performs an operation on the last entry of the path. +func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { + var ( + d *fs.Dirent // The file. + wd *fs.Dirent // The working directory (if required.) + rel *fs.Dirent // The relative directory for search (if required.) + f *fs.File // The file corresponding to dirFD (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + wd = t.FSContext().WorkingDirectory() + rel = wd + } else { + // Need to extract the given FD. + f = t.GetFile(dirFD) + if f == nil { + return syserror.EBADF + } + rel = f.Dirent + if !fs.IsDir(rel.Inode.StableAttr) { + return syserror.ENOTDIR + } + } + + // Grab the root (always required.) + root := t.FSContext().RootDirectory() + + // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) + if resolve { + d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals) + } else { + d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) + } + root.DecRef() + if wd != nil { + wd.DecRef() + } + if f != nil { + f.DecRef() + } + if err != nil { + return err + } + + err = fn(root, d, remainingTraversals) + d.DecRef() + return err +} + +// copyInPath copies a path in. +func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) { + path, err = t.CopyInString(addr, linux.PATH_MAX) + if err != nil { + return "", false, err + } + if path == "" && !allowEmpty { + return "", false, syserror.ENOENT + } + + // If the path ends with a /, then checks must be enforced in various + // ways in the different callers. We pass this back to the caller. + path, dirPath = fs.TrimTrailingSlashes(path) + + return path, dirPath, nil +} + +// LINT.IfChange + +func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + + resolve := flags&linux.O_NOFOLLOW == 0 + err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // First check a few things about the filesystem before trying to get the file + // reference. + // + // It's required that Check does not try to open files not that aren't backed by + // this dirent (e.g. pipes and sockets) because this would result in opening these + // files an extra time just to check permissions. + if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + return err + } + + if fs.IsSymlink(d.Inode.StableAttr) && !resolve { + return syserror.ELOOP + } + + fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. + fileFlags.LargeFile = true + if fs.IsDir(d.Inode.StableAttr) { + // Don't allow directories to be opened writable. + if fileFlags.Write { + return syserror.EISDIR + } + } else { + // If O_DIRECTORY is set, but the file is not a directory, then fail. + if fileFlags.Directory { + return syserror.ENOTDIR + } + // If it's a directory, then make sure. + if dirPath { + return syserror.ENOTDIR + } + } + + // Truncate is called when O_TRUNC is specified for any kind of + // existing Dirent. Behavior is delegated to the entry's Truncate + // implementation. + if flags&linux.O_TRUNC != 0 { + if err := d.Inode.Truncate(t, d, 0); err != nil { + return err + } + } + + file, err := d.Inode.GetFile(t, d, fileFlags) + if err != nil { + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + defer file.DecRef() + + // Success. + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return err + } + + // Set return result in frame. + fd = uintptr(newFD) + + // Generate notification for opened file. + d.InotifyEvent(linux.IN_OPEN, 0) + + return nil + }) + return fd, err // Use result in frame. +} + +func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Do we have the appropriate permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Attempt a creation. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + + switch mode.FileType() { + case 0: + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + fallthrough + case linux.ModeRegular: + // We are not going to return the file, so the actual + // flags used don't matter, but they cannot be empty or + // Create will complain. + flags := fs.FileFlags{Read: true, Write: true} + file, err := d.Create(t, root, name, flags, perms) + if err != nil { + return err + } + file.DecRef() + return nil + + case linux.ModeNamedPipe: + return d.CreateFifo(t, root, name, perms) + + case linux.ModeSocket: + // While it is possible create a unix domain socket file on linux + // using mknod(2), in practice this is pretty useless from an + // application. Linux internally uses mknod() to create the socket + // node during bind(2), but we implement bind(2) independently. If + // an application explicitly creates a socket node using mknod(), + // you can't seem to bind() or connect() to the resulting socket. + // + // Instead of emulating this seemingly useless behaviour, we'll + // indicate that the filesystem doesn't support the creation of + // sockets. + return syserror.EOPNOTSUPP + + case linux.ModeCharacterDevice: + fallthrough + case linux.ModeBlockDevice: + // TODO(b/72101894): We don't support creating block or character + // devices at the moment. + // + // When we start supporting block and character devices, we'll + // need to check for CAP_MKNOD here. + return syserror.EPERM + + default: + // "EINVAL - mode requested creation of something other than a + // regular file, device special file, FIFO or socket." - mknod(2) + return syserror.EINVAL + } + }) +} + +// Mknod implements the linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + path := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + // We don't need this argument until we support creation of device nodes. + _ = args[2].Uint() // dev + + return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode) +} + +// Mknodat implements the linux syscall mknodat(2). +func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + path := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + // We don't need this argument until we support creation of device nodes. + _ = args[3].Uint() // dev + + return 0, nil, mknodAt(t, dirFD, path, mode) +} + +func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + if dirPath { + return 0, syserror.ENOENT + } + + fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. + fileFlags.LargeFile = true + + err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error { + // Resolve the name to see if it exists, and follow any + // symlinks along the way. We must do the symlink resolution + // manually because if the symlink target does not exist, we + // must create the target (and not the symlink itself). + var ( + found *fs.Dirent + err error + ) + for { + if !fs.IsDir(parent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Start by looking up the dirent at 'name'. + found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals) + if err != nil { + break + } + defer found.DecRef() + + // We found something (possibly a symlink). If the + // O_EXCL flag was passed, then we can immediately + // return EEXIST. + if flags&linux.O_EXCL != 0 { + return syserror.EEXIST + } + + // If we have a non-symlink, then we can proceed. + if !fs.IsSymlink(found.Inode.StableAttr) { + break + } + + // If O_NOFOLLOW was passed, then don't try to resolve + // anything. + if flags&linux.O_NOFOLLOW != 0 { + return syserror.ELOOP + } + + // Try to resolve the symlink directly to a Dirent. + var resolved *fs.Dirent + resolved, err = found.Inode.Getlink(t) + if err == nil { + // No more resolution necessary. + defer resolved.DecRef() + break + } + if err != fs.ErrResolveViaReadlink { + return err + } + + // Are we able to resolve further? + if remainingTraversals == 0 { + return syscall.ELOOP + } + + // Resolve the symlink to a path via Readlink. + var path string + path, err = found.Inode.Readlink(t) + if err != nil { + break + } + remainingTraversals-- + + // Get the new parent from the target path. + var newParent *fs.Dirent + newParentPath, newName := fs.SplitLast(path) + newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) + if err != nil { + break + } + defer newParent.DecRef() + + // Repeat the process with the parent and name of the + // symlink target. + parent = newParent + name = newName + } + + var newFile *fs.File + switch err { + case nil: + // Like sys_open, check for a few things about the + // filesystem before trying to get a reference to the + // fs.File. The same constraints on Check apply. + if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + return err + } + + // Truncate is called when O_TRUNC is specified for any kind of + // existing Dirent. Behavior is delegated to the entry's Truncate + // implementation. + if flags&linux.O_TRUNC != 0 { + if err := found.Inode.Truncate(t, found, 0); err != nil { + return err + } + } + + // Create a new fs.File. + newFile, err = found.Inode.GetFile(t, found, fileFlags) + if err != nil { + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + defer newFile.DecRef() + case syserror.ENOENT: + // File does not exist. Proceed with creation. + + // Do we have write permissions on the parent? + if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Attempt a creation. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + newFile, err = parent.Create(t, root, name, fileFlags, perms) + if err != nil { + // No luck, bail. + return err + } + defer newFile.DecRef() + found = newFile.Dirent + default: + return err + } + + // Success. + newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return err + } + + // Set result in frame. + fd = uintptr(newFD) + + // Queue the open inotify event. The creation event is + // automatically queued when the dirent is found. The open + // events are implemented at the syscall layer so we need to + // manually queue one here. + found.InotifyEvent(linux.IN_OPEN, 0) + + return nil + }) + return fd, err // Use result in frame. +} + +// Open implements linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := uint(args[1].Uint()) + if flags&linux.O_CREAT != 0 { + mode := linux.FileMode(args[2].ModeT()) + n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode) + return n, nil, err + } + n, err := openAt(t, linux.AT_FDCWD, addr, flags) + return n, nil, err +} + +// Openat implements linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + addr := args[1].Pointer() + flags := uint(args[2].Uint()) + if flags&linux.O_CREAT != 0 { + mode := linux.FileMode(args[3].ModeT()) + n, err := createAt(t, dirFD, addr, flags, mode) + return n, nil, err + } + n, err := openAt(t, dirFD, addr, flags) + return n, nil, err +} + +// Creat implements linux syscall creat(2). +func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode) + return n, nil, err +} + +// accessContext is a context that overrides the credentials used, but +// otherwise carries the same values as the embedded context. +// +// accessContext should only be used for access(2). +type accessContext struct { + context.Context + creds *auth.Credentials +} + +// Value implements context.Context. +func (ac accessContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return ac.creds + default: + return ac.Context.Value(key) + } +} + +func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error { + const rOK = 4 + const wOK = 2 + const xOK = 1 + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + // Sanity check the mode. + if mode&^(rOK|wOK|xOK) != 0 { + return syserror.EINVAL + } + + return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // access(2) and faccessat(2) check permissions using real + // UID/GID, not effective UID/GID. + // + // "access() needs to use the real uid/gid, not the effective + // uid/gid. We do this by temporarily clearing all FS-related + // capabilities and switching the fsuid/fsgid around to the + // real ones." -fs/open.c:faccessat + creds := t.Credentials().Fork() + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID + if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { + creds.EffectiveCaps = creds.PermittedCaps + } else { + creds.EffectiveCaps = 0 + } + + ctx := &accessContext{ + Context: t, + creds: creds, + } + + return d.Inode.CheckPermission(ctx, fs.PermMask{ + Read: mode&rOK != 0, + Write: mode&wOK != 0, + Execute: mode&xOK != 0, + }) + }) +} + +// Access implements linux syscall access(2). +func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + + return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode) +} + +// Faccessat implements linux syscall faccessat(2). +// +// Note that the faccessat() system call does not take a flags argument: +// "The raw faccessat() system call takes only the first three arguments. The +// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within +// the glibc wrapper function for faccessat(). If either of these flags is +// specified, then the wrapper function employs fstatat(2) to determine access +// permissions." - faccessat(2) +func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + + return 0, nil, accessAt(t, dirFD, addr, mode) +} + +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + +// Ioctl implements linux syscall ioctl(2). +func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + request := int(args[1].Int()) + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Shared flags between file and socket. + switch request { + case linux.FIONCLEX: + t.FDTable().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: false, + }) + return 0, nil, nil + case linux.FIOCLEX: + t.FDTable().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: true, + }) + return 0, nil, nil + + case linux.FIONBIO: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.Flags() + if set != 0 { + flags.NonBlocking = true + } else { + flags.NonBlocking = false + } + file.SetFlags(flags.Settable()) + return 0, nil, nil + + case linux.FIOASYNC: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.Flags() + if set != 0 { + flags.Async = true + } else { + flags.Async = false + } + file.SetFlags(flags.Settable()) + return 0, nil, nil + + case linux.FIOSETOWN, linux.SIOCSPGRP: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + fSetOwn(t, file, set) + return 0, nil, nil + + case linux.FIOGETOWN, linux.SIOCGPGRP: + who := fGetOwn(t, file) + _, err := t.CopyOut(args[2].Pointer(), &who) + return 0, nil, err + + default: + ret, err := file.FileOperations.Ioctl(t, file, t.MemoryManager(), args) + if err != nil { + return 0, nil, err + } + + return ret, nil, nil + } +} + +// LINT.ThenChange(vfs2/ioctl.go) + +// LINT.IfChange + +// Getcwd implements the linux syscall getcwd(2). +func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + size := args[1].SizeT() + cwd := t.FSContext().WorkingDirectory() + defer cwd.DecRef() + root := t.FSContext().RootDirectory() + defer root.DecRef() + + // Get our fullname from the root and preprend unreachable if the root was + // unreachable from our current dirent this is the same behavior as on linux. + s, reachable := cwd.FullName(root) + if !reachable { + s = "(unreachable)" + s + } + + // Note this is >= because we need a terminator. + if uint(len(s)) >= size { + return 0, nil, syserror.ERANGE + } + + // Copy out the path name for the node. + bytes, err := t.CopyOutBytes(addr, []byte(s)) + if err != nil { + return 0, nil, err + } + + // Top it off with a terminator. + _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00")) + return uintptr(bytes + 1), nil, err +} + +// Chroot implements the linux syscall chroot(2). +func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + if !t.HasCapability(linux.CAP_SYS_CHROOT) { + return 0, nil, syserror.EPERM + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // Is it a directory? + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does it have execute permissions? + if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { + return err + } + + t.FSContext().SetRootDirectory(d) + return nil + }) +} + +// Chdir implements the linux syscall chdir(2). +func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // Is it a directory? + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does it have execute permissions? + if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { + return err + } + + t.FSContext().SetWorkingDirectory(d) + return nil + }) +} + +// Fchdir implements the linux syscall fchdir(2). +func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is it a directory? + if !fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ENOTDIR + } + + // Does it have execute permissions? + if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { + return 0, nil, err + } + + t.FSContext().SetWorkingDirectory(file.Dirent) + return 0, nil, nil +} + +// LINT.ThenChange(vfs2/fscontext.go) + +// LINT.IfChange + +// Close implements linux syscall close(2). +func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + file, _ := t.FDTable().Remove(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Flush(t) + return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) +} + +// Dup implements linux syscall dup(2). +func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, syserror.EMFILE + } + return uintptr(newFD), nil, nil +} + +// Dup2 implements linux syscall dup2(2). +func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + + // If oldfd is a valid file descriptor, and newfd has the same value as oldfd, + // then dup2() does nothing, and returns newfd. + if oldfd == newfd { + oldFile := t.GetFile(oldfd) + if oldFile == nil { + return 0, nil, syserror.EBADF + } + defer oldFile.DecRef() + + return uintptr(newfd), nil, nil + } + + // Zero out flags arg to be used by Dup3. + args[2].Value = 0 + return Dup3(t, args) +} + +// Dup3 implements linux syscall dup3(2). +func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + oldFile := t.GetFile(oldfd) + if oldFile == nil { + return 0, nil, syserror.EBADF + } + defer oldFile.DecRef() + + err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) + if err != nil { + return 0, nil, err + } + + return uintptr(newfd), nil, nil +} + +func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx { + ma := file.Async(nil) + if ma == nil { + return linux.FOwnerEx{} + } + a := ma.(*fasync.FileAsync) + ot, otg, opg := a.Owner() + switch { + case ot != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_TID, + PID: int32(t.PIDNamespace().IDOfTask(ot)), + } + case otg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PID, + PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), + } + case opg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PGRP, + PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), + } + default: + return linux.FOwnerEx{} + } +} + +func fGetOwn(t *kernel.Task, file *fs.File) int32 { + owner := fGetOwnEx(t, file) + if owner.Type == linux.F_OWNER_PGRP { + return -owner.PID + } + return owner.PID +} + +// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux. +// +// If who is positive, it represents a PID. If negative, it represents a PGID. +// If the PID or PGID is invalid, the owner is silently unset. +func fSetOwn(t *kernel.Task, file *fs.File, who int32) error { + a := file.Async(fasync.New).(*fasync.FileAsync) + if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return syserror.EINVAL + } + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) + a.SetOwnerProcessGroup(t, pg) + } else { + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) + a.SetOwnerThreadGroup(t, tg) + } + return nil +} + +// Fcntl implements linux syscall fcntl(2). +func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + cmd := args[1].Int() + + file, flags := t.FDTable().Get(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: + from := args[2].Int() + fd, err := t.NewFDFrom(from, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case linux.F_GETFD: + return uintptr(flags.ToLinuxFDFlags()), nil, nil + case linux.F_SETFD: + flags := args[2].Uint() + err := t.FDTable().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: flags&linux.FD_CLOEXEC != 0, + }) + return 0, nil, err + case linux.F_GETFL: + return uintptr(file.Flags().ToLinux()), nil, nil + case linux.F_SETFL: + flags := uint(args[2].Uint()) + file.SetFlags(linuxToFlags(flags).Settable()) + return 0, nil, nil + case linux.F_SETLK, linux.F_SETLKW: + // In Linux the file system can choose to provide lock operations for an inode. + // Normally pipe and socket types lack lock operations. We diverge and use a heavy + // hammer by only allowing locks on files and directories. + if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EBADF + } + + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := t.CopyIn(flockAddr, &flock); err != nil { + return 0, nil, err + } + + // Compute the lock whence. + var sw fs.SeekWhence + switch flock.Whence { + case 0: + sw = fs.SeekSet + case 1: + sw = fs.SeekCurrent + case 2: + sw = fs.SeekEnd + default: + return 0, nil, syserror.EINVAL + } + + // Compute the lock offset. + var off int64 + switch sw { + case fs.SeekSet: + off = 0 + case fs.SeekCurrent: + // Note that Linux does not hold any mutexes while retrieving the file offset, + // see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + off = file.Offset() + case fs.SeekEnd: + uattr, err := file.Dirent.Inode.UnstableAttr(t) + if err != nil { + return 0, nil, err + } + off = uattr.Size + default: + return 0, nil, syserror.EINVAL + } + + // Compute the lock range. + rng, err := lock.ComputeRange(flock.Start, flock.Len, off) + if err != nil { + return 0, nil, err + } + + // These locks don't block; execute the non-blocking operation using the inode's lock + // context directly. + switch flock.Type { + case linux.F_RDLCK: + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + if cmd == linux.F_SETLK { + // Non-blocking lock, provide a nil lock.Blocker. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) { + return 0, nil, syserror.EAGAIN + } + } else { + // Blocking lock, pass in the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + return 0, nil, nil + case linux.F_WRLCK: + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + if cmd == linux.F_SETLK { + // Non-blocking lock, provide a nil lock.Blocker. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) { + return 0, nil, syserror.EAGAIN + } + } else { + // Blocking lock, pass in the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + return 0, nil, nil + case linux.F_UNLCK: + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng) + return 0, nil, nil + default: + return 0, nil, syserror.EINVAL + } + case linux.F_GETOWN: + return uintptr(fGetOwn(t, file)), nil, nil + case linux.F_SETOWN: + return 0, nil, fSetOwn(t, file, args[2].Int()) + case linux.F_GETOWN_EX: + addr := args[2].Pointer() + owner := fGetOwnEx(t, file) + _, err := t.CopyOut(addr, &owner) + return 0, nil, err + case linux.F_SETOWN_EX: + addr := args[2].Pointer() + var owner linux.FOwnerEx + n, err := t.CopyIn(addr, &owner) + if err != nil { + return 0, nil, err + } + a := file.Async(fasync.New).(*fasync.FileAsync) + switch owner.Type { + case linux.F_OWNER_TID: + task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID)) + if task == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerTask(t, task) + return uintptr(n), nil, nil + case linux.F_OWNER_PID: + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) + if tg == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerThreadGroup(t, tg) + return uintptr(n), nil, nil + case linux.F_OWNER_PGRP: + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) + if pg == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerProcessGroup(t, pg) + return uintptr(n), nil, nil + default: + return 0, nil, syserror.EINVAL + } + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file.Dirent.Inode) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.Flags().Write { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) + return 0, nil, err + case linux.F_GETPIPE_SZ: + sz, ok := file.FileOperations.(fs.FifoSizer) + if !ok { + return 0, nil, syserror.EINVAL + } + size, err := sz.FifoSize(t, file) + return uintptr(size), nil, err + case linux.F_SETPIPE_SZ: + sz, ok := file.FileOperations.(fs.FifoSizer) + if !ok { + return 0, nil, syserror.EINVAL + } + n, err := sz.SetFifoSize(int64(args[2].Int())) + return uintptr(n), nil, err + default: + // Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } +} + +// Fadvise64 implements linux syscall fadvise64(2). +// This implementation currently ignores the provided advice. +func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[2].Int64() + advice := args[3].Int() + + // Note: offset is allowed to be negative. + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // If the FD refers to a pipe or FIFO, return error. + if fs.IsPipe(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ESPIPE + } + + switch advice { + case linux.POSIX_FADV_NORMAL: + case linux.POSIX_FADV_RANDOM: + case linux.POSIX_FADV_SEQUENTIAL: + case linux.POSIX_FADV_WILLNEED: + case linux.POSIX_FADV_DONTNEED: + case linux.POSIX_FADV_NOREUSE: + default: + return 0, nil, syserror.EINVAL + } + + // Sure, whatever. + return 0, nil, nil +} + +func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does this directory exist already? + remainingTraversals := uint(linux.MaxSymlinkTraversals) + f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) + switch err { + case nil: + // The directory existed. + defer f.DecRef() + return syserror.EEXIST + case syserror.EACCES: + // Permission denied while walking to the directory. + return err + default: + // Do we have write permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Create the directory. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + return d.CreateDirectory(t, root, name, perms) + } + }) +} + +// Mkdir implements linux syscall mkdir(2). +func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + + return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + addr := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + + return 0, nil, mkdirAt(t, dirFD, addr, mode) +} + +func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + // Special case: removing the root always returns EBUSY. + if path == "/" { + return syserror.EBUSY + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Linux returns different ernos when the path ends in single + // dot vs. double dots. + switch name { + case ".": + return syserror.EINVAL + case "..": + return syserror.ENOTEMPTY + } + + if err := d.MayDelete(t, root, name); err != nil { + return err + } + + return d.RemoveDirectory(t, root, name) + }) +} + +// Rmdir implements linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) +} + +func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error { + newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + // The oldPath is copied in verbatim. This is because the symlink + // will include all details, including trailing slashes. + oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX) + if err != nil { + return err + } + if oldPath == "" { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return d.CreateLink(t, root, oldPath, name) + }) +} + +// Symlink implements linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + newAddr := args[1].Pointer() + + return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr) +} + +// Symlinkat implements linux syscall symlinkat(2). +func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + dirFD := args[1].Int() + newAddr := args[2].Pointer() + + return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr) +} + +// mayLinkAt determines whether t can create a hard link to target. +// +// This corresponds to Linux's fs/namei.c:may_linkat. +func mayLinkAt(t *kernel.Task, target *fs.Inode) error { + // Linux will impose the following restrictions on hard links only if + // sysctl_protected_hardlinks is enabled. The kernel disables this + // setting by default for backward compatibility (see commit + // 561ec64ae67e), but also recommends that distributions enable it (and + // Debian does: + // https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098). + // + // gVisor currently behaves as though sysctl_protected_hardlinks is + // always enabled, and thus imposes the following restrictions on hard + // links. + + if target.CheckOwnership(t) { + // fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER) + // can hardlink all they like." + return nil + } + + // If we are not the owner, then the file must be regular and have + // Read+Write permissions. + if !fs.IsRegular(target.StableAttr) { + return syserror.EPERM + } + if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { + return syserror.EPERM + } + + return nil +} + +// linkAt creates a hard link to the target specified by oldDirFD and oldAddr, +// specified by newDirFD and newAddr. If resolve is true, then the symlinks +// will be followed when evaluating the target. +func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error { + oldPath, _, err := copyInPath(t, oldAddr, allowEmpty) + if err != nil { + return err + } + newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + if allowEmpty && oldPath == "" { + target := t.GetFile(oldDirFD) + if target == nil { + return syserror.EBADF + } + defer target.DecRef() + if err := mayLinkAt(t, target.Dirent.Inode); err != nil { + return err + } + + // Resolve the target directory. + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return newParent.CreateHardLink(t, root, target.Dirent, newName) + }) + } + + // Resolve oldDirFD and oldAddr to a dirent. The "resolve" argument + // only applies to this name. + return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent, _ uint) error { + if err := mayLinkAt(t, target.Inode); err != nil { + return err + } + + // Next resolve newDirFD and newAddr to the parent dirent and name. + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return newParent.CreateHardLink(t, root, target, newName) + }) + }) +} + +// Link implements linux syscall link(2). +func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + newAddr := args[1].Pointer() + + // man link(2): + // POSIX.1-2001 says that link() should dereference oldpath if it is a + // symbolic link. However, since kernel 2.0, Linux does not do so: if + // oldpath is a symbolic link, then newpath is created as a (hard) link + // to the same symbolic link file (i.e., newpath becomes a symbolic + // link to the same file that oldpath refers to). + resolve := false + return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */) +} + +// Linkat implements linux syscall linkat(2). +func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldDirFD := args[0].Int() + oldAddr := args[1].Pointer() + newDirFD := args[2].Int() + newAddr := args[3].Pointer() + + // man linkat(2): + // By default, linkat(), does not dereference oldpath if it is a + // symbolic link (like link(2)). Since Linux 2.6.18, the flag + // AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be + // dereferenced if it is a symbolic link. + flags := args[4].Int() + + // Sanity check flags. + if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 { + return 0, nil, syserror.EINVAL + } + + resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW + allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH + + if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) { + return 0, nil, syserror.ENOENT + } + + return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) +} + +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + +func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + if dirPath { + return 0, syserror.ENOENT + } + + err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // Check for Read permission. + if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil { + return err + } + + s, err := d.Inode.Readlink(t) + if err == syserror.ENOLINK { + return syserror.EINVAL + } + if err != nil { + return err + } + + buffer := []byte(s) + if uint(len(buffer)) > size { + buffer = buffer[:size] + } + + n, err := t.CopyOutBytes(bufAddr, buffer) + + // Update frame return value. + copied = uintptr(n) + + return err + }) + return copied, err // Return frame value. +} + +// Readlink implements linux syscall readlink(2). +func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + bufAddr := args[1].Pointer() + size := args[2].SizeT() + + n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size) + return n, nil, err +} + +// Readlinkat implements linux syscall readlinkat(2). +func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + addr := args[1].Pointer() + bufAddr := args[2].Pointer() + size := args[3].SizeT() + + n, err := readlinkAt(t, dirFD, addr, bufAddr, size) + return n, nil, err +} + +// LINT.ThenChange(vfs2/stat.go) + +// LINT.IfChange + +func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + if err := d.MayDelete(t, root, name); err != nil { + return err + } + + return d.Remove(t, root, name, dirPath) + }) +} + +// Unlink implements linux syscall unlink(2). +func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr) +} + +// Unlinkat implements linux syscall unlinkat(2). +func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + addr := args[1].Pointer() + flags := args[2].Uint() + if flags&linux.AT_REMOVEDIR != 0 { + return 0, nil, rmdirAt(t, dirFD, addr) + } + return 0, nil, unlinkAt(t, dirFD, addr) +} + +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + +// Truncate implements linux syscall truncate(2). +func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + if dirPath { + return 0, nil, syserror.EINVAL + } + + if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + if fs.IsDir(d.Inode.StableAttr) { + return syserror.EISDIR + } + // In contrast to open(O_TRUNC), truncate(2) is only valid for file + // types. + if !fs.IsFile(d.Inode.StableAttr) { + return syserror.EINVAL + } + + // Reject truncation if the access permissions do not allow truncation. + // This is different from the behavior of sys_ftruncate, see below. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { + return err + } + + if err := d.Inode.Truncate(t, d, length); err != nil { + return err + } + + // File length modified, generate notification. + d.InotifyEvent(linux.IN_MODIFY, 0) + + return nil + }) +} + +// Ftruncate implements linux syscall ftruncate(2). +func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[1].Int64() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Reject truncation if the file flags do not permit this operation. + // This is different from truncate(2) above. + if !file.Flags().Write { + return 0, nil, syserror.EINVAL + } + + // In contrast to open(O_TRUNC), truncate(2) is only valid for file + // types. Note that this is different from truncate(2) above, where a + // directory returns EISDIR. + if !fs.IsFile(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil { + return 0, nil, err + } + + // File length modified, generate notification. + file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + + return 0, nil, nil +} + +// LINT.ThenChange(vfs2/setstat.go) + +// Umask implements linux syscall umask(2). +func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mask := args[0].ModeT() + mask = t.FSContext().SwapUmask(mask & 0777) + return uintptr(mask), nil, nil +} + +// LINT.IfChange + +// Change ownership of a file. +// +// uid and gid may be -1, in which case they will not be changed. +func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { + owner := fs.FileOwner{ + UID: auth.NoID, + GID: auth.NoID, + } + + uattr, err := d.Inode.UnstableAttr(t) + if err != nil { + return err + } + c := t.Credentials() + hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN) + isOwner := uattr.Owner.UID == c.EffectiveKUID + if uid.Ok() { + kuid := c.UserNamespace.MapToKUID(uid) + // Valid UID must be supplied if UID is to be changed. + if !kuid.Ok() { + return syserror.EINVAL + } + + // "Only a privileged process (CAP_CHOWN) may change the owner + // of a file." -chown(2) + // + // Linux also allows chown if you own the file and are + // explicitly not changing its UID. + isNoop := uattr.Owner.UID == kuid + if !(hasCap || (isOwner && isNoop)) { + return syserror.EPERM + } + + owner.UID = kuid + } + if gid.Ok() { + kgid := c.UserNamespace.MapToKGID(gid) + // Valid GID must be supplied if GID is to be changed. + if !kgid.Ok() { + return syserror.EINVAL + } + + // "The owner of a file may change the group of the file to any + // group of which that owner is a member. A privileged process + // (CAP_CHOWN) may change the group arbitrarily." -chown(2) + isNoop := uattr.Owner.GID == kgid + isMemberGroup := c.InGroup(kgid) + if !(hasCap || (isOwner && (isNoop || isMemberGroup))) { + return syserror.EPERM + } + + owner.GID = kgid + } + + // FIXME(b/62949101): This is racy; the inode's owner may have changed in + // the meantime. (Linux holds i_mutex while calling + // fs/attr.c:notify_change() => inode_operations::setattr => + // inode_change_ok().) + if err := d.Inode.SetOwner(t, d, owner); err != nil { + return err + } + + // When the owner or group are changed by an unprivileged user, + // chown(2) also clears the set-user-ID and set-group-ID bits, but + // we do not support them. + return nil +} + +func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { + path, _, err := copyInPath(t, addr, allowEmpty) + if err != nil { + return err + } + + if path == "" { + // Annoying. What's wrong with fchown? + file := t.GetFile(fd) + if file == nil { + return syserror.EBADF + } + defer file.DecRef() + + return chown(t, file.Dirent, uid, gid) + } + + return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return chown(t, d, uid, gid) + }) +} + +// Chown implements linux syscall chown(2). +func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid) +} + +// Lchown implements linux syscall lchown(2). +func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid) +} + +// Fchown implements linux syscall fchown(2). +func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, chown(t, file.Dirent, uid, gid) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + addr := args[1].Pointer() + uid := auth.UID(args[2].Uint()) + gid := auth.GID(args[3].Uint()) + flags := args[4].Int() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid) +} + +func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { + // Must own file to change mode. + if !d.Inode.CheckOwnership(t) { + return syserror.EPERM + } + + p := fs.FilePermsFromMode(mode) + if !d.Inode.SetPermissions(t, d, p) { + return syserror.EPERM + } + + // File attribute changed, generate notification. + d.InotifyEvent(linux.IN_ATTRIB, 0) + + return nil +} + +func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return chmod(t, d, mode) + }) +} + +// Chmod implements linux syscall chmod(2). +func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + + return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode) +} + +// Fchmod implements linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := linux.FileMode(args[1].ModeT()) + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, chmod(t, file.Dirent, mode) +} + +// Fchmodat implements linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + + return 0, nil, chmodAt(t, fd, addr, mode) +} + +// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime +// to the system time. +func defaultSetToSystemTimeSpec() fs.TimeSpec { + return fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + } +} + +func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { + setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // Does the task own the file? + if !d.Inode.CheckOwnership(t) { + // Trying to set a specific time? Must be owner. + if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) { + return syserror.EPERM + } + + // Trying to set to current system time? Must have write access. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { + return err + } + } + + if err := d.Inode.SetTimestamps(t, d, ts); err != nil { + return err + } + + // File attribute changed, generate notification. + d.InotifyEvent(linux.IN_ATTRIB, 0) + return nil + } + + // From utimes.c: + // "If filename is NULL and dfd refers to an open file, then operate on + // the file. Otherwise look up filename, possibly using dfd as a + // starting point." + if addr == 0 && dirFD != linux.AT_FDCWD { + if !resolve { + // Linux returns EINVAL in this case. See utimes.c. + return syserror.EINVAL + } + f := t.GetFile(dirFD) + if f == nil { + return syserror.EBADF + } + defer f.DecRef() + + root := t.FSContext().RootDirectory() + defer root.DecRef() + + return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals) + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpOn(t, dirFD, path, resolve, setTimestamp) +} + +// Utime implements linux syscall utime(2). +func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times linux.Utime + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + ts = fs.TimeSpec{ + ATime: ktime.FromSeconds(times.Actime), + MTime: ktime.FromSeconds(times.Modtime), + } + } + return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) +} + +// Utimes implements linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + ts = fs.TimeSpec{ + ATime: ktime.FromTimeval(times[0]), + MTime: ktime.FromTimeval(times[1]), + } + } + return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) +} + +// timespecIsValid checks that the timespec is valid for use in utimensat. +func timespecIsValid(ts linux.Timespec) bool { + // Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9. + return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9 +} + +// Utimensat implements linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + pathnameAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timespec + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { + return 0, nil, syserror.EINVAL + } + + // If both are UTIME_OMIT, this is a noop. + if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT { + return 0, nil, nil + } + + ts = fs.TimeSpec{ + ATime: ktime.FromTimespec(times[0]), + ATimeOmit: times[0].Nsec == linux.UTIME_OMIT, + ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW, + MTime: ktime.FromTimespec(times[1]), + MTimeOmit: times[1].Nsec == linux.UTIME_OMIT, + MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW, + } + } + return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0) +} + +// Futimesat implements linux syscall futimesat(2). +func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + pathnameAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + if times[0].Usec >= 1e6 || times[0].Usec < 0 || + times[1].Usec >= 1e6 || times[1].Usec < 0 { + return 0, nil, syserror.EINVAL + } + + ts = fs.TimeSpec{ + ATime: ktime.FromTimeval(times[0]), + MTime: ktime.FromTimeval(times[1]), + } + } + return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) +} + +// LINT.ThenChange(vfs2/setstat.go) + +// LINT.IfChange + +func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { + newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error { + if !fs.IsDir(oldParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Rename rejects paths that end in ".", "..", or empty (i.e. + // the root) with EBUSY. + switch oldName { + case "", ".", "..": + return syserror.EBUSY + } + + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Rename rejects paths that end in ".", "..", or empty + // (i.e. the root) with EBUSY. + switch newName { + case "", ".", "..": + return syserror.EBUSY + } + + return fs.Rename(t, root, oldParent, oldName, newParent, newName) + }) + }) +} + +// Rename implements linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldPathAddr := args[0].Pointer() + newPathAddr := args[1].Pointer() + return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr) +} + +// Renameat implements linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldDirFD := args[0].Int() + oldPathAddr := args[1].Pointer() + newDirFD := args[2].Int() + newPathAddr := args[3].Pointer() + return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) +} + +// LINT.ThenChange(vfs2/filesystem.go) + +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Int64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + if mode != 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOTSUP + } + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + if fs.IsPipe(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ESPIPE + } + if fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EISDIR + } + if !fs.IsRegular(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ENODEV + } + size := offset + length + if size < 0 { + return 0, nil, syserror.EFBIG + } + if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil { + return 0, nil, err + } + + // File length modified, generate notification. + file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + + return 0, nil, nil +} + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + operation := args[1].Int() + + file := t.GetFile(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + // A BSD style lock spans the entire file. + rng := lock.LockRange{ + Start: 0, + End: lock.LockEOF, + } + + switch operation { + case linux.LOCK_EX: + if nonblocking { + // Since we're nonblocking we pass a nil lock.Blocker implementation. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) { + return 0, nil, syserror.EWOULDBLOCK + } + } else { + // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + case linux.LOCK_SH: + if nonblocking { + // Since we're nonblocking we pass a nil lock.Blocker implementation. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) { + return 0, nil, syserror.EWOULDBLOCK + } + } else { + // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + case linux.LOCK_UN: + file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng) + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} + +const ( + memfdPrefix = "/memfd:" + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1 +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. + return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix)) + if err != nil { + return 0, nil, err + } + if len(name) > memfdMaxNameLen { + return 0, nil, syserror.EINVAL + } + name = memfdPrefix + name + + inode := tmpfs.NewMemfdInode(t, allowSeals) + dirent := fs.NewDirent(t, inode, name) + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true}) + if err != nil { + return 0, nil, err + } + + defer dirent.DecRef() + defer file.DecRef() + + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(newFD), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go new file mode 100644 index 000000000..b68261f72 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -0,0 +1,288 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// futexWaitRestartBlock encapsulates the state required to restart futex(2) +// via restart_syscall(2). +// +// +stateify savable +type futexWaitRestartBlock struct { + duration time.Duration + + // addr stored as uint64 since uintptr is not save-able. + addr uint64 + private bool + val uint32 + mask uint32 +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return futexWaitDuration(t, f.duration, false, usermem.Addr(f.addr), f.private, f.val, f.mask) +} + +// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is +// complete. +// +// The wait blocks forever if forever is true, otherwise it blocks until ts. +// +// If blocking is interrupted, the syscall is restarted with the original +// arguments. +func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) { + w := t.FutexWaiter() + err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) + if err != nil { + return 0, err + } + + if forever { + err = t.Block(w.C) + } else if clockRealtime { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } else { + err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) + } + + t.Futex().WaitComplete(w) + return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is +// complete. +// +// The wait blocks forever if forever is true, otherwise is blocks for +// duration. +// +// If blocking is interrupted, forever determines how to restart the +// syscall. If forever is true, the syscall is restarted with the original +// arguments. If forever is false, duration is a relative timeout and the +// syscall is restarted with the remaining timeout. +func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) { + w := t.FutexWaiter() + err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) + if err != nil { + return 0, err + } + + remaining, err := t.BlockWithTimeout(w.C, !forever, duration) + t.Futex().WaitComplete(w) + if err == nil { + return 0, nil + } + + // The wait was unsuccessful for some reason other than interruption. Simply + // forward the error. + if err != syserror.ErrInterrupted { + return 0, err + } + + // The wait was interrupted and we need to restart. Decide how. + + // The wait duration was absolute, restart with the original arguments. + if forever { + return 0, kernel.ERESTARTSYS + } + + // The wait duration was relative, restart with the remaining duration. + t.SetSyscallRestartBlock(&futexWaitRestartBlock{ + duration: remaining, + addr: uint64(addr), + private: private, + val: val, + mask: mask, + }) + return 0, kernel.ERESTART_RESTARTBLOCK +} + +func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) + if err != nil { + return err + } + if locked { + // Futex acquired, we're done! + return nil + } + + if forever { + err = t.Block(w.C) + } else { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } + + t.Futex().WaitComplete(w) + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) + if err != nil { + return err + } + if !locked { + return syserror.EWOULDBLOCK + } + return nil +} + +// Futex implements linux syscall futex(2). +// It provides a method for a program to wait for a value at a given address to +// change, and a method to wake up anyone waiting on a particular address. +func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + futexOp := args[1].Int() + val := int(args[2].Int()) + nreq := int(args[3].Int()) + timeout := args[3].Pointer() + naddr := args[4].Pointer() + val3 := args[5].Int() + + cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) + private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 + clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME + mask := uint32(val3) + + switch cmd { + case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: + // WAIT{_BITSET} wait forever if the timeout isn't passed. + forever := (timeout == 0) + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + + switch cmd { + case linux.FUTEX_WAIT: + // WAIT uses a relative timeout. + mask = ^uint32(0) + var timeoutDur time.Duration + if !forever { + timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond + } + n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask) + return n, nil, err + + case linux.FUTEX_WAIT_BITSET: + // WAIT_BITSET uses an absolute timeout which is either + // CLOCK_MONOTONIC or CLOCK_REALTIME. + if mask == 0 { + return 0, nil, syserror.EINVAL + } + n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) + return n, nil, err + default: + panic("unreachable") + } + + case linux.FUTEX_WAKE: + mask = ^uint32(0) + fallthrough + + case linux.FUTEX_WAKE_BITSET: + if mask == 0 { + return 0, nil, syserror.EINVAL + } + if val <= 0 { + // The Linux kernel wakes one waiter even if val is + // non-positive. + val = 1 + } + n, err := t.Futex().Wake(t, addr, private, mask, val) + return uintptr(n), nil, err + + case linux.FUTEX_REQUEUE: + n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq) + return uintptr(n), nil, err + + case linux.FUTEX_CMP_REQUEUE: + // 'val3' contains the value to be checked at 'addr' and + // 'val' is the number of waiters that should be woken up. + nval := uint32(val3) + n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq) + return uintptr(n), nil, err + + case linux.FUTEX_WAKE_OP: + op := uint32(val3) + if val <= 0 { + // The Linux kernel wakes one waiter even if val is + // non-positive. + val = 1 + } + n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) + return uintptr(n), nil, err + + case linux.FUTEX_LOCK_PI: + forever := (timeout == 0) + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + err := futexLockPI(t, timespec, forever, addr, private) + return 0, nil, err + + case linux.FUTEX_TRYLOCK_PI: + err := tryLockPI(t, addr, private) + return 0, nil, err + + case linux.FUTEX_UNLOCK_PI: + err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) + return 0, nil, err + + case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + + default: + // We don't even know about this command. + return 0, nil, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go new file mode 100644 index 000000000..b126fecc0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -0,0 +1,250 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChange + +// Getdents implements linux syscall getdents(2) for 64bit systems. +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := int(args[2].Uint()) + + minSize := int(smallestDirent(t.Arch())) + if size < minSize { + // size is smaller than smallest possible dirent. + return 0, nil, syserror.EINVAL + } + + n, err := getdents(t, fd, addr, size, (*dirent).Serialize) + return n, nil, err +} + +// Getdents64 implements linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := int(args[2].Uint()) + + minSize := int(smallestDirent64(t.Arch())) + if size < minSize { + // size is smaller than smallest possible dirent. + return 0, nil, syserror.EINVAL + } + + n, err := getdents(t, fd, addr, size, (*dirent).Serialize64) + return n, nil, err +} + +// getdents implements the core of getdents(2)/getdents64(2). +// f is the syscall implementation dirent serialization function. +func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { + dir := t.GetFile(fd) + if dir == nil { + return 0, syserror.EBADF + } + defer dir.DecRef() + + w := &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: addr, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + + ds := newDirentSerializer(f, w, t.Arch(), size) + rerr := dir.Readdir(t, ds) + + switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { + case nil: + dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + return uintptr(ds.Written()), nil + case io.EOF: + return 0, nil + default: + return 0, err + } +} + +// oldDirentHdr is a fixed sized header matching the fixed size +// fields found in the old linux dirent struct. +type oldDirentHdr struct { + Ino uint64 + Off uint64 + Reclen uint16 +} + +// direntHdr is a fixed sized header matching the fixed size +// fields found in the new linux dirent struct. +type direntHdr struct { + OldHdr oldDirentHdr + Typ uint8 +} + +// dirent contains the data pointed to by a new linux dirent struct. +type dirent struct { + Hdr direntHdr + Name []byte +} + +// newDirent returns a dirent from an fs.InodeOperationsInfo. +func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent { + d := &dirent{ + Hdr: direntHdr{ + OldHdr: oldDirentHdr{ + Ino: attr.InodeID, + Off: offset, + }, + Typ: fs.ToDirentType(attr.Type), + }, + Name: []byte(name), + } + d.Hdr.OldHdr.Reclen = d.padRec(int(width)) + return d +} + +// smallestDirent returns the size of the smallest possible dirent using +// the old linux dirent format. +func smallestDirent(a arch.Context) uint { + d := dirent{} + return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1 +} + +// smallestDirent64 returns the size of the smallest possible dirent using +// the new linux dirent format. +func smallestDirent64(a arch.Context) uint { + d := dirent{} + return uint(binary.Size(d.Hdr)) + a.Width() +} + +// padRec pads the name field until the rec length is a multiple of the width, +// which must be a power of 2. It returns the padded rec length. +func (d *dirent) padRec(width int) uint16 { + a := int(binary.Size(d.Hdr)) + len(d.Name) + r := (a + width) &^ (width - 1) + padding := r - a + d.Name = append(d.Name, make([]byte, padding)...) + return uint16(r) +} + +// Serialize64 serializes a Dirent struct to a byte slice, keeping the new +// linux dirent format. Returns the number of bytes serialized or an error. +func (d *dirent) Serialize64(w io.Writer) (int, error) { + n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr)) + if err != nil { + return 0, err + } + n2, err := w.Write(d.Name) + if err != nil { + return 0, err + } + return n1 + n2, nil +} + +// Serialize serializes a Dirent struct to a byte slice, using the old linux +// dirent format. +// Returns the number of bytes serialized or an error. +func (d *dirent) Serialize(w io.Writer) (int, error) { + n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr)) + if err != nil { + return 0, err + } + n2, err := w.Write(d.Name) + if err != nil { + return 0, err + } + n3, err := w.Write([]byte{d.Hdr.Typ}) + if err != nil { + return 0, err + } + return n1 + n2 + n3, nil +} + +// direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an +// io.Writer. +type direntSerializer struct { + serialize func(*dirent, io.Writer) (int, error) + w io.Writer + // width is the arch native value width. + width uint + // offset is the current dirent offset. + offset uint64 + // written is the total bytes serialized. + written int + // size is the size of the buffer to serialize into. + size int +} + +func newDirentSerializer(f func(d *dirent, w io.Writer) (int, error), w io.Writer, ac arch.Context, size int) *direntSerializer { + return &direntSerializer{ + serialize: f, + w: w, + width: ac.Width(), + size: size, + } +} + +// CopyOut implements fs.InodeOperationsInfoSerializer.CopyOut. +// It serializes and writes the fs.DentAttr to the direntSerializer io.Writer. +func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error { + ds.offset++ + + d := newDirent(ds.width, name, attr, ds.offset) + + // Serialize dirent into a temp buffer. + var b bytes.Buffer + n, err := ds.serialize(d, &b) + if err != nil { + ds.offset-- + return err + } + + // Check that we have enough room remaining to write the dirent. + if n > (ds.size - ds.written) { + ds.offset-- + return io.EOF + } + + // Write out the temp buffer. + if _, err := b.WriteTo(ds.w); err != nil { + ds.offset-- + return err + } + + ds.written += n + return nil +} + +// Written returns the total number of bytes written. +func (ds *direntSerializer) Written() int { + return ds.written +} + +// LINT.ThenChange(vfs2/getdents.go) diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go new file mode 100644 index 000000000..715ac45e6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -0,0 +1,180 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + // As NGROUPS_MAX in include/uapi/linux/limits.h. + maxNGroups = 65536 +) + +// Getuid implements the Linux syscall getuid. +func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() + return uintptr(ruid), nil, nil +} + +// Geteuid implements the Linux syscall geteuid. +func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() + return uintptr(euid), nil, nil +} + +// Getresuid implements the Linux syscall getresuid. +func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruidAddr := args[0].Pointer() + euidAddr := args[1].Pointer() + suidAddr := args[2].Pointer() + c := t.Credentials() + ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() + euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() + suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() + if _, err := t.CopyOut(ruidAddr, ruid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(euidAddr, euid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(suidAddr, suid); err != nil { + return 0, nil, err + } + return 0, nil, nil +} + +// Getgid implements the Linux syscall getgid. +func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() + return uintptr(rgid), nil, nil +} + +// Getegid implements the Linux syscall getegid. +func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() + return uintptr(egid), nil, nil +} + +// Getresgid implements the Linux syscall getresgid. +func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgidAddr := args[0].Pointer() + egidAddr := args[1].Pointer() + sgidAddr := args[2].Pointer() + c := t.Credentials() + rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() + egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() + sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() + if _, err := t.CopyOut(rgidAddr, rgid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(egidAddr, egid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(sgidAddr, sgid); err != nil { + return 0, nil, err + } + return 0, nil, nil +} + +// Setuid implements the Linux syscall setuid. +func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + uid := auth.UID(args[0].Int()) + return 0, nil, t.SetUID(uid) +} + +// Setreuid implements the Linux syscall setreuid. +func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruid := auth.UID(args[0].Int()) + euid := auth.UID(args[1].Int()) + return 0, nil, t.SetREUID(ruid, euid) +} + +// Setresuid implements the Linux syscall setreuid. +func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruid := auth.UID(args[0].Int()) + euid := auth.UID(args[1].Int()) + suid := auth.UID(args[2].Int()) + return 0, nil, t.SetRESUID(ruid, euid, suid) +} + +// Setgid implements the Linux syscall setgid. +func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + gid := auth.GID(args[0].Int()) + return 0, nil, t.SetGID(gid) +} + +// Setregid implements the Linux syscall setregid. +func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgid := auth.GID(args[0].Int()) + egid := auth.GID(args[1].Int()) + return 0, nil, t.SetREGID(rgid, egid) +} + +// Setresgid implements the Linux syscall setregid. +func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgid := auth.GID(args[0].Int()) + egid := auth.GID(args[1].Int()) + sgid := auth.GID(args[2].Int()) + return 0, nil, t.SetRESGID(rgid, egid, sgid) +} + +// Getgroups implements the Linux syscall getgroups. +func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := int(args[0].Int()) + if size < 0 { + return 0, nil, syserror.EINVAL + } + kgids := t.Credentials().ExtraKGIDs + // "If size is zero, list is not modified, but the total number of + // supplementary group IDs for the process is returned." - getgroups(2) + if size == 0 { + return uintptr(len(kgids)), nil, nil + } + if size < len(kgids) { + return 0, nil, syserror.EINVAL + } + gids := make([]auth.GID, len(kgids)) + for i, kgid := range kgids { + gids[i] = kgid.In(t.UserNamespace()).OrOverflow() + } + if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil { + return 0, nil, err + } + return uintptr(len(gids)), nil, nil +} + +// Setgroups implements the Linux syscall setgroups. +func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + if size < 0 || size > maxNGroups { + return 0, nil, syserror.EINVAL + } + if size == 0 { + return 0, nil, t.SetExtraGIDs(nil) + } + gids := make([]auth.GID, size) + if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil { + return 0, nil, err + } + return 0, nil, t.SetExtraGIDs(gids) +} diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go new file mode 100644 index 000000000..b2c7b3444 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -0,0 +1,133 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) + +// InotifyInit1 implements the inotify_init1() syscalls. +func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + + if flags&^allFlags != 0 { + return 0, nil, syserror.EINVAL + } + + dirent := fs.NewDirent(t, anon.NewInode(t), "inotify") + fileFlags := fs.FileFlags{ + Read: true, + Write: true, + NonBlocking: flags&linux.IN_NONBLOCK != 0, + } + n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) + defer n.DecRef() + + fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ + CloseOnExec: flags&linux.IN_CLOEXEC != 0, + }) + + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// InotifyInit implements the inotify_init() syscalls. +func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[0].Value = 0 + return InotifyInit1(t, args) +} + +// fdToInotify resolves an fd to an inotify object. If successful, the file will +// have an extra ref and the caller is responsible for releasing the ref. +func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { + file := t.GetFile(fd) + if file == nil { + // Invalid fd. + return nil, nil, syserror.EBADF + } + + ino, ok := file.FileOperations.(*fs.Inotify) + if !ok { + // Not an inotify fd. + file.DecRef() + return nil, nil, syserror.EINVAL + } + + return ino, file, nil +} + +// InotifyAddWatch implements the inotify_add_watch() syscall. +func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + mask := args[2].Uint() + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." + // -- inotify(7) + resolve := mask&linux.IN_DONT_FOLLOW == 0 + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + return 0, nil, syserror.EINVAL + } + + ino, file, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error { + // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) + if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Copy out to the return frame. + fd = ino.AddWatch(dirent, mask) + + return nil + }) + return uintptr(fd), nil, err // Return from the existing value. +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + wd := args[1].Int() + + ino, file, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go new file mode 100644 index 000000000..3f7691eae --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -0,0 +1,58 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// LINT.IfChange + +// Lseek implements linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + whence := args[2].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var sw fs.SeekWhence + switch whence { + case 0: + sw = fs.SeekSet + case 1: + sw = fs.SeekCurrent + case 2: + sw = fs.SeekEnd + default: + return 0, nil, syserror.EINVAL + } + + offset, serr := file.Seek(t, sw, offset) + err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file) + if err != nil { + return 0, nil, err + } + return uintptr(offset), nil, err +} + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go new file mode 100644 index 000000000..9b4a5c3f1 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -0,0 +1,312 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// We unconditionally report a single NUMA node. This also means that our +// "nodemask_t" is a single unsigned long (uint64). +const ( + maxNodes = 1 + allowedNodemask = (1 << maxNodes) - 1 +) + +func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) { + // "nodemask points to a bit mask of node IDs that contains up to maxnode + // bits. The bit mask size is rounded to the next multiple of + // sizeof(unsigned long), but the kernel will use bits only up to maxnode. + // A NULL value of nodemask or a maxnode value of zero specifies the empty + // set of nodes. If the value of maxnode is zero, the nodemask argument is + // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate + // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses + // maxnode-1, not maxnode, as the number of bits. + bits := maxnode - 1 + if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0 + return 0, syserror.EINVAL + } + if bits == 0 { + return 0, nil + } + // Copy in the whole nodemask. + numUint64 := (bits + 63) / 64 + buf := t.CopyScratchBuffer(int(numUint64) * 8) + if _, err := t.CopyInBytes(addr, buf); err != nil { + return 0, err + } + val := usermem.ByteOrder.Uint64(buf) + // Check that only allowed bits in the first unsigned long in the nodemask + // are set. + if val&^allowedNodemask != 0 { + return 0, syserror.EINVAL + } + // Check that all remaining bits in the nodemask are 0. + for i := 8; i < len(buf); i++ { + if buf[i] != 0 { + return 0, syserror.EINVAL + } + } + return val, nil +} + +func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error { + // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of + // bits. + bits := maxnode - 1 + if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0 + return syserror.EINVAL + } + if bits == 0 { + return nil + } + // Copy out the first unsigned long in the nodemask. + buf := t.CopyScratchBuffer(8) + usermem.ByteOrder.PutUint64(buf, val) + if _, err := t.CopyOutBytes(addr, buf); err != nil { + return err + } + // Zero out remaining unsigned longs in the nodemask. + if bits > 64 { + remAddr, ok := addr.AddLength(8) + if !ok { + return syserror.EFAULT + } + remUint64 := (bits - 1) / 64 + if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return err + } + } + return nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { + return 0, nil, syserror.EINVAL + } + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + + // "EINVAL: The value specified by maxnode is less than the number of node + // IDs supported by the system." - get_mempolicy(2) + if nodemask != 0 && maxnode < maxNodes { + return 0, nil, syserror.EINVAL + } + + // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is + // ignored and the set of nodes (memories) that the thread is allowed to + // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the + // absence of any mode flags) is returned in nodemask." + if memsAllowed { + // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either + // MPOL_F_ADDR or MPOL_F_NODE." + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { + return 0, nil, err + } + return 0, nil, nil + } + + // "If flags specifies MPOL_F_ADDR, then information is returned about the + // policy governing the memory address given in addr. ... If the mode + // argument is not NULL, then get_mempolicy() will store the policy mode + // and any optional mode flags of the requested NUMA policy in the location + // pointed to by this argument. If nodemask is not NULL, then the nodemask + // associated with the policy will be stored in the location pointed to by + // this argument." + if addrFlag { + policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr) + if err != nil { + return 0, nil, err + } + if nodeFlag { + // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR, + // get_mempolicy() will return the node ID of the node on which the + // address addr is allocated into the location pointed to by mode. + // If no page has yet been allocated for the specified address, + // get_mempolicy() will allocate a page as if the thread had + // performed a read (load) access to that address, and return the + // ID of the node where that page was allocated." + buf := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, buf) + if err != nil { + return 0, nil, err + } + policy = linux.MPOL_DEFAULT // maxNodes == 1 + } + if mode != 0 { + if _, err := policy.CopyOut(t, mode); err != nil { + return 0, nil, err + } + } + if nodemask != 0 { + if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil + } + + // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did + // not specify MPOL_F_ADDR and addr is not NULL." This is partially + // inaccurate: if flags specifies MPOL_F_ADDR, + // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will + // just (usually) fail to find a VMA at address 0 and return EFAULT. + if addr != 0 { + return 0, nil, syserror.EINVAL + } + + // "If flags is specified as 0, then information about the calling thread's + // default policy (as set by set_mempolicy(2)) is returned, in the buffers + // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but + // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE, + // then get_mempolicy() will return in the location pointed to by a + // non-NULL mode argument, the node ID of the next node that will be used + // for interleaving of internal kernel pages allocated on behalf of the + // thread." + policy, nodemaskVal := t.NumaPolicy() + if nodeFlag { + if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { + return 0, nil, syserror.EINVAL + } + policy = linux.MPOL_DEFAULT // maxNodes == 1 + } + if mode != 0 { + if _, err := policy.CopyOut(t, mode); err != nil { + return 0, nil, err + } + } + if nodemask != 0 { + if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := linux.NumaPolicy(args[0].Int()) + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode) + if err != nil { + return 0, nil, err + } + + t.SetNumaPolicy(modeWithFlags, nodemaskVal) + return 0, nil, nil +} + +// Mbind implements the syscall mbind(2). +func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Uint64() + mode := linux.NumaPolicy(args[2].Int()) + nodemask := args[3].Pointer() + maxnode := args[4].Uint() + flags := args[5].Uint() + + if flags&^linux.MPOL_MF_VALID != 0 { + return 0, nil, syserror.EINVAL + } + // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be + // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) + if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { + return 0, nil, syserror.EPERM + } + + mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) + if err != nil { + return 0, nil, err + } + + // Since we claim to have only a single node, all flags can be ignored + // (since all pages must already be on that single node). + err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal) + return 0, nil, err +} + +func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) { + flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS) + mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS) + if flags == linux.MPOL_MODE_FLAGS { + // Can't specify both mode flags simultaneously. + return 0, 0, syserror.EINVAL + } + if mode < 0 || mode >= linux.MPOL_MAX { + // Must specify a valid mode. + return 0, 0, syserror.EINVAL + } + + var nodemaskVal uint64 + if nodemask != 0 { + var err error + nodemaskVal, err = copyInNodemask(t, nodemask, maxnode) + if err != nil { + return 0, 0, err + } + } + + switch mode { + case linux.MPOL_DEFAULT: + // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; + // Linux allows a nodemask to be specified, as long as it is empty. + if nodemaskVal != 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: + // These require a non-empty nodemask. + if nodemaskVal == 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_PREFERRED: + // This permits an empty nodemask, as long as no flags are set. + if nodemaskVal == 0 && flags != 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_LOCAL: + // This requires an empty nodemask and no flags set ... + if nodemaskVal != 0 || flags != 0 { + return 0, 0, syserror.EINVAL + } + // ... and is implemented as MPOL_PREFERRED. + mode = linux.MPOL_PREFERRED + default: + // Unknown mode, which we should have rejected above. + panic(fmt.Sprintf("unknown mode: %v", mode)) + } + + return mode | flags, nodemaskVal, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go new file mode 100644 index 000000000..91694d374 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -0,0 +1,332 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Brk implements linux syscall brk(2). +func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) + // "However, the actual Linux system call returns the new program break on + // success. On failure, the system call returns the current break." - + // brk(2) + return uintptr(addr), nil, nil +} + +// LINT.IfChange + +// Mmap implements linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := args[4].Int() + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + flags := file.Flags() + // mmap unconditionally requires that the FD is readable. + if !flags.Read { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !flags.Write { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} + +// LINT.ThenChange(vfs2/mmap.go) + +// Munmap implements linux syscall munmap(2). +func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) +} + +// Mremap implements linux syscall mremap(2). +func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + oldSize := args[1].Uint64() + newSize := args[2].Uint64() + flags := args[3].Uint64() + newAddr := args[4].Pointer() + + if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { + return 0, nil, syserror.EINVAL + } + mayMove := flags&linux.MREMAP_MAYMOVE != 0 + fixed := flags&linux.MREMAP_FIXED != 0 + var moveMode mm.MRemapMoveMode + switch { + case !mayMove && !fixed: + moveMode = mm.MRemapNoMove + case mayMove && !fixed: + moveMode = mm.MRemapMayMove + case mayMove && fixed: + moveMode = mm.MRemapMustMove + case !mayMove && fixed: + // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be + // specified." - mremap(2) + return 0, nil, syserror.EINVAL + } + + rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ + Move: moveMode, + NewAddr: newAddr, + }) + return uintptr(rv), nil, err +} + +// Mprotect implements linux syscall mprotect(2). +func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + length := args[1].Uint64() + prot := args[2].Int() + err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, linux.PROT_GROWSDOWN&prot != 0) + return 0, nil, err +} + +// Madvise implements linux syscall madvise(2). +func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := uint64(args[1].SizeT()) + adv := args[2].Int() + + // "The Linux implementation requires that the address addr be + // page-aligned, and allows length to be zero." - madvise(2) + if addr.RoundDown() != addr { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + // Not explicitly stated: length need not be page-aligned. + lenAddr, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.EINVAL + } + length = uint64(lenAddr) + + switch adv { + case linux.MADV_DONTNEED: + return 0, nil, t.MemoryManager().Decommit(addr, length) + case linux.MADV_DOFORK: + return 0, nil, t.MemoryManager().SetDontFork(addr, length, false) + case linux.MADV_DONTFORK: + return 0, nil, t.MemoryManager().SetDontFork(addr, length, true) + case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: + fallthrough + case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: + fallthrough + case linux.MADV_DONTDUMP, linux.MADV_DODUMP: + // TODO(b/72045799): Core dumping isn't implemented, so these are + // no-ops. + fallthrough + case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: + // Do nothing, we totally ignore the suggestions above. + return 0, nil, nil + case linux.MADV_REMOVE: + // These "suggestions" have application-visible side effects, so we + // have to indicate that we don't support them. + return 0, nil, syserror.ENOSYS + case linux.MADV_HWPOISON: + // Only privileged processes are allowed to poison pages. + return 0, nil, syserror.EPERM + default: + // If adv is not a valid value tell the caller. + return 0, nil, syserror.EINVAL + } +} + +// Mincore implements the syscall mincore(2). +func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + vec := args[2].Pointer() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + // "The length argument need not be a multiple of the page size, but since + // residency information is returned for whole pages, length is effectively + // rounded up to the next multiple of the page size." - mincore(2) + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + + // Pretend that all mapped pages are "resident in core". + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + // "ENOMEM: addr to addr + length contained unmapped memory." + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) + _, err := t.CopyOut(vec, resident) + return 0, nil, err +} + +// Msync implements Linux syscall msync(2). +func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, + // and may additionally include the MS_INVALIDATE bit. ... However, Linux + // permits a call to msync() that specifies neither of these flags, with + // semantics that are (currently) equivalent to specifying MS_ASYNC." - + // msync(2) + if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { + return 0, nil, syserror.EINVAL + } + sync := flags&linux.MS_SYNC != 0 + if sync && flags&linux.MS_ASYNC != 0 { + return 0, nil, syserror.EINVAL + } + err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ + Sync: sync, + Invalidate: flags&linux.MS_INVALIDATE != 0, + }) + // MSync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Mlock implements linux syscall mlock(2). +func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) +} + +// Mlock2 implements linux syscall mlock2(2). +func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if flags&^(linux.MLOCK_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL + } + + mode := memmap.MLockEager + if flags&linux.MLOCK_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) +} + +// Munlock implements linux syscall munlock(2). +func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) +} + +// Mlockall implements linux syscall mlockall(2). +func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + + if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL + } + + mode := memmap.MLockEager + if flags&linux.MCL_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: flags&linux.MCL_CURRENT != 0, + Future: flags&linux.MCL_FUTURE != 0, + Mode: mode, + }) +} + +// Munlockall implements linux syscall munlockall(2). +func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: true, + Future: true, + Mode: memmap.MLockNone, + }) +} diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go new file mode 100644 index 000000000..eb5ff48f5 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -0,0 +1,154 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mount implements Linux syscall mount(2). +func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + sourcePath, _, err := copyInPath(t, sourceAddr, true /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + targetPath, _, err := copyInPath(t, targetAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + data := "" + if dataAddr != 0 { + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err = t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODEV | + linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + rsys, ok := fs.FindFilesystem(fsType) + if !ok { + return 0, nil, syserror.ENODEV + } + if !rsys.AllowUserMount() { + return 0, nil, syserror.EPERM + } + + var superFlags fs.MountSourceFlags + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + superFlags.NoAtime = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + superFlags.ReadOnly = true + } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + superFlags.NoExec = true + } + + rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) + if err != nil { + return 0, nil, syserror.EINVAL + } + + if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + // Mount will take a reference on rootInode if successful. + return t.MountNamespace().Mount(t, d, rootInode) + }); err != nil { + // Something went wrong. Drop our ref on rootInode before + // returning the error. + rootInode.DecRef() + return 0, nil, err + } + + return 0, nil, nil +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + // + // Currently, this is always the init task's user namespace. + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { + return 0, nil, syserror.EPERM + } + + resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW + detachOnly := flags&linux.MNT_DETACH == linux.MNT_DETACH + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return t.MountNamespace().Unmount(t, d, detachOnly) + }) +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go new file mode 100644 index 000000000..43c510930 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -0,0 +1,77 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChange + +// pipe2 implements the actual system call with flags. +func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { + if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { + return 0, syserror.EINVAL + } + r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) + + r.SetFlags(linuxToFlags(flags).Settable()) + defer r.DecRef() + + w.SetFlags(linuxToFlags(flags).Settable()) + defer w.DecRef() + + fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, err + } + + if _, err := t.CopyOut(addr, fds); err != nil { + for _, fd := range fds { + if file, _ := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return 0, err + } + return 0, nil +} + +// Pipe implements linux syscall pipe(2). +func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + n, err := pipe2(t, addr, 0) + return n, nil, err +} + +// Pipe2 implements linux syscall pipe2(2). +func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := uint(args[1].Uint()) + + n, err := pipe2(t, addr, flags) + return n, nil, err +} + +// LINT.ThenChange(vfs2/pipe.go) diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go new file mode 100644 index 000000000..f0198141c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -0,0 +1,545 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = linux.POLLOUT | linux.POLLERR + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. + selectExceptEvents = linux.POLLPRI +) + +// pollState tracks the associated file descriptor and waiter of a PollFD. +type pollState struct { + file *fs.File + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.GetFile(pfd.FD) + if file == nil { + pfd.REvents = linux.POLLNVAL + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) + } + + r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) + pfd.REvents = int16(r.ToLinux()) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. + state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + initReadiness(t, &pfd[i], &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + forever := timeout < 0 + + for n == 0 { + var err error + // Wait for a notification. + timeout, err = t.BlockWithTimeout(ch, !forever, timeout) + if err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) + rl := int16(r.ToLinux()) & pfd[i].Events + if rl != 0 { + pfd[i].REvents = rl + n++ + } + } + } + + return timeout, n, nil +} + +// CopyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. +func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return nil, syserror.EINVAL + } + + pfd := make([]linux.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(addr, &pfd); err != nil { + return nil, err + } + } + + return pfd, nil +} + +func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + pfd, err := CopyInPollFDs(t, addr, nfds) + if err != nil { + return timeout, 0, err + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) + for i := range pfd { + pfd[i].Events |= linux.POLLHUP | linux.POLLERR + } + remainingTimeout, n, err := pollBlock(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(addr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || nfds > fileCap { + return 0, syserror.EINVAL + } + + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < nBytes; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. + pfd := make([]linux.PollFD, 0, fdCount) + var fd int32 + for i := 0; i < nBytes; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.GetFile(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + var mask int16 + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, linux.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + if _, _, err = pollBlock(t, pfd, timeout); err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. + bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. + if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + return copyTimespecOut(t, timespecAddr, &tsRemaining) +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + return copyTimevalOut(t, timevalAddr, &tvRemaining) +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +// +// +stateify savable +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskAddr != 0 { + mask, err := CopyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timevalAddr != 0 { + timeval, err := copyTimevalIn(t, timevalAddr) + if err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + maskAddr, size, err := copyInSigSetWithSize(t, maskWithSizeAddr) + if err != nil { + return 0, nil, err + } + + if maskAddr != 0 { + mask, err := CopyInSigSet(t, maskAddr, size) + if err != nil { + return 0, nil, err + } + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go new file mode 100644 index 000000000..f92bf8096 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -0,0 +1,228 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Prctl implements linux syscall prctl(2). +// It has a list of subfunctions which operate on the process. The arguments are +// all based on each subfunction. +func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + option := args[0].Int() + + switch option { + case linux.PR_SET_PDEATHSIG: + sig := linux.Signal(args[1].Int()) + if sig != 0 && !sig.IsValid() { + return 0, nil, syserror.EINVAL + } + t.SetParentDeathSignal(sig) + return 0, nil, nil + + case linux.PR_GET_PDEATHSIG: + _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) + return 0, nil, err + + case linux.PR_GET_DUMPABLE: + d := t.MemoryManager().Dumpability() + switch d { + case mm.NotDumpable: + return linux.SUID_DUMP_DISABLE, nil, nil + case mm.UserDumpable: + return linux.SUID_DUMP_USER, nil, nil + case mm.RootDumpable: + return linux.SUID_DUMP_ROOT, nil, nil + default: + panic(fmt.Sprintf("Unknown dumpability %v", d)) + } + + case linux.PR_SET_DUMPABLE: + var d mm.Dumpability + switch args[1].Int() { + case linux.SUID_DUMP_DISABLE: + d = mm.NotDumpable + case linux.SUID_DUMP_USER: + d = mm.UserDumpable + default: + // N.B. Userspace may not pass SUID_DUMP_ROOT. + return 0, nil, syserror.EINVAL + } + t.MemoryManager().SetDumpability(d) + return 0, nil, nil + + case linux.PR_GET_KEEPCAPS: + if t.Credentials().KeepCaps { + return 1, nil, nil + } + + return 0, nil, nil + + case linux.PR_SET_KEEPCAPS: + val := args[1].Int() + // prctl(2): arg2 must be either 0 (permitted capabilities are cleared) + // or 1 (permitted capabilities are kept). + if val == 0 { + t.SetKeepCaps(false) + } else if val == 1 { + t.SetKeepCaps(true) + } else { + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil + + case linux.PR_SET_NAME: + addr := args[1].Pointer() + name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) + if err != nil && err != syserror.ENAMETOOLONG { + return 0, nil, err + } + t.SetName(name) + + case linux.PR_GET_NAME: + addr := args[1].Pointer() + buf := t.CopyScratchBuffer(linux.TASK_COMM_LEN) + len := copy(buf, t.Name()) + if len < linux.TASK_COMM_LEN { + buf[len] = 0 + len++ + } + _, err := t.CopyOut(addr, buf[:len]) + if err != nil { + return 0, nil, err + } + + case linux.PR_SET_MM: + if !t.HasCapability(linux.CAP_SYS_RESOURCE) { + return 0, nil, syserror.EPERM + } + + switch args[1].Int() { + case linux.PR_SET_MM_EXE_FILE: + fd := args[2].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // They trying to set exe to a non-file? + if !fs.IsFile(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EBADF + } + + // Set the underlying executable. + t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) + + case linux.PR_SET_MM_AUXV, + linux.PR_SET_MM_START_CODE, + linux.PR_SET_MM_END_CODE, + linux.PR_SET_MM_START_DATA, + linux.PR_SET_MM_END_DATA, + linux.PR_SET_MM_START_STACK, + linux.PR_SET_MM_START_BRK, + linux.PR_SET_MM_BRK, + linux.PR_SET_MM_ARG_START, + linux.PR_SET_MM_ARG_END, + linux.PR_SET_MM_ENV_START, + linux.PR_SET_MM_ENV_END: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough + default: + return 0, nil, syserror.EINVAL + } + + case linux.PR_SET_NO_NEW_PRIVS: + if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { + return 0, nil, syserror.EINVAL + } + // PR_SET_NO_NEW_PRIVS is assumed to always be set. + // See kernel.Task.updateCredsForExecLocked. + return 0, nil, nil + + case linux.PR_GET_NO_NEW_PRIVS: + if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { + return 0, nil, syserror.EINVAL + } + return 1, nil, nil + + case linux.PR_SET_SECCOMP: + if args[1].Int() != linux.SECCOMP_MODE_FILTER { + // Unsupported mode. + return 0, nil, syserror.EINVAL + } + + return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) + + case linux.PR_GET_SECCOMP: + return uintptr(t.SeccompMode()), nil, nil + + case linux.PR_CAPBSET_READ: + cp := linux.Capability(args[1].Uint64()) + if !cp.Ok() { + return 0, nil, syserror.EINVAL + } + var rv uintptr + if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { + rv = 1 + } + return rv, nil, nil + + case linux.PR_CAPBSET_DROP: + cp := linux.Capability(args[1].Uint64()) + if !cp.Ok() { + return 0, nil, syserror.EINVAL + } + return 0, nil, t.DropBoundingCapability(cp) + + case linux.PR_GET_TIMING, + linux.PR_SET_TIMING, + linux.PR_GET_TSC, + linux.PR_SET_TSC, + linux.PR_TASK_PERF_EVENTS_DISABLE, + linux.PR_TASK_PERF_EVENTS_ENABLE, + linux.PR_GET_TIMERSLACK, + linux.PR_SET_TIMERSLACK, + linux.PR_MCE_KILL, + linux.PR_MCE_KILL_GET, + linux.PR_GET_TID_ADDRESS, + linux.PR_SET_CHILD_SUBREAPER, + linux.PR_GET_CHILD_SUBREAPER, + linux.PR_GET_THP_DISABLE, + linux.PR_SET_THP_DISABLE, + linux.PR_MPX_ENABLE_MANAGEMENT, + linux.PR_MPX_DISABLE_MANAGEMENT: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough + default: + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go new file mode 100644 index 000000000..c0aa0fd60 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -0,0 +1,92 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + "math" + + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + _GRND_NONBLOCK = 0x1 + _GRND_RANDOM = 0x2 +) + +// GetRandom implements the linux syscall getrandom(2). +// +// In a multi-tenant/shared environment, the only valid implementation is to +// fetch data from the urandom pool, otherwise starvation attacks become +// possible. The urandom pool is also expected to have plenty of entropy, thus +// the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as +// the pool will already be initialized. +func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + // Flags are checked for validity but otherwise ignored. See above. + if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { + return 0, nil, syserror.EINVAL + } + + if length > math.MaxInt32 { + length = math.MaxInt32 + } + ar, ok := addr.ToRange(uint64(length)) + if !ok { + return 0, nil, syserror.EFAULT + } + + // "If the urandom source has been initialized, reads of up to 256 bytes + // will always return as many bytes as requested and will not be + // interrupted by signals. No such guarantees apply for larger buffer + // sizes." - getrandom(2) + min := int(length) + if min > 256 { + min = 256 + } + n, err := t.MemoryManager().CopyOutFrom(t, usermem.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if n >= int64(min) { + return uintptr(n), nil, nil + } + return 0, nil, err +} + +// randReader is a io.Reader that handles partial reads from rand.Reader. +type randReader struct { + done int + min int +} + +// Read implements io.Reader.Read. +func (r *randReader) Read(dst []byte) (int, error) { + if r.done >= r.min { + return rand.Reader.Read(dst) + } + min := r.min - r.done + if min > len(dst) { + min = len(dst) + } + return io.ReadAtLeast(rand.Reader, dst, min) +} diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go new file mode 100644 index 000000000..071b4bacc --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -0,0 +1,394 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +const ( + // EventMaskRead contains events that can be triggered on reads. + EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr +) + +// Read implements linux syscall read(2). Note that we try to get a buffer that +// is exactly the size requested because some applications like qemu expect +// they can do large reads all at once. Bug for bug. Same for other read +// calls below. +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Readahead implements readahead(2). +func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + size := args[2].SizeT() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is valid. + if int(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Return EINVAL; if the underlying file type does not support readahead, + // then Linux will return EINVAL to indicate as much. In the future, we + // may extend this function to actually support readahead hints. + return 0, nil, syserror.EINVAL +} + +// Pread64 implements linux syscall pread64(2). +func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Readv implements linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +// Preadv implements linux syscall preadv(2). +func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +// Preadv2 implements linux syscall preadv2(2). +func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the syscall is + // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the linux internal call + // (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1248) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 5th argument. + + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := int(args[5].Int()) + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if offset > -1 && !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check flags field. + // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is + // accepted as a valid flag argument for preadv2. + if flags&^linux.RWF_VALID != 0 { + return 0, nil, syserror.EOPNOTSUPP + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + // If preadv2 is called with an offset of -1, readv is called. + if offset == -1 { + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) +} + +func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { + n, err := f.Readv(t, dst) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + return n, err + } + + // Sockets support read timeouts. + var haveDeadline bool + var deadline ktime.Time + if s, ok := f.FileOperations.(socket.Socket); ok { + dl := s.RecvTimeout() + if dl < 0 && err == syserror.ErrWouldBlock { + return n, err + } + if dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst64(n) + + // Issue the request and break out if it completes with anything + // other than "would block". + n, err = f.Readv(t, dst) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + + return total, err +} + +func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + n, err := f.Preadv(t, dst, offset) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst64(n) + + // Issue the request and break out if it completes with anything + // other than "would block". + n, err = f.Preadv(t, dst, offset+total) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + + return total, err +} + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go new file mode 100644 index 000000000..d5d5b6959 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -0,0 +1,224 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// rlimit describes an implementation of 'struct rlimit', which may vary from +// system-to-system. +type rlimit interface { + // toLimit converts an rlimit to a limits.Limit. + toLimit() *limits.Limit + + // fromLimit converts a limits.Limit to an rlimit. + fromLimit(lim limits.Limit) + + // copyIn copies an rlimit from the untrusted app to the kernel. + copyIn(t *kernel.Task, addr usermem.Addr) error + + // copyOut copies an rlimit from the kernel to the untrusted app. + copyOut(t *kernel.Task, addr usermem.Addr) error +} + +// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. +func newRlimit(t *kernel.Task) (rlimit, error) { + switch t.Arch().Width() { + case 8: + // On 64-bit system, struct rlimit and struct rlimit64 are identical. + return &rlimit64{}, nil + default: + return nil, syserror.ENOSYS + } +} + +type rlimit64 struct { + Cur uint64 + Max uint64 +} + +func (r *rlimit64) toLimit() *limits.Limit { + return &limits.Limit{ + Cur: limits.FromLinux(r.Cur), + Max: limits.FromLinux(r.Max), + } +} + +func (r *rlimit64) fromLimit(lim limits.Limit) { + *r = rlimit64{ + Cur: limits.ToLinux(lim.Cur), + Max: limits.ToLinux(lim.Max), + } +} + +func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error { + _, err := t.CopyIn(addr, r) + return err +} + +func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error { + _, err := t.CopyOut(addr, *r) + return err +} + +func makeRlimit64(lim limits.Limit) *rlimit64 { + return &rlimit64{Cur: lim.Cur, Max: lim.Max} +} + +// setableLimits is the set of supported setable limits. +var setableLimits = map[limits.LimitType]struct{}{ + limits.NumberOfFiles: {}, + limits.AS: {}, + limits.CPU: {}, + limits.Data: {}, + limits.FileSize: {}, + limits.MemoryLocked: {}, + limits.Stack: {}, + // These are not enforced, but we include them here to avoid returning + // EPERM, since some apps expect them to succeed. + limits.Core: {}, + limits.ProcessCount: {}, +} + +func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) { + if newLim == nil { + return t.ThreadGroup().Limits().Get(resource), nil + } + + if _, ok := setableLimits[resource]; !ok { + return limits.Limit{}, syserror.EPERM + } + + // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE + // capability in the initial user namespace) may make arbitrary changes + // to either limit value." + privileged := t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.Kernel().RootUserNamespace()) + + oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim, privileged) + if err != nil { + return limits.Limit{}, err + } + + if resource == limits.CPU { + t.NotifyRlimitCPUUpdated() + } + return oldLim, nil +} + +// Getrlimit implements linux syscall getrlimit(2). +func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + resource, ok := limits.FromLinuxResource[int(args[0].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + addr := args[1].Pointer() + rlim, err := newRlimit(t) + if err != nil { + return 0, nil, err + } + lim, err := prlimit64(t, resource, nil) + if err != nil { + return 0, nil, err + } + rlim.fromLimit(lim) + return 0, nil, rlim.copyOut(t, addr) +} + +// Setrlimit implements linux syscall setrlimit(2). +func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + resource, ok := limits.FromLinuxResource[int(args[0].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + addr := args[1].Pointer() + rlim, err := newRlimit(t) + if err != nil { + return 0, nil, err + } + if err := rlim.copyIn(t, addr); err != nil { + return 0, nil, syserror.EFAULT + } + _, err = prlimit64(t, resource, rlim.toLimit()) + return 0, nil, err +} + +// Prlimit64 implements linux syscall prlimit64(2). +func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + resource, ok := limits.FromLinuxResource[int(args[1].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + newRlimAddr := args[2].Pointer() + oldRlimAddr := args[3].Pointer() + + var newLim *limits.Limit + if newRlimAddr != 0 { + var nrl rlimit64 + if err := nrl.copyIn(t, newRlimAddr); err != nil { + return 0, nil, syserror.EFAULT + } + newLim = nrl.toLimit() + } + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + ot := t + if tid > 0 { + if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // "To set or get the resources of a process other than itself, the caller + // must have the CAP_SYS_RESOURCE capability, or the real, effective, and + // saved set user IDs of the target process must match the real user ID of + // the caller and the real, effective, and saved set group IDs of the + // target process must match the real group ID of the caller." + if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) { + cred, tcred := t.Credentials(), ot.Credentials() + if cred.RealKUID != tcred.RealKUID || + cred.RealKUID != tcred.EffectiveKUID || + cred.RealKUID != tcred.SavedKUID || + cred.RealKGID != tcred.RealKGID || + cred.RealKGID != tcred.EffectiveKGID || + cred.RealKGID != tcred.SavedKGID { + return 0, nil, syserror.EPERM + } + } + + oldLim, err := prlimit64(ot, resource, newLim) + if err != nil { + return 0, nil, err + } + + if oldRlimAddr != 0 { + if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { + return 0, nil, syserror.EFAULT + } + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go new file mode 100644 index 000000000..90db10ea6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rseq.go @@ -0,0 +1,48 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// RSeq implements syscall rseq(2). +func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Uint() + flags := args[2].Int() + signature := args[3].Uint() + + if !t.RSeqAvailable() { + // Event for applications that want rseq on a configuration + // that doesn't support them. + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + switch flags { + case 0: + // Register. + return 0, nil, t.SetRSeq(addr, length, signature) + case linux.RSEQ_FLAG_UNREGISTER: + return 0, nil, t.ClearRSeq(addr, length, signature) + default: + // Unknown flag. + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go new file mode 100644 index 000000000..1674c7445 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -0,0 +1,112 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" +) + +func getrusage(t *kernel.Task, which int32) linux.Rusage { + var cs usage.CPUStats + + switch which { + case linux.RUSAGE_SELF: + cs = t.ThreadGroup().CPUStats() + + case linux.RUSAGE_CHILDREN: + cs = t.ThreadGroup().JoinedChildCPUStats() + + case linux.RUSAGE_THREAD: + cs = t.CPUStats() + + case linux.RUSAGE_BOTH: + tg := t.ThreadGroup() + cs = tg.CPUStats() + cs.Accumulate(tg.JoinedChildCPUStats()) + } + + return linux.Rusage{ + UTime: linux.NsecToTimeval(cs.UserTime.Nanoseconds()), + STime: linux.NsecToTimeval(cs.SysTime.Nanoseconds()), + NVCSw: int64(cs.VoluntarySwitches), + MaxRSS: int64(t.MaxRSS(which) / 1024), + } +} + +// Getrusage implements linux syscall getrusage(2). +// marked "y" are supported now +// marked "*" are not used on Linux +// marked "p" are pending for support +// +// y struct timeval ru_utime; /* user CPU time used */ +// y struct timeval ru_stime; /* system CPU time used */ +// p long ru_maxrss; /* maximum resident set size */ +// * long ru_ixrss; /* integral shared memory size */ +// * long ru_idrss; /* integral unshared data size */ +// * long ru_isrss; /* integral unshared stack size */ +// p long ru_minflt; /* page reclaims (soft page faults) */ +// p long ru_majflt; /* page faults (hard page faults) */ +// * long ru_nswap; /* swaps */ +// p long ru_inblock; /* block input operations */ +// p long ru_oublock; /* block output operations */ +// * long ru_msgsnd; /* IPC messages sent */ +// * long ru_msgrcv; /* IPC messages received */ +// * long ru_nsignals; /* signals received */ +// y long ru_nvcsw; /* voluntary context switches */ +// y long ru_nivcsw; /* involuntary context switches */ +func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + addr := args[1].Pointer() + + if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD { + return 0, nil, syserror.EINVAL + } + + ru := getrusage(t, which) + _, err := t.CopyOut(addr, &ru) + return 0, nil, err +} + +// Times implements linux syscall times(2). +func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + // Calculate the ticks first, and figure out if any additional work is + // necessary. Linux allows for a NULL addr, in which case only the + // return value is meaningful. We don't need to do anything else. + ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds()) + if addr == 0 { + return ticks, nil, nil + } + + cs1 := t.ThreadGroup().CPUStats() + cs2 := t.ThreadGroup().JoinedChildCPUStats() + r := linux.Tms{ + UTime: linux.ClockTFromDuration(cs1.UserTime), + STime: linux.ClockTFromDuration(cs1.SysTime), + CUTime: linux.ClockTFromDuration(cs2.UserTime), + CSTime: linux.ClockTFromDuration(cs2.SysTime), + } + if _, err := t.CopyOut(addr, &r); err != nil { + return 0, nil, err + } + + return ticks, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go new file mode 100644 index 000000000..99f6993f5 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -0,0 +1,99 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + onlyScheduler = linux.SCHED_NORMAL + onlyPriority = 0 +) + +// SchedParam replicates struct sched_param in sched.h. +type SchedParam struct { + schedPriority int64 +} + +// SchedGetparam implements linux syscall sched_getparam(2). +func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + param := args[1].Pointer() + if param == 0 { + return 0, nil, syserror.EINVAL + } + if pid < 0 { + return 0, nil, syserror.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syserror.ESRCH + } + r := SchedParam{schedPriority: onlyPriority} + if _, err := t.CopyOut(param, r); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} + +// SchedGetscheduler implements linux syscall sched_getscheduler(2). +func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + if pid < 0 { + return 0, nil, syserror.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syserror.ESRCH + } + return onlyScheduler, nil, nil +} + +// SchedSetscheduler implements linux syscall sched_setscheduler(2). +func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + policy := args[1].Int() + param := args[2].Pointer() + if pid < 0 { + return 0, nil, syserror.EINVAL + } + if policy != onlyScheduler { + return 0, nil, syserror.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syserror.ESRCH + } + var r SchedParam + if _, err := t.CopyIn(param, &r); err != nil { + return 0, nil, syserror.EINVAL + } + if r.schedPriority != onlyPriority { + return 0, nil, syserror.EINVAL + } + return 0, nil, nil +} + +// SchedGetPriorityMax implements linux syscall sched_get_priority_max(2). +func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return onlyPriority, nil, nil +} + +// SchedGetPriorityMin implements linux syscall sched_get_priority_min(2). +func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return onlyPriority, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go new file mode 100644 index 000000000..5b7a66f4d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -0,0 +1,76 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +type userSockFprog struct { + // Len is the length of the filter in BPF instructions. + Len uint16 + + _ [6]byte // padding for alignment + + // Filter is a user pointer to the struct sock_filter array that makes up + // the filter program. Filter is a uint64 rather than a usermem.Addr + // because usermem.Addr is actually uintptr, which is not a fixed-size + // type, and encoding/binary.Read objects to this. + Filter uint64 +} + +// seccomp applies a seccomp policy to the current task. +func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { + // We only support SECCOMP_SET_MODE_FILTER at the moment. + if mode != linux.SECCOMP_SET_MODE_FILTER { + // Unsupported mode. + return syserror.EINVAL + } + + tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 + + // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. + if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { + // Unsupported flag. + return syserror.EINVAL + } + + var fprog userSockFprog + if _, err := t.CopyIn(addr, &fprog); err != nil { + return err + } + filter := make([]linux.BPFInstruction, int(fprog.Len)) + if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + return err + } + compiledFilter, err := bpf.Compile(filter) + if err != nil { + t.Debugf("Invalid seccomp-bpf filter: %v", err) + return syserror.EINVAL + } + + return t.AppendSyscallFilter(compiledFilter, tsync) +} + +// Seccomp implements linux syscall seccomp(2). +func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer()) +} diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go new file mode 100644 index 000000000..5f54f2456 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -0,0 +1,241 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const opsMax = 500 // SEMOPM + +// Semget handles: semget(key_t key, int nsems, int semflg) +func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := args[0].Int() + nsems := args[1].Int() + flag := args[2].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + r := t.IPCNamespace().SemaphoreRegistry() + set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(set.ID), nil, nil +} + +// Semop handles: semop(int semid, struct sembuf *sops, size_t nsops) +func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + sembufAddr := args[1].Pointer() + nsops := args[2].SizeT() + + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, nil, syserror.EINVAL + } + if nsops <= 0 { + return 0, nil, syserror.EINVAL + } + if nsops > opsMax { + return 0, nil, syserror.E2BIG + } + + ops := make([]linux.Sembuf, nsops) + if _, err := t.CopyIn(sembufAddr, ops); err != nil { + return 0, nil, err + } + + creds := auth.CredentialsFromContext(t) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + for { + ch, num, err := set.ExecuteOps(t, ops, creds, int32(pid)) + if ch == nil || err != nil { + // We're done (either on success or a failure). + return 0, nil, err + } + if err = t.Block(ch); err != nil { + set.AbortWait(num, ch) + return 0, nil, err + } + } +} + +// Semctl handles: semctl(int semid, int semnum, int cmd, ...) +func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + num := args[1].Int() + cmd := args[2].Int() + + switch cmd { + case linux.SETVAL: + val := args[3].Int() + if val > math.MaxInt16 { + return 0, nil, syserror.ERANGE + } + return 0, nil, setVal(t, id, num, int16(val)) + + case linux.SETALL: + array := args[3].Pointer() + return 0, nil, setValAll(t, id, array) + + case linux.GETVAL: + v, err := getVal(t, id, num) + return uintptr(v), nil, err + + case linux.GETALL: + array := args[3].Pointer() + return 0, nil, getValAll(t, id, array) + + case linux.IPC_RMID: + return 0, nil, remove(t, id) + + case linux.IPC_SET: + arg := args[3].Pointer() + s := linux.SemidDS{} + if _, err := t.CopyIn(arg, &s); err != nil { + return 0, nil, err + } + + perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) + return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + + case linux.GETPID: + v, err := getPID(t, id, num) + return uintptr(v), nil, err + + case linux.IPC_INFO, + linux.SEM_INFO, + linux.IPC_STAT, + linux.SEM_STAT, + linux.SEM_STAT_ANY, + linux.GETNCNT, + linux.GETZCNT: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough + + default: + return 0, nil, syserror.EINVAL + } +} + +func remove(t *kernel.Task, id int32) error { + r := t.IPCNamespace().SemaphoreRegistry() + creds := auth.CredentialsFromContext(t) + return r.RemoveID(id, creds) +} + +func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + + creds := auth.CredentialsFromContext(t) + kuid := creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + owner := fs.FileOwner{UID: kuid, GID: kgid} + return set.Change(t, creds, owner, perms) +} + +func setVal(t *kernel.Task, id int32, num int32, val int16) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + return set.SetVal(t, num, val, creds, int32(pid)) +} + +func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + vals := make([]uint16, set.Size()) + if _, err := t.CopyIn(array, vals); err != nil { + return err + } + creds := auth.CredentialsFromContext(t) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + return set.SetValAll(t, vals, creds, int32(pid)) +} + +func getVal(t *kernel.Task, id int32, num int32) (int16, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + return set.GetVal(num, creds) +} + +func getValAll(t *kernel.Task, id int32, array usermem.Addr) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + vals, err := set.GetValAll(creds) + if err != nil { + return err + } + _, err = t.CopyOut(array, vals) + return err +} + +func getPID(t *kernel.Task, id int32, num int32) (int32, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + gpid, err := set.GetPID(num, creds) + if err != nil { + return 0, err + } + // Convert pid from init namespace to the caller's namespace. + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid)) + if tg == nil { + return 0, nil + } + return int32(tg.ID()), nil +} diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go new file mode 100644 index 000000000..4a8bc24a2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -0,0 +1,161 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/shm" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Shmget implements shmget(2). +func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := shm.Key(args[0].Int()) + size := uint64(args[1].SizeT()) + flag := args[2].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + pid := int32(t.ThreadGroup().ID()) + r := t.IPCNamespace().ShmRegistry() + segment, err := r.FindOrCreate(t, pid, key, size, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + defer segment.DecRef() + return uintptr(segment.ID), nil, nil +} + +// findSegment retrives a shm segment by the given id. +// +// findSegment returns a reference on Shm. +func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { + r := t.IPCNamespace().ShmRegistry() + segment := r.FindByID(id) + if segment == nil { + // No segment with provided id. + return nil, syserror.EINVAL + } + return segment, nil +} + +// Shmat implements shmat(2). +func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := shm.ID(args[0].Int()) + addr := args[1].Pointer() + flag := args[2].Int() + + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + defer segment.DecRef() + + opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ + Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, + Readonly: flag&linux.SHM_RDONLY == linux.SHM_RDONLY, + Remap: flag&linux.SHM_REMAP == linux.SHM_REMAP, + }) + if err != nil { + return 0, nil, err + } + addr, err = t.MemoryManager().MMap(t, opts) + return uintptr(addr), nil, err +} + +// Shmdt implements shmdt(2). +func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + err := t.MemoryManager().DetachShm(t, addr) + return 0, nil, err +} + +// Shmctl implements shmctl(2). +func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := shm.ID(args[0].Int()) + cmd := args[1].Int() + buf := args[2].Pointer() + + r := t.IPCNamespace().ShmRegistry() + + switch cmd { + case linux.SHM_STAT: + // Technically, we should be treating id as "an index into the kernel's + // internal array that maintains information about all shared memory + // segments on the system". Since we don't track segments in an array, + // we'll just pretend the shmid is the index and do the same thing as + // IPC_STAT. Linux also uses the index as the shmid. + fallthrough + case linux.IPC_STAT: + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + defer segment.DecRef() + + stat, err := segment.IPCStat(t) + if err == nil { + _, err = t.CopyOut(buf, stat) + } + return 0, nil, err + + case linux.IPC_INFO: + params := r.IPCInfo() + _, err := t.CopyOut(buf, params) + return 0, nil, err + + case linux.SHM_INFO: + info := r.ShmInfo() + _, err := t.CopyOut(buf, info) + return 0, nil, err + } + + // Remaining commands refer to a specific segment. + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + defer segment.DecRef() + + switch cmd { + case linux.IPC_SET: + var ds linux.ShmidDS + _, err = t.CopyIn(buf, &ds) + if err != nil { + return 0, nil, err + } + err = segment.Set(t, &ds) + return 0, nil, err + + case linux.IPC_RMID: + segment.MarkDestroyed() + return 0, nil, nil + + case linux.SHM_LOCK, linux.SHM_UNLOCK: + // We currently do not support memory locking anywhere. + // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the + // same here. + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, nil + + default: + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go new file mode 100644 index 000000000..d2b0012ae --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -0,0 +1,590 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// "For a process to have permission to send a signal it must +// - either be privileged (CAP_KILL), or +// - the real or effective user ID of the sending process must be equal to the +// real or saved set-user-ID of the target process. +// +// In the case of SIGCONT it suffices when the sending and receiving processes +// belong to the same session." - kill(2) +// +// Equivalent to kernel/signal.c:check_kill_permission. +func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool { + // kernel/signal.c:check_kill_permission also allows a signal if the + // sending and receiving tasks share a thread group, which is not + // mentioned in kill(2) since kill does not allow task-level + // granularity in signal sending. + if t.ThreadGroup() == target.ThreadGroup() { + return true + } + + if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) { + return true + } + + creds := t.Credentials() + tcreds := target.Credentials() + if creds.EffectiveKUID == tcreds.SavedKUID || + creds.EffectiveKUID == tcreds.RealKUID || + creds.RealKUID == tcreds.SavedKUID || + creds.RealKUID == tcreds.RealKUID { + return true + } + + if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() { + return true + } + return false +} + +// Kill implements linux syscall kill(2). +func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + + switch { + case pid > 0: + // "If pid is positive, then signal sig is sent to the process with the + // ID specified by pid." - kill(2) + // This loops to handle races with execve where target dies between + // TaskWithID and SendGroupSignal. Compare Linux's + // kernel/signal.c:kill_pid_info(). + for { + target := t.PIDNamespace().TaskWithID(pid) + if target == nil { + return 0, nil, syserror.ESRCH + } + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(target.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) + if err := target.SendGroupSignal(info); err != syserror.ESRCH { + return 0, nil, err + } + } + case pid == -1: + // "If pid equals -1, then sig is sent to every process for which the + // calling process has permission to send signals, except for process 1 + // (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig) + // send sig to all processes that the calling process may send signals + // to, except possibly for some implementation-defined system + // processes. Linux allows a process to signal itself, but on Linux the + // call kill(-1,sig) does not signal the calling process." + var ( + lastErr error + delivered int + ) + for _, tg := range t.PIDNamespace().ThreadGroups() { + if tg == t.ThreadGroup() { + continue + } + if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID { + continue + } + + // If pid == -1, the returned error is the last non-EPERM error + // from any call to group_send_sig_info. + if !mayKill(t, tg.Leader(), sig) { + continue + } + // Here and below, whether or not kill returns an error may + // depend on the iteration order. We at least implement the + // semantics documented by the man page: "On success (at least + // one signal was sent), zero is returned." + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + err := tg.SendSignal(info) + if err == syserror.ESRCH { + // ESRCH is ignored because it means the task + // exited while we were iterating. This is a + // race which would not normally exist on + // Linux, so we suppress it. + continue + } + delivered++ + if err != nil { + lastErr = err + } + } + if delivered > 0 { + return 0, nil, lastErr + } + return 0, nil, syserror.ESRCH + default: + // "If pid equals 0, then sig is sent to every process in the process + // group of the calling process." + // + // "If pid is less than -1, then sig is sent to every process + // in the process group whose ID is -pid." + pgid := kernel.ProcessGroupID(-pid) + if pgid == 0 { + pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) + } + + // If pid != -1 (i.e. signalling a process group), the returned error + // is the last error from any call to group_send_sig_info. + lastErr := syserror.ESRCH + for _, tg := range t.PIDNamespace().ThreadGroups() { + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { + if !mayKill(t, tg.Leader(), sig) { + lastErr = syserror.EPERM + continue + } + + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + // See note above regarding ESRCH race above. + if err := tg.SendSignal(info); err != syserror.ESRCH { + lastErr = err + } + } + } + + return 0, nil, lastErr + } +} + +func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoTkill, + } + info.SetPid(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} + +// Tkill implements linux syscall tkill(2). +func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tid <=0 by EINVAL. This isn't the same for all signal calls. + if tid <= 0 { + return 0, nil, syserror.EINVAL + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) +} + +// Tgkill implements linux syscall tgkill(2). +func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tgid := kernel.ThreadID(args[0].Int()) + tid := kernel.ThreadID(args[1].Int()) + sig := linux.Signal(args[2].Int()) + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. + if tgid <= 0 || tid <= 0 { + return 0, nil, syserror.EINVAL + } + + targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) + target := t.PIDNamespace().TaskWithID(tid) + if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { + return 0, nil, syserror.ESRCH + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) +} + +// RtSigaction implements linux syscall rt_sigaction(2). +func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sig := linux.Signal(args[0].Int()) + newactarg := args[1].Pointer() + oldactarg := args[2].Pointer() + sigsetsize := args[3].SizeT() + + if sigsetsize != linux.SignalSetSize { + return 0, nil, syserror.EINVAL + } + + var newactptr *arch.SignalAct + if newactarg != 0 { + newact, err := t.CopyInSignalAct(newactarg) + if err != nil { + return 0, nil, err + } + newactptr = &newact + } + oldact, err := t.ThreadGroup().SetSignalAct(sig, newactptr) + if err != nil { + return 0, nil, err + } + if oldactarg != 0 { + if err := t.CopyOutSignalAct(oldactarg, &oldact); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// Sigreturn implements linux syscall sigreturn(2). +func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ctrl, err := t.SignalReturn(false) + return 0, ctrl, err +} + +// RtSigreturn implements linux syscall rt_sigreturn(2). +func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ctrl, err := t.SignalReturn(true) + return 0, ctrl, err +} + +// RtSigprocmask implements linux syscall rt_sigprocmask(2). +func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + how := args[0].Int() + setaddr := args[1].Pointer() + oldaddr := args[2].Pointer() + sigsetsize := args[3].SizeT() + + if sigsetsize != linux.SignalSetSize { + return 0, nil, syserror.EINVAL + } + oldmask := t.SignalMask() + if setaddr != 0 { + mask, err := CopyInSigSet(t, setaddr, sigsetsize) + if err != nil { + return 0, nil, err + } + + switch how { + case linux.SIG_BLOCK: + t.SetSignalMask(oldmask | mask) + case linux.SIG_UNBLOCK: + t.SetSignalMask(oldmask &^ mask) + case linux.SIG_SETMASK: + t.SetSignalMask(mask) + default: + return 0, nil, syserror.EINVAL + } + } + if oldaddr != 0 { + return 0, nil, copyOutSigSet(t, oldaddr, oldmask) + } + + return 0, nil, nil +} + +// Sigaltstack implements linux syscall sigaltstack(2). +func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + setaddr := args[0].Pointer() + oldaddr := args[1].Pointer() + + alt := t.SignalStack() + if oldaddr != 0 { + if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil { + return 0, nil, err + } + } + if setaddr != 0 { + alt, err := t.CopyInSignalStack(setaddr) + if err != nil { + return 0, nil, err + } + // The signal stack cannot be changed if the task is currently + // on the stack. This is enforced at the lowest level because + // these semantics apply to changing the signal stack via a + // ucontext during a signal handler. + if !t.SetSignalStack(alt) { + return 0, nil, syserror.EPERM + } + } + + return 0, nil, nil +} + +// Pause implements linux syscall pause(2). +func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) +} + +// RtSigpending implements linux syscall rt_sigpending(2). +func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + pending := t.PendingSignals() + _, err := pending.CopyOut(t, addr) + return 0, nil, err +} + +// RtSigtimedwait implements linux syscall rt_sigtimedwait(2). +func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sigset := args[0].Pointer() + siginfo := args[1].Pointer() + timespec := args[2].Pointer() + sigsetsize := args[3].SizeT() + + mask, err := CopyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + var timeout time.Duration + if timespec != 0 { + d, err := copyTimespecIn(t, timespec) + if err != nil { + return 0, nil, err + } + if !d.Valid() { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(d.ToNsecCapped()) + } else { + timeout = time.Duration(math.MaxInt64) + } + + si, err := t.Sigtimedwait(mask, timeout) + if err != nil { + return 0, nil, err + } + + if siginfo != 0 { + si.FixSignalCodeForUser() + if _, err := si.CopyOut(t, siginfo); err != nil { + return 0, nil, err + } + } + return uintptr(si.Signo), nil, nil +} + +// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2). +func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + infoAddr := args[2].Pointer() + + // Copy in the info. + // + // We must ensure that the Signo is set (Linux overrides this in the + // same way), and that the code is in the allowed set. This same logic + // appears below in RtSigtgqueueinfo and should be kept in sync. + var info arch.SignalInfo + if _, err := info.CopyIn(t, infoAddr); err != nil { + return 0, nil, err + } + info.Signo = int32(sig) + + // This must loop to handle the race with execve described in Kill. + for { + // Deliver to the given task's thread group. + target := t.PIDNamespace().TaskWithID(pid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + // If the sender is not the receiver, it can't use si_codes used by the + // kernel or SI_TKILL. + if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + return 0, nil, syserror.EPERM + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + + if err := target.SendGroupSignal(&info); err != syserror.ESRCH { + return 0, nil, err + } + } +} + +// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2). +func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tgid := kernel.ThreadID(args[0].Int()) + tid := kernel.ThreadID(args[1].Int()) + sig := linux.Signal(args[2].Int()) + infoAddr := args[3].Pointer() + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. + if tgid <= 0 || tid <= 0 { + return 0, nil, syserror.EINVAL + } + + // Copy in the info. See RtSigqueueinfo above. + var info arch.SignalInfo + if _, err := info.CopyIn(t, infoAddr); err != nil { + return 0, nil, err + } + info.Signo = int32(sig) + + // Deliver to the given task. + targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) + target := t.PIDNamespace().TaskWithID(tid) + if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { + return 0, nil, syserror.ESRCH + } + + // If the sender is not the receiver, it can't use si_codes used by the + // kernel or SI_TKILL. + if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + return 0, nil, syserror.EPERM + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(&info) +} + +// RtSigsuspend implements linux syscall rt_sigsuspend(2). +func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sigset := args[0].Pointer() + + // Copy in the signal mask. + var mask linux.SignalSet + if _, err := mask.CopyIn(t, sigset); err != nil { + return 0, nil, err + } + mask &^= kernel.UnblockableSignals + + // Swap the mask. + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + + // Perform the wait. + return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) +} + +// RestartSyscall implements the linux syscall restart_syscall(2). +func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if r := t.SyscallRestartBlock(); r != nil { + n, err := r.Restart(t) + return n, nil, err + } + // The restart block should never be nil here, but it's possible + // ERESTART_RESTARTBLOCK was set by ptrace without the current syscall + // setting up a restart block. If ptrace didn't manipulate the return value, + // finding a nil restart block is a bug. Linux ensures that the restart + // function is never null by (re)initializing it with one that translates + // the restart into EINTR. We'll emulate that behaviour. + t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") + return 0, nil, syserror.EINTR +} + +// sharedSignalfd is shared between the two calls. +func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { + // Copy in the signal mask. + mask, err := CopyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + // Always check for valid flags, even if not creating. + if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Is this a change to an existing signalfd? + // + // The spec indicates that this should adjust the mask. + if fd != -1 { + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is this a signalfd? + if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { + s.SetMask(mask) + return 0, nil, nil + } + + // Not a signalfd. + return 0, nil, syserror.EINVAL + } + + // Create a new file. + file, err := signalfd.New(t, mask) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + // Set appropriate flags. + file.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&linux.SFD_NONBLOCK != 0, + }) + + // Create a new descriptor. + fd, err = t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.SFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + // Done. + return uintptr(fd), nil, nil +} + +// Signalfd implements the linux syscall signalfd(2). +func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + return sharedSignalfd(t, fd, sigset, sigsetsize, 0) +} + +// Signalfd4 implements the linux syscall signalfd4(2). +func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + flags := args[3].Int() + return sharedSignalfd(t, fd, sigset, sigsetsize, flags) +} diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go new file mode 100644 index 000000000..0760af77b --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -0,0 +1,1138 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChange + +// minListenBacklog is the minimum reasonable backlog for listening sockets. +const minListenBacklog = 8 + +// maxListenBacklog is the maximum allowed backlog for listening sockets. +const maxListenBacklog = 1024 + +// maxAddrLen is the maximum socket address length we're willing to accept. +const maxAddrLen = 200 + +// maxOptLen is the maximum sockopt parameter length we're willing to accept. +const maxOptLen = 1024 * 8 + +// maxControlLen is the maximum length of the msghdr.msg_control buffer we're +// willing to accept. Note that this limit is smaller than Linux, which allows +// buffers upto INT_MAX. +const maxControlLen = 10 * 1024 * 1024 + +// nameLenOffset is the offset from the start of the MessageHeader64 struct to +// the NameLen field. +const nameLenOffset = 8 + +// controlLenOffset is the offset form the start of the MessageHeader64 struct +// to the ControlLen field. +const controlLenOffset = 40 + +// flagsOffset is the offset form the start of the MessageHeader64 struct +// to the Flags field. +const flagsOffset = 48 + +const sizeOfInt32 = 4 + +// messageHeader64Len is the length of a MessageHeader64 struct. +var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) + +// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. +var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) + +// baseRecvFlags are the flags that are accepted across recvmsg(2), +// recvmmsg(2), and recvfrom(2). +const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC + +// MessageHeader64 is the 64-bit representation of the msghdr struct used in +// the recvmsg and sendmsg syscalls. +type MessageHeader64 struct { + // Name is the optional pointer to a network address buffer. + Name uint64 + + // NameLen is the length of the buffer pointed to by Name. + NameLen uint32 + _ uint32 + + // Iov is a pointer to an array of io vectors that describe the memory + // locations involved in the io operation. + Iov uint64 + + // IovLen is the length of the array pointed to by Iov. + IovLen uint64 + + // Control is the optional pointer to ancillary control data. + Control uint64 + + // ControlLen is the length of the data pointed to by Control. + ControlLen uint64 + + // Flags on the sent/received message. + Flags int32 + _ int32 +} + +// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in +// the recvmmsg and sendmmsg syscalls. +type multipleMessageHeader64 struct { + msgHdr MessageHeader64 + msgLen uint32 + _ int32 +} + +// CopyInMessageHeader64 copies a message header from user to kernel memory. +func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { + b := t.CopyScratchBuffer(52) + if _, err := t.CopyInBytes(addr, b); err != nil { + return err + } + + msg.Name = usermem.ByteOrder.Uint64(b[0:]) + msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) + msg.Iov = usermem.ByteOrder.Uint64(b[16:]) + msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) + msg.Control = usermem.ByteOrder.Uint64(b[32:]) + msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) + msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) + + return nil +} + +// CaptureAddress allocates memory for and copies a socket address structure +// from the untrusted address space range. +func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { + if addrlen > maxAddrLen { + return nil, syserror.EINVAL + } + + addrBuf := make([]byte, addrlen) + if _, err := t.CopyInBytes(addr, addrBuf); err != nil { + return nil, err + } + + return addrBuf, nil +} + +// writeAddress writes a sockaddr structure and its length to an output buffer +// in the unstrusted address space range. If the address is bigger than the +// buffer, it is truncated. +func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { + // Get the buffer length. + var bufLen uint32 + if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + return err + } + + if int32(bufLen) < 0 { + return syserror.EINVAL + } + + // Write the length unconditionally. + if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + return err + } + + if addr == nil { + return nil + } + + if bufLen > addrLen { + bufLen = addrLen + } + + // Copy as much of the address as will fit in the buffer. + encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + if bufLen > uint32(len(encodedAddr)) { + bufLen = uint32(len(encodedAddr)) + } + _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) + return err +} + +// Socket implements the linux syscall socket(2). +func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Create the new socket. + s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + s.SetFlags(fs.SettableFileFlags{ + NonBlocking: stype&linux.SOCK_NONBLOCK != 0, + }) + defer s.DecRef() + + fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// SocketPair implements the linux syscall socketpair(2). +func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + socks := args[3].Pointer() + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + fileFlags := fs.SettableFileFlags{ + NonBlocking: stype&linux.SOCK_NONBLOCK != 0, + } + + // Create the socket pair. + s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + s1.SetFlags(fileFlags) + s2.SetFlags(fileFlags) + defer s1.DecRef() + defer s2.DecRef() + + // Create the FDs for the sockets. + fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + // Copy the file descriptors out. + if _, err := t.CopyOut(socks, fds); err != nil { + for _, fd := range fds { + if file, _ := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return 0, nil, err + } + + return 0, nil, nil +} + +// Connect implements the linux syscall connect(2). +func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + blocking := !file.Flags().NonBlocking + return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS) +} + +// accept is the implementation of the accept syscall. It is called by accept +// and accept4 syscall handlers. +func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { + // Check that no unsupported flags are passed in. + if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, syserror.ENOTSOCK + } + + // Call the syscall implementation for this socket, then copy the + // output address if one is specified. + blocking := !file.Flags().NonBlocking + + peerRequested := addrLen != 0 + nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + if peerRequested { + // NOTE(magi): Linux does not give you an error if it can't + // write the data back out so neither do we. + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { + return 0, err + } + } + return uintptr(nfd), nil +} + +// Accept4 implements the linux syscall accept4(2). +func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + flags := int(args[3].Int()) + + n, err := accept(t, fd, addr, addrlen, flags) + return n, nil, err +} + +// Accept implements the linux syscall accept(2). +func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + n, err := accept(t, fd, addr, addrlen, 0) + return n, nil, err +} + +// Bind implements the linux syscall bind(2). +func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + return 0, nil, s.Bind(t, a).ToError() +} + +// Listen implements the linux syscall listen(2). +func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + backlog := args[1].Int() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Per Linux, the backlog is silently capped to reasonable values. + if backlog <= 0 { + backlog = minListenBacklog + } + if backlog > maxListenBacklog { + backlog = maxListenBacklog + } + + return 0, nil, s.Listen(t, int(backlog)).ToError() +} + +// Shutdown implements the linux syscall shutdown(2). +func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + how := args[1].Int() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Validate how, then call syscall implementation. + switch how { + case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: + default: + return 0, nil, syserror.EINVAL + } + + return 0, nil, s.Shutdown(t, int(how)).ToError() +} + +// GetSockOpt implements the linux syscall getsockopt(2). +func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLenAddr := args[4].Pointer() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Read the length. Reject negative values. + optLen := int32(0) + if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + return 0, nil, err + } + if optLen < 0 { + return 0, nil, syserror.EINVAL + } + + // Call syscall implementation then copy both value and value len out. + v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen)) + if e != nil { + return 0, nil, e.ToError() + } + + vLen := int32(binary.Size(v)) + if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + return 0, nil, err + } + + if v != nil { + if _, err := t.CopyOut(optValAddr, v); err != nil { + return 0, nil, err + } + } + + return 0, nil, nil +} + +// getSockOpt tries to handle common socket options, or dispatches to a specific +// socket implementation. +func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { + if level == linux.SOL_SOCKET { + switch name { + case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: + if len < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + } + + switch name { + case linux.SO_TYPE: + _, skType, _ := s.Type() + return int32(skType), nil + case linux.SO_DOMAIN: + family, _, _ := s.Type() + return int32(family), nil + case linux.SO_PROTOCOL: + _, _, protocol := s.Type() + return int32(protocol), nil + } + } + + return s.GetSockOpt(t, level, name, optValAddr, len) +} + +// SetSockOpt implements the linux syscall setsockopt(2). +// +// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. +func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLen := args[4].Int() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + if optLen < 0 { + return 0, nil, syserror.EINVAL + } + if optLen > maxOptLen { + return 0, nil, syserror.EINVAL + } + buf := t.CopyScratchBuffer(int(optLen)) + if _, err := t.CopyIn(optValAddr, &buf); err != nil { + return 0, nil, err + } + + // Call syscall implementation. + if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, nil +} + +// GetSockName implements the linux syscall getsockname(2). +func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Get the socket name and copy it to the caller. + v, vl, err := s.GetSockName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// GetPeerName implements the linux syscall getpeername(2). +func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Get the socket peer name and copy it to the caller. + v, vl, err := s.GetPeerName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// RecvMsg implements the linux syscall recvmsg(2). +func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syserror.EINVAL + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) + return n, nil, err +} + +// RecvMMsg implements the linux syscall recvmmsg(2). +func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + toPtr := args[4].Pointer() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if toPtr != 0 { + ts, err := copyTimespecIn(t, toPtr) + if err != nil { + return 0, nil, err + } + if !ts.Valid() { + return 0, nil, syserror.EINVAL + } + deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) + haveDeadline = true + } + + if !haveDeadline { + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + var n uintptr + if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { + break + } + + // Copy the received length to the caller. + lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { + // Capture the message header and io vectors. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syserror.EMSGSIZE + } + dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + // FIXME(b/63594852): Pretend we have an empty error queue. + if flags&linux.MSG_ERRQUEUE != 0 { + return 0, syserror.EAGAIN + } + + // Fast path when no control message nor name buffers are provided. + if msg.ControlLen == 0 && msg.NameLen == 0 { + n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + if err != nil { + return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) + } + if !cms.Unix.Empty() { + mflags |= linux.MSG_CTRUNC + cms.Release() + } + + if int(msg.Flags) != mflags { + // Copy out the flags to the caller. + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + return 0, err + } + } + + return uintptr(n), nil + } + + if msg.ControlLen > maxControlLen { + return 0, syserror.ENOBUFS + } + n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + defer cms.Release() + + controlData := make([]byte, 0, msg.ControlLen) + controlData = control.PackControlMessages(t, cms, controlData) + + if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { + creds, _ := cms.Unix.Credentials.(control.SCMCredentials) + controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) + } + + if cms.Unix.Rights != nil { + controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) + } + + // Copy the address to the caller. + if msg.NameLen != 0 { + if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil { + return 0, err + } + } + + // Copy the control data to the caller. + if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + return 0, err + } + if len(controlData) > 0 { + if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + return 0, err + } + } + + // Copy out the flags to the caller. + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + return 0, err + } + + return uintptr(n), nil +} + +// recvFrom is the implementation of the recvfrom syscall. It is called by +// recvfrom and recv syscall handlers. +func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { + if int(bufLen) < 0 { + return 0, syserror.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, syserror.ENOTSOCK + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) + cm.Release() + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + + // Copy the address to the caller. + if nameLenPtr != 0 { + if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { + return 0, err + } + } + + return uintptr(n), nil +} + +// RecvFrom implements the linux syscall recvfrom(2). +func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLenPtr := args[5].Pointer() + + n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) + return n, nil, err +} + +// SendMsg implements the linux syscall sendmsg(2). +func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syserror.EINVAL + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + n, err := sendSingleMsg(t, s, file, msgPtr, flags) + return n, nil, err +} + +// SendMMsg implements the linux syscall sendmmsg(2). +func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syserror.EINVAL + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + var n uintptr + if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { + break + } + + // Copy the received length to the caller. + lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) { + // Capture the message header. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + var controlData []byte + if msg.ControlLen > 0 { + // Put an upper bound to prevent large allocations. + if msg.ControlLen > maxControlLen { + return 0, syserror.ENOBUFS + } + controlData = make([]byte, msg.ControlLen) + if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + return 0, err + } + } + + // Read the destination address if one is specified. + var to []byte + if msg.NameLen != 0 { + var err error + to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen) + if err != nil { + return 0, err + } + } + + // Read data then call the sendmsg implementation. + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syserror.EMSGSIZE + } + src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + controlMessages, err := control.Parse(t, s, controlData) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) + err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) + if err != nil { + controlMessages.Release() + } + return uintptr(n), err +} + +// sendTo is the implementation of the sendto syscall. It is called by sendto +// and send syscall handlers. +func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { + bl := int(bufLen) + if bl < 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFile(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, syserror.ENOTSOCK + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + // Read the destination address if one is specified. + var to []byte + var err error + if namePtr != 0 { + to, err = CaptureAddress(t, namePtr, nameLen) + if err != nil { + return 0, err + } + } + + src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) + return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) +} + +// SendTo implements the linux syscall sendto(2). +func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLen := args[5].Uint() + + n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) + return n, nil, err +} + +// LINT.ThenChange(./vfs2/socket.go) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go new file mode 100644 index 000000000..77c78889d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -0,0 +1,337 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// doSplice implements a blocking splice operation. +func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { + if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) { + return 0, syserror.EINVAL + } + + if opts.Length > int64(kernel.MAX_RW_COUNT) { + opts.Length = int64(kernel.MAX_RW_COUNT) + } + + var ( + total int64 + n int64 + err error + inCh chan struct{} + outCh chan struct{} + ) + for opts.Length > 0 { + n, err = fs.Splice(t, outFile, inFile, opts) + opts.Length -= n + total += n + if err != syserror.ErrWouldBlock { + break + } else if err == syserror.ErrWouldBlock && nonBlocking { + break + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the splice operation. + if inFile.Readiness(EventMaskRead) == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, EventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err = t.Block(inCh); err != nil { + break + } + } + if outFile.Readiness(EventMaskWrite) == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, EventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err = t.Block(outCh); err != nil { + break + } + } + } + + if total > 0 { + // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some + // reason, events are generated for output files, but not input files. + outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return total, err +} + +// Sendfile implements linux system call sendfile(2). +func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + outFD := args[0].Int() + inFD := args[1].Int() + offsetAddr := args[2].Pointer() + count := int64(args[3].SizeT()) + + // Get files. + inFile := t.GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + if !inFile.Flags().Read { + return 0, nil, syserror.EBADF + } + + outFile := t.GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + if !outFile.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Verify that the outfile Append flag is not set. + if outFile.Flags().Append { + return 0, nil, syserror.EINVAL + } + + // Verify that we have a regular infile. This is a requirement; the + // same check appears in Linux (fs/splice.c:splice_direct_to_actor). + if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + var ( + n int64 + err error + ) + if offsetAddr != 0 { + // Verify that when offset address is not null, infile must be + // seekable. The fs.Splice routine itself validates basic read. + if !inFile.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Copy in the offset. + var offset int64 + if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + return 0, nil, err + } + + // Do the splice. + n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ + Length: count, + SrcOffset: true, + SrcStart: offset, + }, outFile.Flags().NonBlocking) + + // Copy out the new offset. + if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { + return 0, nil, err + } + } else { + // Send data using splice. + n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ + Length: count, + }, outFile.Flags().NonBlocking) + } + + // Sendfile can't lose any data because inFD is always a regual file. + if n != 0 { + err = nil + } + + // We can only pass a single file to handleIOError, so pick inFile + // arbitrarily. This is used only for debugging purposes. + return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile) +} + +// Splice implements splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + inOffset := args[1].Pointer() + outFD := args[2].Int() + outOffset := args[3].Pointer() + count := int64(args[4].SizeT()) + flags := args[5].Int() + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get files. + outFile := t.GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // Construct our options. + // + // Note that exactly one of the underlying buffers must be a pipe. We + // don't actually have this constraint internally, but we enforce it + // for the semantics of the call. + opts := fs.SpliceOpts{ + Length: count, + } + inFileAttr := inFile.Dirent.Inode.StableAttr + outFileAttr := outFile.Dirent.Inode.StableAttr + switch { + case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr): + if inOffset != 0 { + return 0, nil, syserror.ESPIPE + } + if outOffset != 0 { + if !outFile.Flags().Pwrite { + return 0, nil, syserror.EINVAL + } + + var offset int64 + if _, err := t.CopyIn(outOffset, &offset); err != nil { + return 0, nil, err + } + + // Use the destination offset. + opts.DstOffset = true + opts.DstStart = offset + } + case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr): + if outOffset != 0 { + return 0, nil, syserror.ESPIPE + } + if inOffset != 0 { + if !inFile.Flags().Pread { + return 0, nil, syserror.EINVAL + } + + var offset int64 + if _, err := t.CopyIn(inOffset, &offset); err != nil { + return 0, nil, err + } + + // Use the source offset. + opts.SrcOffset = true + opts.SrcStart = offset + } + case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr): + if inOffset != 0 || outOffset != 0 { + return 0, nil, syserror.ESPIPE + } + + // We may not refer to the same pipe; otherwise it's a continuous loop. + if inFileAttr.InodeID == outFileAttr.InodeID { + return 0, nil, syserror.EINVAL + } + default: + return 0, nil, syserror.EINVAL + } + + // Splice data. + n, err := doSplice(t, outFile, inFile, opts, nonBlock) + + // Special files can have additional requirements for granularity. For + // example, read from eventfd returns EINVAL if a size is less 8 bytes. + // Inotify is another example. read will return EINVAL is a buffer is + // too small to return the next event, but a size of an event isn't + // fixed, it is sizeof(struct inotify_event) + {NAME_LEN} + 1. + if n != 0 && err != nil && (fs.IsAnonymous(inFileAttr) || fs.IsAnonymous(outFileAttr)) { + err = nil + } + + // See above; inFile is chosen arbitrarily here. + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile) +} + +// Tee imlements tee(2). +func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + outFD := args[1].Int() + count := int64(args[2].SizeT()) + flags := args[3].Int() + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get files. + outFile := t.GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // All files must be pipes. + if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + // We may not refer to the same pipe; see above. + if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { + return 0, nil, syserror.EINVAL + } + + // The operation is non-blocking if anything is non-blocking. + nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // Splice data. + n, err := doSplice(t, outFile, inFile, fs.SpliceOpts{ + Length: count, + Dup: true, + }, nonBlock) + + // Tee doesn't change a state of inFD, so it can't lose any data. + if n != 0 { + err = nil + } + + // See above; inFile is chosen arbitrarily here. + return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile) +} diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go new file mode 100644 index 000000000..46ebf27a2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -0,0 +1,290 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChange + +// Stat implements linux syscall stat(2). +func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + statAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return stat(t, d, dirPath, statAddr) + }) +} + +// Fstatat implements linux syscall newfstatat, i.e. fstatat(2). +func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + statAddr := args[2].Pointer() + flags := args[3].Int() + + path, dirPath, err := copyInPath(t, addr, flags&linux.AT_EMPTY_PATH != 0) + if err != nil { + return 0, nil, err + } + + if path == "" { + // Annoying. What's wrong with fstat? + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, fstat(t, file, statAddr) + } + + // If the path ends in a slash (i.e. dirPath is true) or if AT_SYMLINK_NOFOLLOW is unset, + // then we must resolve the final component. + resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0 + + return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return stat(t, d, dirPath, statAddr) + }) +} + +// Lstat implements linux syscall lstat(2). +func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + statAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + // If the path ends in a slash (i.e. dirPath is true), then we *do* + // want to resolve the final component. + resolve := dirPath + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return stat(t, d, dirPath, statAddr) + }) +} + +// Fstat implements linux syscall fstat(2). +func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + statAddr := args[1].Pointer() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, fstat(t, file, statAddr) +} + +// stat implements stat from the given *fs.Dirent. +func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(t) + if err != nil { + return err + } + s := statFromAttrs(t, d.Inode.StableAttr, uattr) + _, err = s.CopyOut(t, statAddr) + return err +} + +// fstat implements fstat for the given *fs.File. +func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { + uattr, err := f.UnstableAttr(t) + if err != nil { + return err + } + s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr) + _, err = s.CopyOut(t, statAddr) + return err +} + +// Statx implements linux syscall statx(2). +func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + mask := args[3].Uint() + statxAddr := args[4].Pointer() + + if mask&linux.STATX__RESERVED != 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 { + return 0, nil, syserror.EINVAL + } + if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { + return 0, nil, syserror.EINVAL + } + + path, dirPath, err := copyInPath(t, pathAddr, flags&linux.AT_EMPTY_PATH != 0) + if err != nil { + return 0, nil, err + } + + if path == "" { + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + uattr, err := file.UnstableAttr(t) + if err != nil { + return 0, nil, err + } + return 0, nil, statx(t, file.Dirent.Inode.StableAttr, uattr, statxAddr) + } + + resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0 + + return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(t) + if err != nil { + return err + } + return statx(t, d.Inode.StableAttr, uattr, statxAddr) + }) +} + +func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr usermem.Addr) error { + // "[T]he kernel may return fields that weren't requested and may fail to + // return fields that were requested, depending on what the backing + // filesystem supports. + // [...] + // A filesystem may also fill in fields that the caller didn't ask for + // if it has values for them available and the information is available + // at no extra cost. If this happens, the corresponding bits will be + // set in stx_mask." -- statx(2) + // + // We fill in all the values we have (which currently does not include + // btime, see b/135608823), regardless of what the user asked for. The + // STATX_BASIC_STATS mask indicates that all fields are present except + // for btime. + + devMajor, devMinor := linux.DecodeDeviceID(uint32(sattr.DeviceID)) + s := linux.Statx{ + // TODO(b/135608823): Support btime, and then change this to + // STATX_ALL to indicate presence of btime. + Mask: linux.STATX_BASIC_STATS, + + // No attributes, and none supported. + Attributes: 0, + AttributesMask: 0, + + Blksize: uint32(sattr.BlockSize), + Nlink: uint32(uattr.Links), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Mode: uint16(sattr.Type.LinuxType()) | uint16(uattr.Perms.LinuxMode()), + Ino: sattr.InodeID, + Size: uint64(uattr.Size), + Blocks: uint64(uattr.Usage) / 512, + Atime: uattr.AccessTime.StatxTimestamp(), + Ctime: uattr.StatusChangeTime.StatxTimestamp(), + Mtime: uattr.ModificationTime.StatxTimestamp(), + RdevMajor: uint32(sattr.DeviceFileMajor), + RdevMinor: sattr.DeviceFileMinor, + DevMajor: uint32(devMajor), + DevMinor: devMinor, + } + _, err := t.CopyOut(statxAddr, &s) + return err +} + +// Statfs implements linux syscall statfs(2). +func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + statfsAddr := args[1].Pointer() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return statfsImpl(t, d, statfsAddr) + }) +} + +// Fstatfs implements linux syscall fstatfs(2). +func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + statfsAddr := args[1].Pointer() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, statfsImpl(t, file.Dirent, statfsAddr) +} + +// statfsImpl implements the linux syscall statfs and fstatfs based on a Dirent, +// copying the statfs structure out to addr on success, otherwise an error is +// returned. +func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { + info, err := d.Inode.StatFS(t) + if err != nil { + return err + } + // Construct the statfs structure and copy it out. + statfs := linux.Statfs{ + Type: info.Type, + // Treat block size and fragment size as the same, as + // most consumers of this structure will expect one + // or the other to be filled in. + BlockSize: d.Inode.StableAttr.BlockSize, + Blocks: info.TotalBlocks, + // We don't have the concept of reserved blocks, so + // report blocks free the same as available blocks. + // This is a normal thing for filesystems, to do, see + // udf, hugetlbfs, tmpfs, among others. + BlocksFree: info.FreeBlocks, + BlocksAvailable: info.FreeBlocks, + Files: info.TotalFiles, + FilesFree: info.FreeFiles, + // Same as Linux for simple_statfs, see fs/libfs.c. + NameLength: linux.NAME_MAX, + FragmentSize: d.Inode.StableAttr.BlockSize, + // Leave other fields 0 like simple_statfs does. + } + _, err = t.CopyOut(addr, &statfs) + return err +} + +// LINT.ThenChange(vfs2/stat.go) diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go new file mode 100644 index 000000000..0a04a6113 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -0,0 +1,45 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// LINT.IfChange + +func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { + return linux.Stat{ + Dev: sattr.DeviceID, + Ino: sattr.InodeID, + Nlink: uattr.Links, + Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)), + Size: uattr.Size, + Blksize: sattr.BlockSize, + Blocks: uattr.Usage / 512, + ATime: uattr.AccessTime.Timespec(), + MTime: uattr.ModificationTime.Timespec(), + CTime: uattr.StatusChangeTime.Timespec(), + } +} + +// LINT.ThenChange(vfs2/stat_amd64.go) diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go new file mode 100644 index 000000000..5a3b1bfad --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -0,0 +1,45 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// LINT.IfChange + +func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { + return linux.Stat{ + Dev: sattr.DeviceID, + Ino: sattr.InodeID, + Nlink: uint32(uattr.Links), + Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)), + Size: uattr.Size, + Blksize: int32(sattr.BlockSize), + Blocks: uattr.Usage / 512, + ATime: uattr.AccessTime.Timespec(), + MTime: uattr.ModificationTime.Timespec(), + CTime: uattr.StatusChangeTime.Timespec(), + } +} + +// LINT.ThenChange(vfs2/stat_arm64.go) diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go new file mode 100644 index 000000000..5ad465ae3 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -0,0 +1,141 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// LINT.IfChange + +// Sync implements linux system call sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.MountNamespace().SyncAll(t) + // Sync is always successful. + return 0, nil, nil +} + +// Syncfs implements linux system call syncfs(2). +func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Use "sync-the-world" for now, it's guaranteed that fd is at least + // on the root filesystem. + return Sync(t, args) +} + +// Fsync implements linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Fdatasync implements linux syscall fdatasync(2). +// +// At the moment, it just calls Fsync, which is a big hammer, but correct. +func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// SyncFileRange implements linux syscall sync_file_rage(2) +func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + var err error + + fd := args[0].Int() + offset := args[1].Int64() + nbytes := args[2].Int64() + uflags := args[3].Uint() + + if offset < 0 || offset+nbytes < offset { + return 0, nil, syserror.EINVAL + } + + if uflags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE| + linux.SYNC_FILE_RANGE_WRITE| + linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { + return 0, nil, syserror.EINVAL + } + + if nbytes == 0 { + nbytes = fs.FileMaxOffset + } + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the + // specified range that have already been submitted to the device + // driver for write-out before performing any write. + if uflags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && + uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + // SYNC_FILE_RANGE_WRITE initiates write-out of all dirty pages in the + // specified range which are not presently submitted write-out. + // + // It looks impossible to implement this functionality without a + // massive rework of the vfs subsystem. file.Fsync() take a file lock + // for the entire operation, so even if it is running in a go routing, + // it blocks other file operations instead of flushing data in the + // background. + // + // It should be safe to skipped this flag while nobody uses + // SYNC_FILE_RANGE_WAIT_BEFORE. + + // SYNC_FILE_RANGE_WAIT_AFTER waits upon write-out of all pages in the + // range after performing any write. + // + // In Linux, sync_file_range() doesn't writes out the file's + // meta-data, but fdatasync() does if a file size is changed. + if uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { + err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData) + } + + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go new file mode 100644 index 000000000..297de052a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -0,0 +1,48 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" +) + +// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo. +func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + mf := t.Kernel().MemoryFile() + mf.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } + + // Only a subset of the fields in sysinfo_t make sense to return. + si := linux.Sysinfo{ + Procs: uint16(len(t.PIDNamespace().Tasks())), + Uptime: t.Kernel().MonotonicClock().Now().Seconds(), + TotalRAM: totalSize, + FreeRAM: memFree, + Unit: 1, + } + _, err := t.CopyOut(addr, si) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go new file mode 100644 index 000000000..40c8bb061 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -0,0 +1,61 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + _SYSLOG_ACTION_READ_ALL = 3 + _SYSLOG_ACTION_SIZE_BUFFER = 10 +) + +// logBufLen is the default syslog buffer size on Linux. +const logBufLen = 1 << 17 + +// Syslog implements part of Linux syscall syslog. +// +// Only the unpriviledged commands are implemented, allowing applications to +// read a fun dmesg. +func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + command := args[0].Int() + buf := args[1].Pointer() + size := int(args[2].Int()) + + switch command { + case _SYSLOG_ACTION_READ_ALL: + if size < 0 { + return 0, nil, syserror.EINVAL + } + if size > logBufLen { + size = logBufLen + } + + log := t.Kernel().Syslog().Log() + if len(log) > size { + log = log[:size] + } + + n, err := t.CopyOutBytes(buf, log) + return uintptr(n), nil, err + case _SYSLOG_ACTION_SIZE_BUFFER: + return logBufLen, nil, nil + default: + return 0, nil, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go new file mode 100644 index 000000000..00915fdde --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -0,0 +1,769 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "path" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + // ExecMaxTotalSize is the maximum length of all argv and envv entries. + // + // N.B. The behavior here is different than Linux. Linux provides a limit on + // individual arguments of 32 pages, and an aggregate limit of at least 32 pages + // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement + // any behavior based on the stack size, and instead provide a fixed hard-limit of + // 2 MB (which should work well given that 8 MB stack limits are common). + ExecMaxTotalSize = 2 * 1024 * 1024 + + // ExecMaxElemSize is the maximum length of a single argv or envv entry. + ExecMaxElemSize = 32 * usermem.PageSize + + // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. + exitSignalMask = 0xff +) + +// Getppid implements linux syscall getppid(2). +func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + parent := t.Parent() + if parent == nil { + return 0, nil, nil + } + return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil +} + +// Getpid implements linux syscall getpid(2). +func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.ThreadGroup().ID()), nil, nil +} + +// Gettid implements linux syscall gettid(2). +func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.ThreadID()), nil, nil +} + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + + return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0) +} + +// Execveat implements linux syscall execveat(2). +func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := args[0].Int() + pathnameAddr := args[1].Pointer() + argvAddr := args[2].Pointer() + envvAddr := args[3].Pointer() + flags := args[4].Int() + + return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags) +} + +func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { + pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) + if err != nil { + return 0, nil, err + } + + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 + if !atEmptyPath && len(pathname) == 0 { + return 0, nil, syserror.ENOENT + } + resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 + + root := t.FSContext().RootDirectory() + defer root.DecRef() + + var wd *fs.Dirent + var executable fsbridge.File + var closeOnExec bool + if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { + // Even if the pathname is absolute, we may still need the wd + // for interpreter scripts if the path of the interpreter is + // relative. + wd = t.FSContext().WorkingDirectory() + } else { + // Need to extract the given FD. + f, fdFlags := t.FDTable().Get(dirFD) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + closeOnExec = fdFlags.CloseOnExec + + if atEmptyPath && len(pathname) == 0 { + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. + if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil { + return 0, nil, err + } + executable = fsbridge.NewFSFile(f) + } else { + wd = f.Dirent + wd.IncRef() + if !fs.IsDir(wd.Inode.StableAttr) { + return 0, nil, syserror.ENOTDIR + } + } + } + if wd != nil { + defer wd.DecRef() + } + + // Load the new TaskContext. + remainingTraversals := uint(linux.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), + RemainingTraversals: &remainingTraversals, + ResolveFinal: resolveFinal, + Filename: pathname, + File: executable, + CloseOnExec: closeOnExec, + Argv: argv, + Envv: envv, + Features: t.Arch().FeatureSet(), + } + + tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + if se != nil { + return 0, nil, se.ToError() + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} + +// Exit implements linux syscall exit(2). +func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + status := int(args[0].Int()) + t.PrepareExit(kernel.ExitStatus{Code: status}) + return 0, kernel.CtrlDoExit, nil +} + +// ExitGroup implements linux syscall exit_group(2). +func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + status := int(args[0].Int()) + t.PrepareGroupExit(kernel.ExitStatus{Code: status}) + return 0, kernel.CtrlDoExit, nil +} + +// clone is used by Clone, Fork, and VFork. +func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) { + opts := kernel.CloneOptions{ + SharingOptions: kernel.SharingOptions{ + NewAddressSpace: flags&linux.CLONE_VM == 0, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, + NewThreadGroup: flags&linux.CLONE_THREAD == 0, + TerminationSignal: linux.Signal(flags & exitSignalMask), + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == 0, + NewFSContext: flags&linux.CLONE_FS == 0, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, + }, + Stack: stack, + SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, + TLS: tls, + ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, + ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, + ChildTID: childTID, + ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, + ParentTID: parentTID, + Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, + Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, + InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, + } + ntid, ctrl, err := t.Clone(&opts) + return uintptr(ntid), ctrl, err +} + +// Fork implements Linux syscall fork(2). +func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // "A call to fork() is equivalent to a call to clone(2) specifying flags + // as just SIGCHLD." - fork(2) + return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) +} + +// Vfork implements Linux syscall vfork(2). +func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // """ + // A call to vfork() is equivalent to calling clone(2) with flags specified as: + // + // CLONE_VM | CLONE_VFORK | SIGCHLD + // """ - vfork(2) + return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) +} + +// parseCommonWaitOptions applies the options common to wait4 and waitid to +// wopts. +func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { + switch options & (linux.WCLONE | linux.WALL) { + case 0: + wopts.NonCloneTasks = true + case linux.WCLONE: + wopts.CloneTasks = true + case linux.WALL: + wopts.NonCloneTasks = true + wopts.CloneTasks = true + default: + return syserror.EINVAL + } + if options&linux.WCONTINUED != 0 { + wopts.Events |= kernel.EventGroupContinue + } + if options&linux.WNOHANG == 0 { + wopts.BlockInterruptErr = kernel.ERESTARTSYS + } + if options&linux.WNOTHREAD == 0 { + wopts.SiblingChildren = true + } + return nil +} + +// wait4 waits for the given child process to exit. +func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { + if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { + return 0, syserror.EINVAL + } + wopts := kernel.WaitOptions{ + Events: kernel.EventExit | kernel.EventTraceeStop, + ConsumeEvent: true, + } + // There are four cases to consider: + // + // pid < -1 any child process whose process group ID is equal to the absolute value of pid + // pid == -1 any child process + // pid == 0 any child process whose process group ID is equal to that of the calling process + // pid > 0 the child whose process ID is equal to the value of pid + switch { + case pid < -1: + wopts.SpecificPGID = kernel.ProcessGroupID(-pid) + case pid == -1: + // Any process is the default. + case pid == 0: + wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) + default: + wopts.SpecificTID = kernel.ThreadID(pid) + } + + if err := parseCommonWaitOptions(&wopts, options); err != nil { + return 0, err + } + if options&linux.WUNTRACED != 0 { + wopts.Events |= kernel.EventChildGroupStop + } + + wr, err := t.Wait(&wopts) + if err != nil { + if err == kernel.ErrNoWaitableEvent { + return 0, nil + } + return 0, err + } + if statusAddr != 0 { + if _, err := t.CopyOut(statusAddr, wr.Status); err != nil { + return 0, err + } + } + if rusageAddr != 0 { + ru := getrusage(wr.Task, linux.RUSAGE_BOTH) + if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + return 0, err + } + } + return uintptr(wr.TID), nil +} + +// Wait4 implements linux syscall wait4(2). +func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := int(args[0].Int()) + statusAddr := args[1].Pointer() + options := int(args[2].Uint()) + rusageAddr := args[3].Pointer() + + n, err := wait4(t, pid, statusAddr, options, rusageAddr) + return n, nil, err +} + +// WaitPid implements linux syscall waitpid(2). +func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := int(args[0].Int()) + statusAddr := args[1].Pointer() + options := int(args[2].Uint()) + + n, err := wait4(t, pid, statusAddr, options, 0) + return n, nil, err +} + +// Waitid implements linux syscall waitid(2). +func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + idtype := args[0].Int() + id := args[1].Int() + infop := args[2].Pointer() + options := int(args[3].Uint()) + rusageAddr := args[4].Pointer() + + if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { + return 0, nil, syserror.EINVAL + } + if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { + return 0, nil, syserror.EINVAL + } + wopts := kernel.WaitOptions{ + Events: kernel.EventTraceeStop, + ConsumeEvent: options&linux.WNOWAIT == 0, + } + switch idtype { + case linux.P_ALL: + case linux.P_PID: + wopts.SpecificTID = kernel.ThreadID(id) + case linux.P_PGID: + wopts.SpecificPGID = kernel.ProcessGroupID(id) + default: + return 0, nil, syserror.EINVAL + } + + if err := parseCommonWaitOptions(&wopts, options); err != nil { + return 0, nil, err + } + if options&linux.WEXITED != 0 { + wopts.Events |= kernel.EventExit + } + if options&linux.WSTOPPED != 0 { + wopts.Events |= kernel.EventChildGroupStop + } + + wr, err := t.Wait(&wopts) + if err != nil { + if err == kernel.ErrNoWaitableEvent { + err = nil + // "If WNOHANG was specified in options and there were no children + // in a waitable state, then waitid() returns 0 immediately and the + // state of the siginfo_t structure pointed to by infop is + // unspecified." - waitid(2). But Linux's waitid actually zeroes + // out the fields it would set for a successful waitid in this case + // as well. + if infop != 0 { + var si arch.SignalInfo + _, err = t.CopyOut(infop, &si) + } + } + return 0, nil, err + } + if rusageAddr != 0 { + ru := getrusage(wr.Task, linux.RUSAGE_BOTH) + if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + return 0, nil, err + } + } + if infop == 0 { + return 0, nil, nil + } + si := arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + } + si.SetPid(int32(wr.TID)) + si.SetUid(int32(wr.UID)) + // TODO(b/73541790): convert kernel.ExitStatus to functions and make + // WaitResult.Status a linux.WaitStatus. + s := syscall.WaitStatus(wr.Status) + switch { + case s.Exited(): + si.Code = arch.CLD_EXITED + si.SetStatus(int32(s.ExitStatus())) + case s.Signaled(): + si.Code = arch.CLD_KILLED + si.SetStatus(int32(s.Signal())) + case s.CoreDump(): + si.Code = arch.CLD_DUMPED + si.SetStatus(int32(s.Signal())) + case s.Stopped(): + if wr.Event == kernel.EventTraceeStop { + si.Code = arch.CLD_TRAPPED + si.SetStatus(int32(s.TrapCause())) + } else { + si.Code = arch.CLD_STOPPED + si.SetStatus(int32(s.StopSignal())) + } + case s.Continued(): + si.Code = arch.CLD_CONTINUED + si.SetStatus(int32(linux.SIGCONT)) + default: + t.Warningf("waitid got incomprehensible wait status %d", s) + } + _, err = t.CopyOut(infop, &si) + return 0, nil, err +} + +// SetTidAddress implements linux syscall set_tid_address(2). +func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + // Always succeed, return caller's tid. + t.SetClearTID(addr) + return uintptr(t.ThreadID()), nil, nil +} + +// Unshare implements linux syscall unshare(2). +func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + opts := kernel.SharingOptions{ + NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, + NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, + NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, + } + // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) + if opts.NewPIDNamespace { + opts.NewThreadGroup = true + } + // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since + // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." + if opts.NewUserNamespace { + opts.NewThreadGroup = true + opts.NewFSContext = true + } + return 0, nil, t.Unshare(&opts) +} + +// SchedYield implements linux syscall sched_yield(2). +func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.Yield() + return 0, nil, nil +} + +// SchedSetaffinity implements linux syscall sched_setaffinity(2). +func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := args[0].Int() + size := args[1].SizeT() + maskAddr := args[2].Pointer() + + var task *kernel.Task + if tid == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return 0, nil, syserror.ESRCH + } + } + + mask := sched.NewCPUSet(t.Kernel().ApplicationCores()) + if size > mask.Size() { + size = mask.Size() + } + if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil { + return 0, nil, err + } + return 0, nil, task.SetCPUMask(mask) +} + +// SchedGetaffinity implements linux syscall sched_getaffinity(2). +func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := args[0].Int() + size := args[1].SizeT() + maskAddr := args[2].Pointer() + + // This limitation is because linux stores the cpumask + // in an array of "unsigned long" so the buffer needs to + // be a multiple of the word size. + if size&(t.Arch().Width()-1) > 0 { + return 0, nil, syserror.EINVAL + } + + var task *kernel.Task + if tid == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return 0, nil, syserror.ESRCH + } + } + + mask := task.CPUMask() + // The buffer needs to be big enough to hold a cpumask with + // all possible cpus. + if size < mask.Size() { + return 0, nil, syserror.EINVAL + } + _, err := t.CopyOutBytes(maskAddr, mask) + + // NOTE: The syscall interface is slightly different than the glibc + // interface. The raw sched_getaffinity syscall returns the number of + // bytes used to represent a cpu mask. + return uintptr(mask.Size()), nil, err +} + +// Getcpu implements linux syscall getcpu(2). +func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + cpu := args[0].Pointer() + node := args[1].Pointer() + // third argument to this system call is nowadays unused. + + if cpu != 0 { + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.CPU())) + if _, err := t.CopyOutBytes(cpu, buf); err != nil { + return 0, nil, err + } + } + // We always return node 0. + if node != 0 { + if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// Setpgid implements the linux syscall setpgid(2). +func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Note that throughout this function, pgid is interpreted with respect + // to t's namespace, not with respect to the selected ThreadGroup's + // namespace (which may be different). + pid := kernel.ThreadID(args[0].Int()) + pgid := kernel.ProcessGroupID(args[1].Int()) + + // "If pid is zero, then the process ID of the calling process is used." + tg := t.ThreadGroup() + if pid != 0 { + ot := t.PIDNamespace().TaskWithID(pid) + if ot == nil { + return 0, nil, syserror.ESRCH + } + tg = ot.ThreadGroup() + if tg.Leader() != ot { + return 0, nil, syserror.EINVAL + } + + // Setpgid only operates on child threadgroups. + if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { + return 0, nil, syserror.ESRCH + } + } + + // "If pgid is zero, then the PGID of the process specified by pid is made + // the same as its process ID." + defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg)) + if pgid == 0 { + pgid = defaultPGID + } else if pgid < 0 { + return 0, nil, syserror.EINVAL + } + + // If the pgid is the same as the group, then create a new one. Otherwise, + // we attempt to join an existing process group. + if pgid == defaultPGID { + // For convenience, errors line up with Linux syscall API. + if err := tg.CreateProcessGroup(); err != nil { + // Is the process group already as expected? If so, + // just return success. This is the same behavior as + // Linux. + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID { + return 0, nil, nil + } + return 0, nil, err + } + } else { + // Same as CreateProcessGroup, above. + if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil { + // See above. + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { + return 0, nil, nil + } + return 0, nil, err + } + } + + // Success. + return 0, nil, nil +} + +// Getpgrp implements the linux syscall getpgrp(2). +func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil +} + +// Getpgid implements the linux syscall getpgid(2). +func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + if tid == 0 { + return Getpgrp(t, args) + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil +} + +// Setsid implements the linux syscall setsid(2). +func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.ThreadGroup().CreateSession() +} + +// Getsid implements the linux syscall getsid(2). +func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + if tid == 0 { + return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil +} + +// Getpriority pretends to implement the linux syscall getpriority(2). +// +// This is a stub; real priorities require a full scheduler. +func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + who := kernel.ThreadID(args[1].Int()) + + switch which { + case linux.PRIO_PROCESS: + // Look for who, return ESRCH if not found. + var task *kernel.Task + if who == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(who) + } + + if task == nil { + return 0, nil, syserror.ESRCH + } + + // From kernel/sys.c:getpriority: + // "To avoid negative return values, 'getpriority()' + // will not return the normal nice-value, but a negated + // value that has been offset by 20" + return uintptr(20 - task.Niceness()), nil, nil + case linux.PRIO_USER: + fallthrough + case linux.PRIO_PGRP: + // PRIO_USER and PRIO_PGRP have no further implementation yet. + return 0, nil, nil + default: + return 0, nil, syserror.EINVAL + } +} + +// Setpriority pretends to implement the linux syscall setpriority(2). +// +// This is a stub; real priorities require a full scheduler. +func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + who := kernel.ThreadID(args[1].Int()) + niceval := int(args[2].Int()) + + // In the kernel's implementation, values outside the range + // of [-20, 19] are truncated to these minimum and maximum + // values. + if niceval < -20 /* min niceval */ { + niceval = -20 + } else if niceval > 19 /* max niceval */ { + niceval = 19 + } + + switch which { + case linux.PRIO_PROCESS: + // Look for who, return ESRCH if not found. + var task *kernel.Task + if who == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(who) + } + + if task == nil { + return 0, nil, syserror.ESRCH + } + + task.SetNiceness(niceval) + case linux.PRIO_USER: + fallthrough + case linux.PRIO_PGRP: + // PRIO_USER and PRIO_PGRP have no further implementation yet. + return 0, nil, nil + default: + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} + +// Ptrace implements linux system call ptrace(2). +func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + req := args[0].Int64() + pid := kernel.ThreadID(args[1].Int()) + addr := args[2].Pointer() + data := args[3].Pointer() + + return 0, nil, t.Ptrace(req, pid, addr, data) +} diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go new file mode 100644 index 000000000..2d2aa0819 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -0,0 +1,342 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// The most significant 29 bits hold either a pid or a file descriptor. +func pidOfClockID(c int32) kernel.ThreadID { + return kernel.ThreadID(^(c >> 3)) +} + +// whichCPUClock returns one of CPUCLOCK_PERF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or +// CLOCK_FD. +func whichCPUClock(c int32) int32 { + return c & linux.CPUCLOCK_CLOCK_MASK +} + +// isCPUClockPerThread returns true if the CPUCLOCK_PERTHREAD bit is set in the +// clock id. +func isCPUClockPerThread(c int32) bool { + return c&linux.CPUCLOCK_PERTHREAD_MASK != 0 +} + +// isValidCPUClock returns checks that the cpu clock id is valid. +func isValidCPUClock(c int32) bool { + // Bits 0, 1, and 2 cannot all be set. + if c&7 == 7 { + return false + } + if whichCPUClock(c) >= linux.CPUCLOCK_MAX { + return false + } + return true +} + +// targetTask returns the kernel.Task for the given clock id. +func targetTask(t *kernel.Task, c int32) *kernel.Task { + pid := pidOfClockID(c) + if pid == 0 { + return t + } + return t.PIDNamespace().TaskWithID(pid) +} + +// ClockGetres implements linux syscall clock_getres(2). +func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + addr := args[1].Pointer() + r := linux.Timespec{ + Sec: 0, + Nsec: 1, + } + + if _, err := getClock(t, clockID); err != nil { + return 0, nil, syserror.EINVAL + } + + if addr == 0 { + // Don't need to copy out. + return 0, nil, nil + } + + return 0, nil, copyTimespecOut(t, addr, &r) +} + +type cpuClocker interface { + UserCPUClock() ktime.Clock + CPUClock() ktime.Clock +} + +func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { + if clockID < 0 { + if !isValidCPUClock(clockID) { + return nil, syserror.EINVAL + } + + targetTask := targetTask(t, clockID) + if targetTask == nil { + return nil, syserror.EINVAL + } + + var target cpuClocker + if isCPUClockPerThread(clockID) { + target = targetTask + } else { + target = targetTask.ThreadGroup() + } + + switch whichCPUClock(clockID) { + case linux.CPUCLOCK_VIRT: + return target.UserCPUClock(), nil + case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED: + // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF. + return target.CPUClock(), nil + default: + return nil, syserror.EINVAL + } + } + + switch clockID { + case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE: + return t.Kernel().RealtimeClock(), nil + case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE, + linux.CLOCK_MONOTONIC_RAW, linux.CLOCK_BOOTTIME: + // CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW. + // CLOCK_BOOTTIME is internally mapped to CLOCK_MONOTONIC, as: + // - CLOCK_BOOTTIME should behave as CLOCK_MONOTONIC while also + // including suspend time. + // - gVisor has no concept of suspend/resume. + // - CLOCK_MONOTONIC already includes save/restore time, which is + // the closest to suspend time. + return t.Kernel().MonotonicClock(), nil + case linux.CLOCK_PROCESS_CPUTIME_ID: + return t.ThreadGroup().CPUClock(), nil + case linux.CLOCK_THREAD_CPUTIME_ID: + return t.CPUClock(), nil + default: + return nil, syserror.EINVAL + } +} + +// ClockGettime implements linux syscall clock_gettime(2). +func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + addr := args[1].Pointer() + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + ts := c.Now().Timespec() + return 0, nil, copyTimespecOut(t, addr, &ts) +} + +// ClockSettime implements linux syscall clock_settime(2). +func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.EPERM +} + +// Time implements linux syscall time(2). +func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + r := t.Kernel().RealtimeClock().Now().TimeT() + if addr == usermem.Addr(0) { + return uintptr(r), nil, nil + } + + if _, err := t.CopyOut(addr, r); err != nil { + return 0, nil, err + } + return uintptr(r), nil, nil +} + +// clockNanosleepRestartBlock encapsulates the state required to restart +// clock_nanosleep(2) via restart_syscall(2). +// +// +stateify savable +type clockNanosleepRestartBlock struct { + c ktime.Clock + duration time.Duration + rem usermem.Addr +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return 0, clockNanosleepFor(t, n.c, n.duration, n.rem) +} + +// clockNanosleepUntil blocks until a specified time. +// +// If blocking is interrupted, the syscall is restarted with the original +// arguments. +func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(c, notifier) + + // Turn on the timer. + timer.Swap(ktime.Setting{ + Period: 0, + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + + err := t.BlockWithTimer(nil, tchan) + + timer.Destroy() + + // Did we just block until the timeout happened? + if err == syserror.ETIMEDOUT { + return nil + } + + return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND) +} + +// clockNanosleepFor blocks for a specified duration. +// +// If blocking is interrupted, the syscall is restarted with the remaining +// duration timeout. +func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem usermem.Addr) error { + timer, start, tchan := ktime.After(c, dur) + + err := t.BlockWithTimer(nil, tchan) + + after := c.Now() + + timer.Destroy() + + switch err { + case syserror.ETIMEDOUT: + // Slept for entire timeout. + return nil + case syserror.ErrInterrupted: + // Interrupted. + remaining := dur - after.Sub(start) + if remaining < 0 { + remaining = time.Duration(0) + } + + // Copy out remaining time. + if rem != 0 { + timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) + if err := copyTimespecOut(t, rem, &timeleft); err != nil { + return err + } + } + + // Arrange for a restart with the remaining duration. + t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ + c: c, + duration: remaining, + rem: rem, + }) + return kernel.ERESTART_RESTARTBLOCK + default: + panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) + } +} + +// Nanosleep implements linux syscall Nanosleep(2). +func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + rem := args[1].Pointer() + + ts, err := copyTimespecIn(t, addr) + if err != nil { + return 0, nil, err + } + + if !ts.Valid() { + return 0, nil, syserror.EINVAL + } + + // Just like linux, we cap the timeout with the max number that int64 can + // represent which is roughly 292 years. + dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond + return 0, nil, clockNanosleepFor(t, t.Kernel().MonotonicClock(), dur, rem) +} + +// ClockNanosleep implements linux syscall clock_nanosleep(2). +func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + flags := args[1].Int() + addr := args[2].Pointer() + rem := args[3].Pointer() + + req, err := copyTimespecIn(t, addr) + if err != nil { + return 0, nil, err + } + + if !req.Valid() { + return 0, nil, syserror.EINVAL + } + + // Only allow clock constants also allowed by Linux. + if clockID > 0 { + if clockID != linux.CLOCK_REALTIME && + clockID != linux.CLOCK_MONOTONIC && + clockID != linux.CLOCK_PROCESS_CPUTIME_ID { + return 0, nil, syserror.EINVAL + } + } + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + + if flags&linux.TIMER_ABSTIME != 0 { + return 0, nil, clockNanosleepUntil(t, c, req) + } + + dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond + return 0, nil, clockNanosleepFor(t, c, dur, rem) +} + +// Gettimeofday implements linux syscall gettimeofday(2). +func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tv := args[0].Pointer() + tz := args[1].Pointer() + + if tv != usermem.Addr(0) { + nowTv := t.Kernel().RealtimeClock().Now().Timeval() + if err := copyTimevalOut(t, tv, &nowTv); err != nil { + return 0, nil, err + } + } + + if tz != usermem.Addr(0) { + // Ask the time package for the timezone. + _, offset := time.Now().Zone() + // This int32 array mimics linux's struct timezone. + timezone := [2]int32{-int32(offset) / 60, 0} + _, err := t.CopyOut(tz, timezone) + return 0, nil, err + } + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go new file mode 100644 index 000000000..a4c400f87 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -0,0 +1,203 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const nsecPerSec = int64(time.Second) + +// copyItimerValIn copies an ItimerVal from the untrusted app range to the +// kernel. The ItimerVal may be either 32 or 64 bits. +// A NULL address is allowed because because Linux allows +// setitimer(which, NULL, &old_value) which disables the timer. +// There is a KERN_WARN message saying this misfeature will be removed. +// However, that hasn't happened as of 3.19, so we continue to support it. +func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) { + if addr == usermem.Addr(0) { + return linux.ItimerVal{}, nil + } + + switch t.Arch().Width() { + case 8: + // Native size, just copy directly. + var itv linux.ItimerVal + if _, err := t.CopyIn(addr, &itv); err != nil { + return linux.ItimerVal{}, err + } + + return itv, nil + default: + return linux.ItimerVal{}, syserror.ENOSYS + } +} + +// copyItimerValOut copies an ItimerVal to the untrusted app range. +// The ItimerVal may be either 32 or 64 bits. +// A NULL address is allowed, in which case no copy takes place +func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error { + if addr == usermem.Addr(0) { + return nil + } + + switch t.Arch().Width() { + case 8: + // Native size, just copy directly. + _, err := t.CopyOut(addr, itv) + return err + default: + return syserror.ENOSYS + } +} + +// Getitimer implements linux syscall getitimer(2). +func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := args[0].Int() + val := args[1].Pointer() + + olditv, err := t.Getitimer(timerID) + if err != nil { + return 0, nil, err + } + return 0, nil, copyItimerValOut(t, val, &olditv) +} + +// Setitimer implements linux syscall setitimer(2). +func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := args[0].Int() + newVal := args[1].Pointer() + oldVal := args[2].Pointer() + + newitv, err := copyItimerValIn(t, newVal) + if err != nil { + return 0, nil, err + } + olditv, err := t.Setitimer(timerID, newitv) + if err != nil { + return 0, nil, err + } + return 0, nil, copyItimerValOut(t, oldVal, &olditv) +} + +// Alarm implements linux syscall alarm(2). +func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + duration := time.Duration(args[0].Uint()) * time.Second + + olditv, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{ + Value: linux.DurationToTimeval(duration), + }) + if err != nil { + return 0, nil, err + } + olddur := olditv.Value.ToDuration() + secs := olddur.Round(time.Second).Nanoseconds() / nsecPerSec + if secs == 0 && olddur != 0 { + // We can't return 0 if an alarm was previously scheduled. + secs = 1 + } + return uintptr(secs), nil, nil +} + +// TimerCreate implements linux syscall timer_create(2). +func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + sevp := args[1].Pointer() + timerIDp := args[2].Pointer() + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + + var sev *linux.Sigevent + if sevp != 0 { + sev = &linux.Sigevent{} + if _, err = t.CopyIn(sevp, sev); err != nil { + return 0, nil, err + } + } + + id, err := t.IntervalTimerCreate(c, sev) + if err != nil { + return 0, nil, err + } + + if _, err := t.CopyOut(timerIDp, &id); err != nil { + t.IntervalTimerDelete(id) + return 0, nil, err + } + + return 0, nil, nil +} + +// TimerSettime implements linux syscall timer_settime(2). +func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0) + if err != nil { + return 0, nil, err + } + if oldValAddr != 0 { + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// TimerGettime implements linux syscall timer_gettime(2). +func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + curValAddr := args[1].Pointer() + + curVal, err := t.IntervalTimerGettime(timerID) + if err != nil { + return 0, nil, err + } + _, err = t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} + +// TimerGetoverrun implements linux syscall timer_getoverrun(2). +func TimerGetoverrun(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + + o, err := t.IntervalTimerGetoverrun(timerID) + if err != nil { + return 0, nil, err + } + return uintptr(o), nil, nil +} + +// TimerDelete implements linux syscall timer_delete(2). +func TimerDelete(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + return 0, nil, t.IntervalTimerDelete(timerID) +} diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go new file mode 100644 index 000000000..cf49b43db --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -0,0 +1,121 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TimerfdCreate implements Linux syscall timerfd_create(2). +func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + flags := args[1].Int() + + if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { + return 0, nil, syserror.EINVAL + } + + var c ktime.Clock + switch clockID { + case linux.CLOCK_REALTIME: + c = t.Kernel().RealtimeClock() + case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: + c = t.Kernel().MonotonicClock() + default: + return 0, nil, syserror.EINVAL + } + f := timerfd.NewFile(t, c) + defer f.DecRef() + f.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&linux.TFD_NONBLOCK != 0, + }) + + fd, err := t.NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&linux.TFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// TimerfdSettime implements Linux syscall timerfd_settime(2). +func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { + return 0, nil, syserror.EINVAL + } + + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + tf, ok := f.FileOperations.(*timerfd.TimerOperations) + if !ok { + return 0, nil, syserror.EINVAL + } + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock()) + if err != nil { + return 0, nil, err + } + tm, oldS := tf.SetTime(newS) + if oldValAddr != 0 { + oldVal := ktime.ItimerspecFromSetting(tm, oldS) + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// TimerfdGettime implements Linux syscall timerfd_gettime(2). +func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + curValAddr := args[1].Pointer() + + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + tf, ok := f.FileOperations.(*timerfd.TimerOperations) + if !ok { + return 0, nil, syserror.EINVAL + } + + tm, s := tf.GetTime() + curVal := ktime.ItimerspecFromSetting(tm, s) + _, err := t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go new file mode 100644 index 000000000..b3eb96a1c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -0,0 +1,52 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build amd64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ArchPrctl implements linux syscall arch_prctl(2). +// It sets architecture-specific process or thread state for t. +func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + switch args[0].Int() { + case linux.ARCH_GET_FS: + addr := args[1].Pointer() + fsbase := t.Arch().TLS() + _, err := t.CopyOut(addr, uint64(fsbase)) + if err != nil { + return 0, nil, err + } + + case linux.ARCH_SET_FS: + fsbase := args[1].Uint64() + if !t.Arch().SetTLS(uintptr(fsbase)) { + return 0, nil, syserror.EPERM + } + + case linux.ARCH_GET_GS, linux.ARCH_SET_GS: + t.Kernel().EmitUnimplementedEvent(t) + fallthrough + default: + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go new file mode 100644 index 000000000..fb08a356e --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build arm64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ArchPrctl is not defined for ARM64. +func ArchPrctl(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go new file mode 100644 index 000000000..e9d702e8e --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -0,0 +1,95 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Uname implements linux syscall uname. +func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + version := t.SyscallTable().Version + + uts := t.UTSNamespace() + + // Fill in structure fields. + var u linux.UtsName + copy(u.Sysname[:], version.Sysname) + copy(u.Nodename[:], uts.HostName()) + copy(u.Release[:], version.Release) + copy(u.Version[:], version.Version) + // build tag above. + switch t.SyscallTable().Arch { + case arch.AMD64: + copy(u.Machine[:], "x86_64") + case arch.ARM64: + copy(u.Machine[:], "aarch64") + default: + copy(u.Machine[:], "unknown") + } + copy(u.Domainname[:], uts.DomainName()) + + // Copy out the result. + va := args[0].Pointer() + _, err := t.CopyOut(va, u) + return 0, nil, err +} + +// Setdomainname implements Linux syscall setdomainname. +func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + size := args[1].Int() + + utsns := t.UTSNamespace() + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { + return 0, nil, syserror.EPERM + } + if size < 0 || size > linux.UTSLen { + return 0, nil, syserror.EINVAL + } + + name, err := t.CopyInString(nameAddr, int(size)) + if err != nil { + return 0, nil, err + } + + utsns.SetDomainName(name) + return 0, nil, nil +} + +// Sethostname implements Linux syscall sethostname. +func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + size := args[1].Int() + + utsns := t.UTSNamespace() + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { + return 0, nil, syserror.EPERM + } + if size < 0 || size > linux.UTSLen { + return 0, nil, syserror.EINVAL + } + + name := make([]byte, size) + if _, err := t.CopyInBytes(nameAddr, name); err != nil { + return 0, nil, err + } + + utsns.SetHostName(string(name)) + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go new file mode 100644 index 000000000..6ec0de96e --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -0,0 +1,364 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +const ( + // EventMaskWrite contains events that can be triggered on writes. + // + // Note that EventHUp is not going to happen for pipes but may for + // implementations of poll on some sockets, see net/core/datagram.c. + EventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Write implements linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Pwrite64 implements linux syscall pwrite64(2). +func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Writev implements linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +// Pwritev implements linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +// Pwritev2 implements linux syscall pwritev2(2). +func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the syscall is + // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the linux internal call + // (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1354) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 5th argument. + + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := int(args[5].Int()) + + if int(args[4].Int())&0x4 == 1 { + return 0, nil, syserror.EACCES + } + + file := t.GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if offset > -1 && !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is + // accepted as a valid flag argument for pwritev2. + if flags&^linux.RWF_VALID != 0 { + return uintptr(flags), nil, syserror.EOPNOTSUPP + } + + // Check that the file is writeable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + // If pwritev2 is called with an offset of -1, writev is called. + if offset == -1 { + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) +} + +func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { + n, err := f.Writev(t, src) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return n, err + } + + // Sockets support write timeouts. + var haveDeadline bool + var deadline ktime.Time + if s, ok := f.FileOperations.(socket.Socket); ok { + dl := s.SendTimeout() + if dl < 0 && err == syserror.ErrWouldBlock { + return n, err + } + if dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + // Issue the request and break out if it completes with + // anything other than "would block". + n, err = f.Writev(t, src) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + + return total, err +} + +func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + n, err := f.Pwritev(t, src, offset) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + // Issue the request and break out if it completes with + // anything other than "would block". + n, err = f.Pwritev(t, src, offset+total) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + + return total, err +} + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go new file mode 100644 index 000000000..c24946160 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -0,0 +1,432 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChange + +// GetXattr implements linux syscall getxattr(2). +func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getXattrFromPath(t, args, true) +} + +// LGetXattr implements linux syscall lgetxattr(2). +func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getXattrFromPath(t, args, false) +} + +// FGetXattr implements linux syscall fgetxattr(2). +func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := uint64(args[3].SizeT()) + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := uint64(args[3].SizeT()) + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + n := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + n, err = getXattr(t, d, nameAddr, valueAddr, size) + return err + }) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +// getXattr implements getxattr(2) from the given *fs.Dirent. +func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64) (int, error) { + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, err + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil { + return 0, err + } + + // TODO(b/148380782): Support xattrs in namespaces other than "user". + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return 0, syserror.EOPNOTSUPP + } + + // If getxattr(2) is called with size 0, the size of the value will be + // returned successfully even if it is nonzero. In that case, we need to + // retrieve the entire attribute value so we can return the correct size. + requestedSize := size + if size == 0 || size > linux.XATTR_SIZE_MAX { + requestedSize = linux.XATTR_SIZE_MAX + } + + value, err := d.Inode.GetXattr(t, name, requestedSize) + if err != nil { + return 0, err + } + n := len(value) + if uint64(n) > requestedSize { + return 0, syserror.ERANGE + } + + // Don't copy out the attribute value if size is 0. + if size == 0 { + return n, nil + } + + if _, err = t.CopyOutBytes(valueAddr, []byte(value)); err != nil { + return 0, err + } + return n, nil +} + +// SetXattr implements linux syscall setxattr(2). +func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return setXattrFromPath(t, args, true) +} + +// LSetXattr implements linux syscall lsetxattr(2). +func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return setXattrFromPath(t, args, false) +} + +// FSetXattr implements linux syscall fsetxattr(2). +func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := uint64(args[3].SizeT()) + flags := args[4].Uint() + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags) +} + +func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := uint64(args[3].SizeT()) + flags := args[4].Uint() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags) + }) +} + +// setXattr implements setxattr(2) from the given *fs.Dirent. +func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error { + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return syserror.EINVAL + } + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + return err + } + + if size > linux.XATTR_SIZE_MAX { + return syserror.E2BIG + } + buf := make([]byte, size) + if _, err := t.CopyInBytes(valueAddr, buf); err != nil { + return err + } + value := string(buf) + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + + if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil { + return err + } + d.InotifyEvent(linux.IN_ATTRIB, 0) + return nil +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +// Restrict xattrs to regular files and directories. +// +// TODO(b/148380782): In Linux, this restriction technically only applies to +// xattrs in the "user.*" namespace. Make file type checks specific to the +// namespace once we allow other xattr prefixes. +func xattrFileTypeOk(i *fs.Inode) bool { + return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr) +} + +func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error { + // Restrict xattrs to regular files and directories. + if !xattrFileTypeOk(i) { + if perms.Write { + return syserror.EPERM + } + return syserror.ENODATA + } + + return i.CheckPermission(t, perms) +} + +// ListXattr implements linux syscall listxattr(2). +func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listXattrFromPath(t, args, true) +} + +// LListXattr implements linux syscall llistxattr(2). +func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listXattrFromPath(t, args, false) +} + +// FListXattr implements linux syscall flistxattr(2). +func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := uint64(args[2].SizeT()) + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + n, err := listXattr(t, f.Dirent, listAddr, size) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := uint64(args[2].SizeT()) + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + n := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + n, err = listXattr(t, d, listAddr, size) + return err + }) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) { + if !xattrFileTypeOk(d.Inode) { + return 0, nil + } + + // If listxattr(2) is called with size 0, the buffer size needed to contain + // the xattr list will be returned successfully even if it is nonzero. In + // that case, we need to retrieve the entire list so we can compute and + // return the correct size. + requestedSize := size + if size == 0 || size > linux.XATTR_SIZE_MAX { + requestedSize = linux.XATTR_SIZE_MAX + } + xattrs, err := d.Inode.ListXattr(t, requestedSize) + if err != nil { + return 0, err + } + + // TODO(b/148380782): support namespaces other than "user". + for x := range xattrs { + if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + delete(xattrs, x) + } + } + + listSize := xattrListSize(xattrs) + if listSize > linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + if uint64(listSize) > requestedSize { + return 0, syserror.ERANGE + } + + // Don't copy out the attributes if size is 0. + if size == 0 { + return listSize, nil + } + + buf := make([]byte, 0, listSize) + for x := range xattrs { + buf = append(buf, []byte(x)...) + buf = append(buf, 0) + } + if _, err := t.CopyOutBytes(addr, buf); err != nil { + return 0, err + } + + return len(buf), nil +} + +func xattrListSize(xattrs map[string]struct{}) int { + size := 0 + for x := range xattrs { + size += len(x) + 1 + } + return size +} + +// RemoveXattr implements linux syscall removexattr(2). +func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return removeXattrFromPath(t, args, true) +} + +// LRemoveXattr implements linux syscall lremovexattr(2). +func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return removeXattrFromPath(t, args, false) +} + +// FRemoveXattr implements linux syscall fremovexattr(2). +func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + return 0, nil, removeXattr(t, f.Dirent, nameAddr) +} + +func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + return removeXattr(t, d, nameAddr) + }) +} + +// removeXattr implements removexattr(2) from the given *fs.Dirent. +func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error { + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + return err + } + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + + if err := d.Inode.RemoveXattr(t, d, name); err != nil { + return err + } + d.InotifyEvent(linux.IN_ATTRIB, 0) + return nil +} + +// LINT.ThenChange(vfs2/xattr.go) diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go new file mode 100644 index 000000000..ddc3ee26e --- /dev/null +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// copyTimespecIn copies a Timespec from the untrusted app range to the kernel. +func copyTimespecIn(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) { + switch t.Arch().Width() { + case 8: + ts := linux.Timespec{} + in := t.CopyScratchBuffer(16) + _, err := t.CopyInBytes(addr, in) + if err != nil { + return ts, err + } + ts.Sec = int64(usermem.ByteOrder.Uint64(in[0:])) + ts.Nsec = int64(usermem.ByteOrder.Uint64(in[8:])) + return ts, nil + default: + return linux.Timespec{}, syserror.ENOSYS + } +} + +// copyTimespecOut copies a Timespec to the untrusted app range. +func copyTimespecOut(t *kernel.Task, addr usermem.Addr, ts *linux.Timespec) error { + switch t.Arch().Width() { + case 8: + out := t.CopyScratchBuffer(16) + usermem.ByteOrder.PutUint64(out[0:], uint64(ts.Sec)) + usermem.ByteOrder.PutUint64(out[8:], uint64(ts.Nsec)) + _, err := t.CopyOutBytes(addr, out) + return err + default: + return syserror.ENOSYS + } +} + +// copyTimevalIn copies a Timeval from the untrusted app range to the kernel. +func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) { + switch t.Arch().Width() { + case 8: + tv := linux.Timeval{} + in := t.CopyScratchBuffer(16) + _, err := t.CopyInBytes(addr, in) + if err != nil { + return tv, err + } + tv.Sec = int64(usermem.ByteOrder.Uint64(in[0:])) + tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:])) + return tv, nil + default: + return linux.Timeval{}, syserror.ENOSYS + } +} + +// copyTimevalOut copies a Timeval to the untrusted app range. +func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error { + switch t.Arch().Width() { + case 8: + out := t.CopyScratchBuffer(16) + usermem.ByteOrder.PutUint64(out[0:], uint64(tv.Sec)) + usermem.ByteOrder.PutUint64(out[8:], uint64(tv.Usec)) + _, err := t.CopyOutBytes(addr, out) + return err + default: + return syserror.ENOSYS + } +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timespecAddr != 0 { + timespec, err := copyTimespecIn(t, timespecAddr) + if err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syserror.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD new file mode 100644 index 000000000..0c740335b --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -0,0 +1,76 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "vfs2", + srcs = [ + "aio.go", + "epoll.go", + "eventfd.go", + "execve.go", + "fd.go", + "filesystem.go", + "fscontext.go", + "getdents.go", + "inotify.go", + "ioctl.go", + "lock.go", + "memfd.go", + "mmap.go", + "mount.go", + "path.go", + "pipe.go", + "poll.go", + "read_write.go", + "setstat.go", + "signal.go", + "socket.go", + "splice.go", + "stat.go", + "stat_amd64.go", + "stat_arm64.go", + "sync.go", + "timerfd.go", + "vfs2.go", + "xattr.go", + ], + marshal = True, + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/bits", + "//pkg/context", + "//pkg/fspath", + "//pkg/gohacks", + "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/eventfd", + "//pkg/sentry/fsimpl/pipefs", + "//pkg/sentry/fsimpl/signalfd", + "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/fasync", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/syscalls", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go new file mode 100644 index 000000000..e5cdefc50 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -0,0 +1,216 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/mm" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb linux.IOCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { + if cb.Reserved2 != 0 { + return syserror.EINVAL + } + + fd := t.GetFileVFS2(cb.FD) + if fd == nil { + return syserror.EBADF + } + defer fd.DecRef() + + // Was there an eventFD? Extract it. + var eventFD *vfs.FileDescription + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { + eventFD = t.GetFileVFS2(cb.ResFD) + if eventFD == nil { + return syserror.EBADF + } + defer eventFD.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Check offset for reads/writes. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + + // Prepare the request. + aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := aioCtx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFD != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFD.IncRef() + } + + // Perform the request asynchronously. + fd.IncRef() + t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) + return nil +} + +func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { + return func(ctx context.Context) { + if aioCtx.Dead() { + aioCtx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: + err = fd.Sync(ctx) + } + + // Update the result. + if err != nil { + err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } + + fd.DecRef() + + // Queue the result for delivery. + aioCtx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFD != nil { + eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) + eventFD.DecRef() + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go new file mode 100644 index 000000000..34c90ae3e --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -0,0 +1,228 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() + +// EpollCreate1 implements Linux syscall epoll_create1(2). +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^linux.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCreate implements Linux syscall epoll_create(2). +func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + // "Since Linux 2.6.8, the size argument is ignored, but must be greater + // than zero" - epoll_create(2) + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCtl implements Linux syscall epoll_ctl(2). +func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + op := args[1].Int() + fd := args[2].Int() + eventAddr := args[3].Pointer() + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + if epfile == file { + return 0, nil, syserror.EINVAL + } + + var event linux.EpollEvent + switch op { + case linux.EPOLL_CTL_ADD: + if _, err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.AddInterest(file, fd, event) + case linux.EPOLL_CTL_DEL: + return 0, nil, ep.DeleteInterest(file, fd) + case linux.EPOLL_CTL_MOD: + if _, err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.ModifyInterest(file, fd, event) + default: + return 0, nil, syserror.EINVAL + } +} + +// EpollWait implements Linux syscall epoll_wait(2). +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS + if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { + return 0, nil, syserror.EINVAL + } + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + + // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent, + // maxEvents), so that the buffer can be allocated on the stack. + var ( + events [16]linux.EpollEvent + total int + ch chan struct{} + haveDeadline bool + deadline ktime.Time + ) + for { + batchEvents := len(events) + if batchEvents > maxEvents { + batchEvents = maxEvents + } + n := ep.ReadEvents(events[:batchEvents]) + maxEvents -= n + if n != 0 { + // Copy what we read out. + copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n]) + copiedEvents := copiedBytes / sizeofEpollEvent // rounded down + eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) + total += copiedEvents + if err != nil { + if total != 0 { + return uintptr(total), nil, nil + } + return 0, nil, err + } + // If we've filled the application's event buffer, we're done. + if maxEvents == 0 { + return uintptr(total), nil, nil + } + // Loop if we read a full batch, under the expectation that there + // may be more events to read. + if n == batchEvents { + continue + } + } + // We get here if n != batchEvents. If we read any number of events + // (just now, or in a previous iteration of this loop), or if timeout + // is 0 (such that epoll_wait should be non-blocking), return the + // events we've read so far to the application. + if total != 0 || timeout == 0 { + return uintptr(total), nil, nil + } + // In the first iteration of this loop, register with the epoll + // instance for readability events, but then immediately continue the + // loop since we need to retry ReadEvents() before blocking. In all + // subsequent iterations, block until events are available, the timeout + // expires, or an interrupt arrives. + if ch == nil { + var w waiter.Entry + w, ch = waiter.NewChannelEntry(nil) + epfile.EventRegister(&w, waiter.EventIn) + defer epfile.EventUnregister(&w) + } else { + // Set up the timer if a timeout was specified. + if timeout > 0 && !haveDeadline { + timeoutDur := time.Duration(timeout) * time.Millisecond + deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) + haveDeadline = true + } + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + // total must be 0 since otherwise we would have returned + // above. + return 0, nil, err + } + } + } +} + +// EpollPwait implements Linux syscall epoll_pwait(2). +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + return EpollWait(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go new file mode 100644 index 000000000..aff1a2070 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -0,0 +1,61 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Eventfd2 implements linux syscall eventfd2(2). +func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + initVal := uint64(args[0].Uint()) + flags := uint(args[1].Uint()) + allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) + + if flags & ^allOps != 0 { + return 0, nil, syserror.EINVAL + } + + vfsObj := t.Kernel().VFS() + fileFlags := uint32(linux.O_RDWR) + if flags&linux.EFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + semMode := flags&linux.EFD_SEMAPHORE != 0 + eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) + if err != nil { + return 0, nil, err + } + defer eventfd.DecRef() + + fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ + CloseOnExec: flags&linux.EFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// Eventfd implements linux syscall eventfd(2). +func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[1].Value = 0 + return Eventfd2(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go new file mode 100644 index 000000000..aef0078a8 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -0,0 +1,137 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/loader" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathnameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */) +} + +// Execveat implements linux syscall execveat(2). +func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathnameAddr := args[1].Pointer() + argvAddr := args[2].Pointer() + envvAddr := args[3].Pointer() + flags := args[4].Int() + return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags) +} + +func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) + if err != nil { + return 0, nil, err + } + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + var executable fsbridge.File + closeOnExec := false + if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { + // We must open the executable ourselves since dirfd is used as the + // starting point while resolving path, but the task working directory + // is used as the starting point while resolving interpreters (Linux: + // fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() => + // do_open_execat(fd=AT_FDCWD)), and the loader package is currently + // incapable of handling this correctly. + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + start := dirfile.VirtualDentry() + start.IncRef() + dirfile.DecRef() + closeOnExec = dirfileFlags.CloseOnExec + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + }) + start.DecRef() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + executable = fsbridge.NewVFSFile(file) + } + + // Load the new TaskContext. + mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change + defer mntns.DecRef() + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + remainingTraversals := uint(linux.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Opener: fsbridge.NewVFSLookup(mntns, root, wd), + RemainingTraversals: &remainingTraversals, + ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + Filename: pathname, + File: executable, + CloseOnExec: closeOnExec, + Argv: argv, + Envv: envv, + Features: t.Arch().FeatureSet(), + } + + tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + if se != nil { + return 0, nil, se.ToError() + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go new file mode 100644 index 000000000..517394ba9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -0,0 +1,355 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Close implements Linux syscall close(2). +func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + _, file := t.FDTable().Remove(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.OnClose(t) + return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) +} + +// Dup implements Linux syscall dup(2). +func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, syserror.EMFILE + } + return uintptr(newFD), nil, nil +} + +// Dup2 implements Linux syscall dup2(2). +func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + + if oldfd == newfd { + // As long as oldfd is valid, dup2() does nothing and returns newfd. + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + file.DecRef() + return uintptr(newfd), nil, nil + } + + return dup3(t, oldfd, newfd, 0) +} + +// Dup3 implements Linux syscall dup3(2). +func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + return dup3(t, oldfd, newfd, flags) +} + +func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { + if flags&^linux.O_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(newfd), nil, nil +} + +// Fcntl implements linux syscall fcntl(2). +func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + cmd := args[1].Int() + + file, flags := t.FDTable().GetVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: + minfd := args[2].Int() + fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case linux.F_GETFD: + return uintptr(flags.ToLinuxFDFlags()), nil, nil + case linux.F_SETFD: + flags := args[2].Uint() + err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: flags&linux.FD_CLOEXEC != 0, + }) + return 0, nil, err + case linux.F_GETFL: + return uintptr(file.StatusFlags()), nil, nil + case linux.F_SETFL: + return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + case linux.F_SETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + n, err := pipefile.SetPipeSize(int64(args[2].Int())) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil + case linux.F_GETOWN: + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { + return 0, nil, nil + } + if owner.Type == linux.F_OWNER_PGRP { + return uintptr(-owner.PID), nil, nil + } + return uintptr(owner.PID), nil, nil + case linux.F_SETOWN: + who := args[2].Int() + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + return 0, nil, setAsyncOwner(t, file, ownerType, who) + case linux.F_GETOWN_EX: + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { + return 0, nil, nil + } + _, err := t.CopyOut(args[2].Pointer(), &owner) + return 0, nil, err + case linux.F_SETOWN_EX: + var owner linux.FOwnerEx + n, err := t.CopyIn(args[2].Pointer(), &owner) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, setAsyncOwner(t, file, owner.Type, owner.PID) + case linux.F_GETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + return uintptr(pipefile.PipeSize()), nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.IsWritable() { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file, args[2].Uint()) + return 0, nil, err + case linux.F_SETLK, linux.F_SETLKW: + return 0, nil, posixLock(t, args, file, cmd) + default: + // TODO(gvisor.dev/issue/2920): Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } +} + +func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { + a := fd.AsyncHandler() + if a == nil { + return linux.FOwnerEx{}, false + } + + ot, otg, opg := a.(*fasync.FileAsync).Owner() + switch { + case ot != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_TID, + PID: int32(t.PIDNamespace().IDOfTask(ot)), + }, true + case otg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PID, + PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), + }, true + case opg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PGRP, + PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), + }, true + default: + return linux.FOwnerEx{}, true + } +} + +func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error { + switch ownerType { + case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: + // Acceptable type. + default: + return syserror.EINVAL + } + + a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + if pid == 0 { + a.ClearOwner() + return nil + } + + switch ownerType { + case linux.F_OWNER_TID: + task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) + if task == nil { + return syserror.ESRCH + } + a.SetOwnerTask(t, task) + return nil + case linux.F_OWNER_PID: + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) + if tg == nil { + return syserror.ESRCH + } + a.SetOwnerThreadGroup(t, tg) + return nil + case linux.F_OWNER_PGRP: + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) + if pg == nil { + return syserror.ESRCH + } + a.SetOwnerProcessGroup(t, pg) + return nil + default: + return syserror.EINVAL + } +} + +func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := t.CopyIn(flockAddr, &flock); err != nil { + return err + } + + var blocker lock.Blocker + if cmd == linux.F_SETLKW { + blocker = t + } + + switch flock.Type { + case linux.F_RDLCK: + if !file.IsReadable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_WRLCK: + if !file.IsWritable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_UNLCK: + return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence) + + default: + return syserror.EINVAL + } +} + +// Fadvise64 implements fadvise64(2). +// This implementation currently ignores the provided advice. +func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[2].Int64() + advice := args[3].Int() + + // Note: offset is allowed to be negative. + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // If the FD refers to a pipe or FIFO, return error. + if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { + return 0, nil, syserror.ESPIPE + } + + switch advice { + case linux.POSIX_FADV_NORMAL: + case linux.POSIX_FADV_RANDOM: + case linux.POSIX_FADV_SEQUENTIAL: + case linux.POSIX_FADV_WILLNEED: + case linux.POSIX_FADV_DONTNEED: + case linux.POSIX_FADV_NOREUSE: + default: + return 0, nil, syserror.EINVAL + } + + // Sure, whatever. + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go new file mode 100644 index 000000000..6b14c2bef --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -0,0 +1,384 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Link implements Linux syscall link(2). +func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Linkat implements Linux syscall linkat(2). +func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Int() + return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { + return syserror.EINVAL + } + if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return syserror.ENOENT + } + + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) +} + +// Mkdir implements Linux syscall mkdir(2). +func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements Linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, mkdirat(t, dirfd, addr, mode) +} + +func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ + Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) +} + +// Mknod implements Linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + dev := args[2].Uint() + return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) +} + +// Mknodat implements Linux syscall mknodat(2). +func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + dev := args[3].Uint() + return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) +} + +func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + if mode.FileType() == 0 { + mode |= linux.ModeRegular + } + major, minor := linux.DecodeDeviceID(dev) + return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ + Mode: mode &^ linux.FileMode(t.FSContext().Umask()), + DevMajor: uint32(major), + DevMinor: minor, + }) +} + +// Open implements Linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + mode := args[2].ModeT() + return openat(t, linux.AT_FDCWD, addr, flags, mode) +} + +// Openat implements Linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + flags := args[2].Uint() + mode := args[3].ModeT() + return openat(t, dirfd, addr, flags, mode) +} + +// Creat implements Linux syscall creat(2). +func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) +} + +func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ + Flags: flags | linux.O_LARGEFILE, + Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + return uintptr(fd), nil, err +} + +// Rename implements Linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Renameat implements Linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) +} + +// Renameat2 implements Linux syscall renameat2(2). +func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Uint() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error { + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ + Flags: flags, + }) +} + +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length)) + + // File length modified, generate notification. + // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. + // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) +} + +// Rmdir implements Linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) +} + +func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) +} + +// Unlink implements Linux syscall unlink(2). +func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) +} + +func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) +} + +// Unlinkat implements Linux syscall unlinkat(2). +func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + + if flags&^linux.AT_REMOVEDIR != 0 { + return 0, nil, syserror.EINVAL + } + + if flags&linux.AT_REMOVEDIR != 0 { + return 0, nil, rmdirat(t, dirfd, pathAddr) + } + return 0, nil, unlinkat(t, dirfd, pathAddr) +} + +// Symlink implements Linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + linkpathAddr := args[1].Pointer() + return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) +} + +// Symlinkat implements Linux syscall symlinkat(2). +func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + newdirfd := args[1].Int() + linkpathAddr := args[2].Pointer() + return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) +} + +func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error { + target, err := t.CopyInString(targetAddr, linux.PATH_MAX) + if err != nil { + return err + } + if len(target) == 0 { + return syserror.ENOENT + } + linkpath, err := copyInPath(t, linkpathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go new file mode 100644 index 000000000..317409a18 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getcwd implements Linux syscall getcwd(2). +func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + size := args[1].SizeT() + + root := t.FSContext().RootDirectoryVFS2() + wd := t.FSContext().WorkingDirectoryVFS2() + s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) + root.DecRef() + wd.DecRef() + if err != nil { + return 0, nil, err + } + + // Note this is >= because we need a terminator. + if uint(len(s)) >= size { + return 0, nil, syserror.ERANGE + } + + // Construct a byte slice containing a NUL terminator. + buf := t.CopyScratchBuffer(len(s) + 1) + copy(buf, s) + buf[len(buf)-1] = 0 + + // Write the pathname slice. + n, err := t.CopyOutBytes(addr, buf) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Chdir implements Linux syscall chdir(2). +func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Fchdir implements Linux syscall fchdir(2). +func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Chroot implements Linux syscall chroot(2). +func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + if !t.HasCapability(linux.CAP_SYS_CHROOT) { + return 0, nil, syserror.EPERM + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetRootDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go new file mode 100644 index 000000000..c7c7bf7ce --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -0,0 +1,161 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Getdents implements Linux syscall getdents(2). +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, false /* isGetdents64 */) +} + +// Getdents64 implements Linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, true /* isGetdents64 */) +} + +func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := int(args[2].Uint()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + cb := getGetdentsCallback(t, addr, size, isGetdents64) + err := file.IterDirents(t, cb) + n := size - cb.remaining + putGetdentsCallback(cb) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +type getdentsCallback struct { + t *kernel.Task + addr usermem.Addr + remaining int + isGetdents64 bool +} + +var getdentsCallbackPool = sync.Pool{ + New: func() interface{} { + return &getdentsCallback{} + }, +} + +func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback { + cb := getdentsCallbackPool.Get().(*getdentsCallback) + *cb = getdentsCallback{ + t: t, + addr: addr, + remaining: size, + isGetdents64: isGetdents64, + } + return cb +} + +func putGetdentsCallback(cb *getdentsCallback) { + cb.t = nil + getdentsCallbackPool.Put(cb) +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { + var buf []byte + if cb.isGetdents64 { + // struct linux_dirent64 { + // ino64_t d_ino; /* 64-bit inode number */ + // off64_t d_off; /* 64-bit offset to next structure */ + // unsigned short d_reclen; /* Size of this dirent */ + // unsigned char d_type; /* File type */ + // char d_name[]; /* Filename (null-terminated) */ + // }; + size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) + size = (size + 7) &^ 7 // round up to multiple of 8 + if size > cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + buf[18] = dirent.Type + copy(buf[19:], dirent.Name) + // Zero out all remaining bytes in buf, including the NUL terminator + // after dirent.Name. + bufTail := buf[19+len(dirent.Name):] + for i := range bufTail { + bufTail[i] = 0 + } + } else { + // struct linux_dirent { + // unsigned long d_ino; /* Inode number */ + // unsigned long d_off; /* Offset to next linux_dirent */ + // unsigned short d_reclen; /* Length of this linux_dirent */ + // char d_name[]; /* Filename (null-terminated) */ + // /* length is actually (d_reclen - 2 - + // offsetof(struct linux_dirent, d_name)) */ + // /* + // char pad; // Zero padding byte + // char d_type; // File type (only since Linux + // // 2.6.4); offset is (d_reclen - 1) + // */ + // }; + if cb.t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) + } + size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) + size = (size + 7) &^ 7 // round up to multiple of sizeof(long) + if size > cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + copy(buf[18:], dirent.Name) + // Zero out all remaining bytes in buf, including the NUL terminator + // after dirent.Name and the zero padding byte between the name and + // dirent type. + bufTail := buf[18+len(dirent.Name) : size-1] + for i := range bufTail { + bufTail[i] = 0 + } + buf[size-1] = dirent.Type + } + n, err := cb.t.CopyOutBytes(cb.addr, buf) + if err != nil { + // Don't report partially-written dirents by advancing cb.addr or + // cb.remaining. + return err + } + cb.addr += usermem.Addr(n) + cb.remaining -= n + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go new file mode 100644 index 000000000..5d98134a5 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -0,0 +1,137 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC + +// InotifyInit1 implements the inotify_init1() syscalls. +func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^allFlags != 0 { + return 0, nil, syserror.EINVAL + } + + ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) + if err != nil { + return 0, nil, err + } + defer ino.DecRef() + + fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ + CloseOnExec: flags&linux.IN_CLOEXEC != 0, + }) + + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// InotifyInit implements the inotify_init() syscalls. +func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[0].Value = 0 + return InotifyInit1(t, args) +} + +// fdToInotify resolves an fd to an inotify object. If successful, the file will +// have an extra ref and the caller is responsible for releasing the ref. +func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) { + f := t.GetFileVFS2(fd) + if f == nil { + // Invalid fd. + return nil, nil, syserror.EBADF + } + + ino, ok := f.Impl().(*vfs.Inotify) + if !ok { + // Not an inotify fd. + f.DecRef() + return nil, nil, syserror.EINVAL + } + + return ino, f, nil +} + +// InotifyAddWatch implements the inotify_add_watch() syscall. +func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + mask := args[2].Uint() + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if mask&linux.ALL_INOTIFY_BITS == 0 { + return 0, nil, syserror.EINVAL + } + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." + // -- inotify(7) + follow := followFinalSymlink + if mask&linux.IN_DONT_FOLLOW == 0 { + follow = nofollowFinalSymlink + } + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + if mask&linux.IN_ONLYDIR != 0 { + path.Dir = true + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) + if err != nil { + return 0, nil, err + } + defer d.DecRef() + + fd, err = ino.AddWatch(d.Dentry(), mask) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + wd := args[1].Int() + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go new file mode 100644 index 000000000..fd6ab94b2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -0,0 +1,107 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Ioctl implements Linux syscall ioctl(2). +func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Handle ioctls that apply to all FDs. + switch args[1].Int() { + case linux.FIONCLEX: + t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: false, + }) + return 0, nil, nil + + case linux.FIOCLEX: + t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: true, + }) + return 0, nil, nil + + case linux.FIONBIO: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_NONBLOCK + } else { + flags &^= linux.O_NONBLOCK + } + return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) + + case linux.FIOASYNC: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_ASYNC + } else { + flags &^= linux.O_ASYNC + } + file.SetStatusFlags(t, t.Credentials(), flags) + return 0, nil, nil + + case linux.FIOGETOWN, linux.SIOCGPGRP: + var who int32 + owner, hasOwner := getAsyncOwner(t, file) + if hasOwner { + if owner.Type == linux.F_OWNER_PGRP { + who = -owner.PID + } else { + who = owner.PID + } + } + _, err := t.CopyOut(args[2].Pointer(), &who) + return 0, nil, err + + case linux.FIOSETOWN, linux.SIOCSPGRP: + var who int32 + if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil { + return 0, nil, err + } + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + return 0, nil, setAsyncOwner(t, file, ownerType, who) + } + + ret, err := file.Ioctl(t, t.MemoryManager(), args) + return ret, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go new file mode 100644 index 000000000..bf19028c4 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -0,0 +1,64 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + operation := args[1].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + var blocker lock.Blocker + if !nonblocking { + blocker = t + } + + switch operation { + case linux.LOCK_EX: + if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_SH: + if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_UN: + if err := file.UnlockBSD(t); err != nil { + return 0, nil, err + } + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go new file mode 100644 index 000000000..bbe248d17 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + memfdPrefix = "memfd:" + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. + return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, memfdMaxNameLen) + if err != nil { + return 0, nil, err + } + + shmMount := t.Kernel().ShmMount() + file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go new file mode 100644 index 000000000..60a43f0a0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -0,0 +1,92 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mmap implements Linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := args[4].Int() + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // mmap unconditionally requires that the FD is readable. + if !file.IsReadable() { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !file.IsWritable() { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go new file mode 100644 index 000000000..adeaa39cc --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -0,0 +1,145 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mount implements Linux syscall mount(2). +func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + // For null-terminated strings related to mount(2), Linux copies in at most + // a page worth of data. See fs/namespace.c:copy_mount_string(). + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + source, err := t.CopyInString(sourceAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + targetPath, err := copyInPath(t, targetAddr) + if err != nil { + return 0, nil, err + } + + data := "" + if dataAddr != 0 { + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err = t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODEV | + linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + var opts vfs.MountOptions + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + opts.Flags.NoATime = true + } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + opts.Flags.NoExec = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + opts.ReadOnly = true + } + opts.GetFilesystemOptions.Data = data + + target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer target.Release() + + return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + // + // Currently, this is always the init task's user namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + opts := vfs.UmountOptions{ + Flags: uint32(flags), + } + + return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go new file mode 100644 index 000000000..97da6c647 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -0,0 +1,94 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) { + pathname, err := t.CopyInString(addr, linux.PATH_MAX) + if err != nil { + return fspath.Path{}, err + } + return fspath.Parse(pathname), nil +} + +type taskPathOperation struct { + pop vfs.PathOperation + haveStartRef bool +} + +func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) { + root := t.FSContext().RootDirectoryVFS2() + start := root + haveStartRef := false + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + root.DecRef() + return taskPathOperation{}, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + haveStartRef = true + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + root.DecRef() + return taskPathOperation{}, syserror.EBADF + } + start = dirfile.VirtualDentry() + start.IncRef() + haveStartRef = true + dirfile.DecRef() + } + } + return taskPathOperation{ + pop: vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, + haveStartRef: haveStartRef, + }, nil +} + +func (tpop *taskPathOperation) Release() { + tpop.pop.Root.DecRef() + if tpop.haveStartRef { + tpop.pop.Start.DecRef() + tpop.haveStartRef = false + } +} + +type shouldAllowEmptyPath bool + +const ( + disallowEmptyPath shouldAllowEmptyPath = false + allowEmptyPath shouldAllowEmptyPath = true +) + +type shouldFollowFinalSymlink bool + +const ( + nofollowFinalSymlink shouldFollowFinalSymlink = false + followFinalSymlink shouldFollowFinalSymlink = true +) diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go new file mode 100644 index 000000000..4a01e4209 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Pipe implements Linux syscall pipe(2). +func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + return 0, nil, pipe2(t, addr, 0) +} + +// Pipe2 implements Linux syscall pipe2(2). +func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + return 0, nil, pipe2(t, addr, flags) +} + +func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { + if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { + return syserror.EINVAL + } + r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) + defer r.DecRef() + defer w.DecRef() + + fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return err + } + if _, err := t.CopyOut(addr, fds); err != nil { + for _, fd := range fds { + if _, file := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return err + } + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go new file mode 100644 index 000000000..ff1b25d7b --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -0,0 +1,586 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. This has no +// equivalent in Linux; it exists in gVisor since allocation failure in Go is +// unrecoverable. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = linux.POLLOUT | linux.POLLERR + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. + selectExceptEvents = linux.POLLPRI +) + +// pollState tracks the associated file description and waiter of a PollFD. +type pollState struct { + file *vfs.FileDescription + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.GetFileVFS2(pfd.FD) + if file == nil { + pfd.REvents = linux.POLLNVAL + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) + } + + r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) + pfd.REvents = int16(r.ToLinux()) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. + state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + initReadiness(t, &pfd[i], &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + haveTimeout := timeout >= 0 + + for n == 0 { + var err error + // Wait for a notification. + timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) + if err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) + rl := int16(r.ToLinux()) & pfd[i].Events + if rl != 0 { + pfd[i].REvents = rl + n++ + } + } + } + + return timeout, n, nil +} + +// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. +func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return nil, syserror.EINVAL + } + + pfd := make([]linux.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(addr, &pfd); err != nil { + return nil, err + } + } + + return pfd, nil +} + +func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + pfd, err := copyInPollFDs(t, addr, nfds) + if err != nil { + return timeout, 0, err + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) + for i := range pfd { + pfd[i].Events |= linux.POLLHUP | linux.POLLERR + } + remainingTimeout, n, err := pollBlock(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(addr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || nfds > fileCap { + return 0, syserror.EINVAL + } + + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < nBytes; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. + pfd := make([]linux.PollFD, 0, fdCount) + var fd int32 + for i := 0; i < nBytes; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + var mask int16 + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, linux.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + if _, _, err = pollBlock(t, pfd, timeout); err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. + bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. + if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + _, err := tsRemaining.CopyOut(t, timespecAddr) + return err +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + _, err := tvRemaining.CopyOut(t, timevalAddr) + return err +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +// +// +stateify savable +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timevalAddr != 0 { + var timeval linux.Timeval + if _, err := timeval.CopyIn(t, timevalAddr); err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + if t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) + } + var maskStruct sigSetWithSize + if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { + return 0, nil, err + } + if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { + return 0, nil, err + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// +marshal +type sigSetWithSize struct { + sigsetAddr uint64 + sizeofSigset uint64 +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timespecAddr != 0 { + var timespec linux.Timespec + if _, err := timespec.CopyIn(t, timespecAddr); err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syserror.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} + +func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error { + if maskAddr == 0 { + return nil + } + if maskSize != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := mask.CopyIn(t, maskAddr); err != nil { + return err + } + mask &^= kernel.UnblockableSignals + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go new file mode 100644 index 000000000..cd25597a7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -0,0 +1,641 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + eventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr + eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Read implements Linux syscall read(2). +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Readv implements Linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + file.EventUnregister(&w) + + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return total, err +} + +// Pread64 implements Linux syscall pread64(2). +func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Preadv implements Linux syscall preadv(2). +func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +// Preadv2 implements Linux syscall preadv2(2). +func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.ReadOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = read(t, file, dst, opts) + } else { + n, err = pread(t, file, dst, offset, opts) + } + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) +} + +func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + n, err := file.PRead(t, dst, offset, opts) + if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = file.PRead(t, dst, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + file.EventUnregister(&w) + + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return total, err +} + +// Write implements Linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Writev implements Linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := file.Write(t, src, opts) + if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = file.Write(t, src, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + file.EventUnregister(&w) + + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } + return total, err +} + +// Pwrite64 implements Linux syscall pwrite64(2). +func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Pwritev implements Linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +// Pwritev2 implements Linux syscall pwritev2(2). +func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.WriteOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = write(t, file, src, opts) + } else { + n, err = pwrite(t, file, src, offset, opts) + } + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) +} + +func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, err := file.PWrite(t, src, offset, opts) + if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = file.PWrite(t, src, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + file.EventUnregister(&w) + + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } + return total, err +} + +func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) { + if file.StatusFlags()&linux.O_NONBLOCK != 0 { + return false, ktime.Time{}, false + } + // Sockets support read/write timeouts. + if s, ok := file.Impl().(socket.SocketVFS2); ok { + dl := s.RecvTimeout() + if dl < 0 { + return false, ktime.Time{}, false + } + if dl > 0 { + return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true + } + } + return true, ktime.Time{}, false +} + +// Lseek implements Linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + whence := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newoff, err := file.Seek(t, offset, whence) + return uintptr(newoff), nil, err +} + +// Readahead implements readahead(2). +func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.IsReadable() { + return 0, nil, syserror.EBADF + } + + // Check that the size is valid. + if int(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Return EINVAL; if the underlying file type does not support readahead, + // then Linux will return EINVAL to indicate as much. In the future, we + // may extend this function to actually support readahead hints. + return 0, nil, syserror.EINVAL +} diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go new file mode 100644 index 000000000..09ecfed26 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -0,0 +1,428 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + +// Chmod implements Linux syscall chmod(2). +func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) +} + +// Fchmodat implements Linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, fchmodat(t, dirfd, pathAddr, mode) +} + +func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Fchmod implements Linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].ModeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Chown implements Linux syscall chown(2). +func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) +} + +// Lchown implements Linux syscall lchown(2). +func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + owner := args[2].Int() + group := args[3].Int() + flags := args[4].Int() + return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) +} + +func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return err + } + + return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) +} + +func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { + userns := t.UserNamespace() + if owner != -1 { + kuid := userns.MapToKUID(auth.UID(owner)) + if !kuid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_UID + opts.Stat.UID = uint32(kuid) + } + if group != -1 { + kgid := userns.MapToKGID(auth.GID(group)) + if !kgid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_GID + opts.Stat.GID = uint32(kgid) + } + return nil +} + +// Fchown implements Linux syscall fchown(2). +func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + owner := args[1].Int() + group := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return 0, nil, err + } + return 0, nil, file.SetStat(t, opts) +} + +// Truncate implements Linux syscall truncate(2). +func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + + err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) + return 0, nil, handleSetSizeError(t, err) +} + +// Ftruncate implements Linux syscall ftruncate(2). +func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) + return 0, nil, handleSetSizeError(t, err) +} + +// Utime implements Linux syscall utime(2). +func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times linux.Utime + if _, err := times.CopyIn(t, timesAddr); err != nil { + return 0, nil, err + } + opts.Stat.Atime.Sec = times.Actime + opts.Stat.Mtime.Sec = times.Modtime + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimes implements Linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Futimesat implements Linux syscall futimesat(2). +func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + + // "If filename is NULL and dfd refers to an open file, then operate on the + // file. Otherwise look up filename, possibly using dfd as a starting + // point." - fs/utimes.c + var path fspath.Path + shouldAllowEmptyPath := allowEmptyPath + if dirfd == linux.AT_FDCWD || pathAddr != 0 { + var err error + path, err = copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + shouldAllowEmptyPath = disallowEmptyPath + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts) +} + +func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil + } + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { + return syserror.EINVAL + } + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Usec * 1000), + } + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Usec * 1000), + } + return nil +} + +// Utimensat implements Linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + // Linux requires that the UTIME_OMIT check occur before checking path or + // flags. + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + if opts.Stat.Mask == 0 { + return 0, nil, nil + } + + if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { + return 0, nil, syserror.EINVAL + } + + // "If filename is NULL and dfd refers to an open file, then operate on the + // file. Otherwise look up filename, possibly using dfd as a starting + // point." - fs/utimes.c + var path fspath.Path + shouldAllowEmptyPath := allowEmptyPath + if dirfd == linux.AT_FDCWD || pathAddr != 0 { + var err error + path, err = copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + shouldAllowEmptyPath = disallowEmptyPath + } + + return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) +} + +func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil + } + var times [2]linux.Timespec + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Nsec != linux.UTIME_OMIT { + if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_ATIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Nsec), + } + } + if times[1].Nsec != linux.UTIME_OMIT { + if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_MTIME + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Nsec), + } + } + return nil +} + +func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.SetStat() instead of + // VirtualFilesystem.SetStatAt(), since the former may be able + // to use opened file state to expedite the SetStat. + err := dirfile.SetStat(t, *opts) + dirfile.DecRef() + return err + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, opts) +} + +func handleSetSizeError(t *kernel.Task, err error) error { + if err == syserror.ErrExceedsFileSizeLimit { + // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). + t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) + return syserror.EFBIG + } + return err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go new file mode 100644 index 000000000..623992f6f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// sharedSignalfd is shared between the two calls. +func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { + // Copy in the signal mask. + mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + // Always check for valid flags, even if not creating. + if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Is this a change to an existing signalfd? + // + // The spec indicates that this should adjust the mask. + if fd != -1 { + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is this a signalfd? + if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { + sfd.SetMask(mask) + return 0, nil, nil + } + + // Not a signalfd. + return 0, nil, syserror.EINVAL + } + + fileFlags := uint32(linux.O_RDWR) + if flags&linux.SFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + + // Create a new file. + vfsObj := t.Kernel().VFS() + file, err := signalfd.New(vfsObj, t, mask, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + // Create a new descriptor. + fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.SFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + // Done. + return uintptr(fd), nil, nil +} + +// Signalfd implements the linux syscall signalfd(2). +func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + return sharedSignalfd(t, fd, sigset, sigsetsize, 0) +} + +// Signalfd4 implements the linux syscall signalfd4(2). +func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + flags := args[3].Int() + return sharedSignalfd(t, fd, sigset, sigsetsize, flags) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go new file mode 100644 index 000000000..10b668477 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -0,0 +1,1139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// minListenBacklog is the minimum reasonable backlog for listening sockets. +const minListenBacklog = 8 + +// maxListenBacklog is the maximum allowed backlog for listening sockets. +const maxListenBacklog = 1024 + +// maxAddrLen is the maximum socket address length we're willing to accept. +const maxAddrLen = 200 + +// maxOptLen is the maximum sockopt parameter length we're willing to accept. +const maxOptLen = 1024 * 8 + +// maxControlLen is the maximum length of the msghdr.msg_control buffer we're +// willing to accept. Note that this limit is smaller than Linux, which allows +// buffers upto INT_MAX. +const maxControlLen = 10 * 1024 * 1024 + +// nameLenOffset is the offset from the start of the MessageHeader64 struct to +// the NameLen field. +const nameLenOffset = 8 + +// controlLenOffset is the offset form the start of the MessageHeader64 struct +// to the ControlLen field. +const controlLenOffset = 40 + +// flagsOffset is the offset form the start of the MessageHeader64 struct +// to the Flags field. +const flagsOffset = 48 + +const sizeOfInt32 = 4 + +// messageHeader64Len is the length of a MessageHeader64 struct. +var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) + +// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. +var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) + +// baseRecvFlags are the flags that are accepted across recvmsg(2), +// recvmmsg(2), and recvfrom(2). +const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC + +// MessageHeader64 is the 64-bit representation of the msghdr struct used in +// the recvmsg and sendmsg syscalls. +type MessageHeader64 struct { + // Name is the optional pointer to a network address buffer. + Name uint64 + + // NameLen is the length of the buffer pointed to by Name. + NameLen uint32 + _ uint32 + + // Iov is a pointer to an array of io vectors that describe the memory + // locations involved in the io operation. + Iov uint64 + + // IovLen is the length of the array pointed to by Iov. + IovLen uint64 + + // Control is the optional pointer to ancillary control data. + Control uint64 + + // ControlLen is the length of the data pointed to by Control. + ControlLen uint64 + + // Flags on the sent/received message. + Flags int32 + _ int32 +} + +// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in +// the recvmmsg and sendmmsg syscalls. +type multipleMessageHeader64 struct { + msgHdr MessageHeader64 + msgLen uint32 + _ int32 +} + +// CopyInMessageHeader64 copies a message header from user to kernel memory. +func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { + b := t.CopyScratchBuffer(52) + if _, err := t.CopyInBytes(addr, b); err != nil { + return err + } + + msg.Name = usermem.ByteOrder.Uint64(b[0:]) + msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) + msg.Iov = usermem.ByteOrder.Uint64(b[16:]) + msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) + msg.Control = usermem.ByteOrder.Uint64(b[32:]) + msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) + msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) + + return nil +} + +// CaptureAddress allocates memory for and copies a socket address structure +// from the untrusted address space range. +func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { + if addrlen > maxAddrLen { + return nil, syserror.EINVAL + } + + addrBuf := make([]byte, addrlen) + if _, err := t.CopyInBytes(addr, addrBuf); err != nil { + return nil, err + } + + return addrBuf, nil +} + +// writeAddress writes a sockaddr structure and its length to an output buffer +// in the unstrusted address space range. If the address is bigger than the +// buffer, it is truncated. +func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { + // Get the buffer length. + var bufLen uint32 + if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + return err + } + + if int32(bufLen) < 0 { + return syserror.EINVAL + } + + // Write the length unconditionally. + if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + return err + } + + if addr == nil { + return nil + } + + if bufLen > addrLen { + bufLen = addrLen + } + + // Copy as much of the address as will fit in the buffer. + encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + if bufLen > uint32(len(encodedAddr)) { + bufLen = uint32(len(encodedAddr)) + } + _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) + return err +} + +// Socket implements the linux syscall socket(2). +func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Create the new socket. + s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + defer s.DecRef() + + if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// SocketPair implements the linux syscall socketpair(2). +func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + addr := args[3].Pointer() + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Create the socket pair. + s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + // Adding to the FD table will cause an extra reference to be acquired. + defer s1.DecRef() + defer s2.DecRef() + + nonblocking := uint32(stype & linux.SOCK_NONBLOCK) + if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { + return 0, nil, err + } + if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { + return 0, nil, err + } + + // Create the FDs for the sockets. + flags := kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + } + fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags) + if err != nil { + return 0, nil, err + } + + if _, err := t.CopyOut(addr, fds); err != nil { + for _, fd := range fds { + if _, file := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return 0, nil, err + } + + return 0, nil, nil +} + +// Connect implements the linux syscall connect(2). +func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 + return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS) +} + +// accept is the implementation of the accept syscall. It is called by accept +// and accept4 syscall handlers. +func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { + // Check that no unsupported flags are passed in. + if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, syserror.ENOTSOCK + } + + // Call the syscall implementation for this socket, then copy the + // output address if one is specified. + blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 + + peerRequested := addrLen != 0 + nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + if peerRequested { + // NOTE(magi): Linux does not give you an error if it can't + // write the data back out so neither do we. + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { + return 0, err + } + } + return uintptr(nfd), nil +} + +// Accept4 implements the linux syscall accept4(2). +func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + flags := int(args[3].Int()) + + n, err := accept(t, fd, addr, addrlen, flags) + return n, nil, err +} + +// Accept implements the linux syscall accept(2). +func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + n, err := accept(t, fd, addr, addrlen, 0) + return n, nil, err +} + +// Bind implements the linux syscall bind(2). +func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + return 0, nil, s.Bind(t, a).ToError() +} + +// Listen implements the linux syscall listen(2). +func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + backlog := args[1].Int() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Per Linux, the backlog is silently capped to reasonable values. + if backlog <= 0 { + backlog = minListenBacklog + } + if backlog > maxListenBacklog { + backlog = maxListenBacklog + } + + return 0, nil, s.Listen(t, int(backlog)).ToError() +} + +// Shutdown implements the linux syscall shutdown(2). +func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + how := args[1].Int() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Validate how, then call syscall implementation. + switch how { + case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: + default: + return 0, nil, syserror.EINVAL + } + + return 0, nil, s.Shutdown(t, int(how)).ToError() +} + +// GetSockOpt implements the linux syscall getsockopt(2). +func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLenAddr := args[4].Pointer() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Read the length. Reject negative values. + optLen := int32(0) + if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + return 0, nil, err + } + if optLen < 0 { + return 0, nil, syserror.EINVAL + } + + // Call syscall implementation then copy both value and value len out. + v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen)) + if e != nil { + return 0, nil, e.ToError() + } + + vLen := int32(binary.Size(v)) + if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + return 0, nil, err + } + + if v != nil { + if _, err := t.CopyOut(optValAddr, v); err != nil { + return 0, nil, err + } + } + + return 0, nil, nil +} + +// getSockOpt tries to handle common socket options, or dispatches to a specific +// socket implementation. +func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { + if level == linux.SOL_SOCKET { + switch name { + case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: + if len < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + } + + switch name { + case linux.SO_TYPE: + _, skType, _ := s.Type() + return int32(skType), nil + case linux.SO_DOMAIN: + family, _, _ := s.Type() + return int32(family), nil + case linux.SO_PROTOCOL: + _, _, protocol := s.Type() + return int32(protocol), nil + } + } + + return s.GetSockOpt(t, level, name, optValAddr, len) +} + +// SetSockOpt implements the linux syscall setsockopt(2). +// +// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. +func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLen := args[4].Int() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + if optLen < 0 { + return 0, nil, syserror.EINVAL + } + if optLen > maxOptLen { + return 0, nil, syserror.EINVAL + } + buf := t.CopyScratchBuffer(int(optLen)) + if _, err := t.CopyIn(optValAddr, &buf); err != nil { + return 0, nil, err + } + + // Call syscall implementation. + if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, nil +} + +// GetSockName implements the linux syscall getsockname(2). +func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Get the socket name and copy it to the caller. + v, vl, err := s.GetSockName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// GetPeerName implements the linux syscall getpeername(2). +func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Get the socket peer name and copy it to the caller. + v, vl, err := s.GetPeerName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// RecvMsg implements the linux syscall recvmsg(2). +func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syserror.EINVAL + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) + return n, nil, err +} + +// RecvMMsg implements the linux syscall recvmmsg(2). +func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + toPtr := args[4].Pointer() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if toPtr != 0 { + var ts linux.Timespec + if _, err := ts.CopyIn(t, toPtr); err != nil { + return 0, nil, err + } + if !ts.Valid() { + return 0, nil, syserror.EINVAL + } + deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) + haveDeadline = true + } + + if !haveDeadline { + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + var n uintptr + if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { + break + } + + // Copy the received length to the caller. + lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { + // Capture the message header and io vectors. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syserror.EMSGSIZE + } + dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + // FIXME(b/63594852): Pretend we have an empty error queue. + if flags&linux.MSG_ERRQUEUE != 0 { + return 0, syserror.EAGAIN + } + + // Fast path when no control message nor name buffers are provided. + if msg.ControlLen == 0 && msg.NameLen == 0 { + n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + if err != nil { + return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) + } + if !cms.Unix.Empty() { + mflags |= linux.MSG_CTRUNC + cms.Release() + } + + if int(msg.Flags) != mflags { + // Copy out the flags to the caller. + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + return 0, err + } + } + + return uintptr(n), nil + } + + if msg.ControlLen > maxControlLen { + return 0, syserror.ENOBUFS + } + n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + defer cms.Release() + + controlData := make([]byte, 0, msg.ControlLen) + controlData = control.PackControlMessages(t, cms, controlData) + + if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { + creds, _ := cms.Unix.Credentials.(control.SCMCredentials) + controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) + } + + if cms.Unix.Rights != nil { + controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) + } + + // Copy the address to the caller. + if msg.NameLen != 0 { + if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil { + return 0, err + } + } + + // Copy the control data to the caller. + if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + return 0, err + } + if len(controlData) > 0 { + if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + return 0, err + } + } + + // Copy out the flags to the caller. + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + return 0, err + } + + return uintptr(n), nil +} + +// recvFrom is the implementation of the recvfrom syscall. It is called by +// recvfrom and recv syscall handlers. +func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { + if int(bufLen) < 0 { + return 0, syserror.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, syserror.ENOTSOCK + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) + cm.Release() + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + + // Copy the address to the caller. + if nameLenPtr != 0 { + if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { + return 0, err + } + } + + return uintptr(n), nil +} + +// RecvFrom implements the linux syscall recvfrom(2). +func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLenPtr := args[5].Pointer() + + n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) + return n, nil, err +} + +// SendMsg implements the linux syscall sendmsg(2). +func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syserror.EINVAL + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + n, err := sendSingleMsg(t, s, file, msgPtr, flags) + return n, nil, err +} + +// SendMMsg implements the linux syscall sendmmsg(2). +func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, nil, syserror.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syserror.EINVAL + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + var n uintptr + if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { + break + } + + // Copy the received length to the caller. + lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syserror.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) { + // Capture the message header. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + var controlData []byte + if msg.ControlLen > 0 { + // Put an upper bound to prevent large allocations. + if msg.ControlLen > maxControlLen { + return 0, syserror.ENOBUFS + } + controlData = make([]byte, msg.ControlLen) + if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + return 0, err + } + } + + // Read the destination address if one is specified. + var to []byte + if msg.NameLen != 0 { + var err error + to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen) + if err != nil { + return 0, err + } + } + + // Read data then call the sendmsg implementation. + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syserror.EMSGSIZE + } + src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + controlMessages, err := control.Parse(t, s, controlData) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) + err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) + if err != nil { + controlMessages.Release() + } + return uintptr(n), err +} + +// sendTo is the implementation of the sendto syscall. It is called by sendto +// and send syscall handlers. +func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { + bl := int(bufLen) + if bl < 0 { + return 0, syserror.EINVAL + } + + // Get socket from the file descriptor. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.Impl().(socket.SocketVFS2) + if !ok { + return 0, syserror.ENOTSOCK + } + + if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { + flags |= linux.MSG_DONTWAIT + } + + // Read the destination address if one is specified. + var to []byte + var err error + if namePtr != 0 { + to, err = CaptureAddress(t, namePtr, nameLen) + if err != nil { + return 0, err + } + } + + src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) + return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) +} + +// SendTo implements the linux syscall sendto(2). +func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLen := args[5].Uint() + + n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go new file mode 100644 index 000000000..945a364a7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -0,0 +1,291 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Splice implements Linux syscall splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + inOffsetPtr := args[1].Pointer() + outFD := args[2].Int() + outOffsetPtr := args[3].Pointer() + count := int64(args[4].SizeT()) + flags := args[5].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // At least one file description must represent a pipe. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe && !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy in offsets. + inOffset := int64(-1) + if inOffsetPtr != 0 { + if inIsPipe { + return 0, nil, syserror.ESPIPE + } + if inFile.Options().DenyPRead { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + if inOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + outOffset := int64(-1) + if outOffsetPtr != 0 { + if outIsPipe { + return 0, nil, syserror.ESPIPE + } + if outFile.Options().DenyPWrite { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + if outOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + + // Move data. + var ( + n int64 + err error + inCh chan struct{} + outCh chan struct{} + ) + for { + // If both input and output are pipes, delegate to the pipe + // implementation. Otherwise, exactly one end is a pipe, which we + // ensure is consistently ordered after the non-pipe FD's locks by + // passing the pipe FD as usermem.IO to the non-pipe end. + switch { + case inIsPipe && outIsPipe: + n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) + case inIsPipe: + if outOffset != -1 { + n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{}) + outOffset += n + } else { + n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{}) + } + case outIsPipe: + if inOffset != -1 { + n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{}) + inOffset += n + } else { + n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) + } + } + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + break + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the splice operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err = t.Block(inCh); err != nil { + break + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err = t.Block(outCh); err != nil { + break + } + } + } + + // Copy updated offsets out. + if inOffsetPtr != 0 { + if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + } + if outOffsetPtr != 0 { + if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + } + + if n == 0 { + return 0, nil, err + } + + // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some + // reason, events are generated for output files, but not input files. + outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + return uintptr(n), nil, nil +} + +// Tee implements Linux syscall tee(2). +func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + outFD := args[1].Int() + count := int64(args[2].SizeT()) + flags := args[3].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // Both file descriptions must represent pipes. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe || !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy data. + var ( + inCh chan struct{} + outCh chan struct{} + ) + for { + n, err := pipe.Tee(t, outPipeFD, inPipeFD, count) + if n != 0 { + return uintptr(n), nil, nil + } + if err != syserror.ErrWouldBlock || nonBlock { + return 0, nil, err + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the tee operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err := t.Block(inCh); err != nil { + return 0, nil, err + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err := t.Block(outCh); err != nil { + return 0, nil, err + } + } + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go new file mode 100644 index 000000000..bb1d5cac4 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -0,0 +1,388 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Stat implements Linux syscall stat(2). +func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */) +} + +// Lstat implements Linux syscall lstat(2). +func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW) +} + +// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2). +func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + statAddr := args[2].Pointer() + flags := args[3].Int() + return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags) +} + +func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + _, err = stat.CopyOut(t, statAddr) + return err + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + _, err = stat.CopyOut(t, statAddr) + return err +} + +func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { + return linux.Timespec{ + Sec: sxts.Sec, + Nsec: int64(sxts.Nsec), + } +} + +// Fstat implements Linux syscall fstat(2). +func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + statAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + statx, err := file.Stat(t, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + return 0, nil, err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + _, err = stat.CopyOut(t, statAddr) + return 0, nil, err +} + +// Statx implements Linux syscall statx(2). +func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + mask := args[3].Uint() + statxAddr := args[4].Pointer() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { + return 0, nil, syserror.EINVAL + } + // Make sure that only one sync type option is set. + syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE) + if syncType != 0 && !bits.IsPowerOfTwo32(syncType) { + return 0, nil, syserror.EINVAL + } + if mask&linux.STATX__RESERVED != 0 { + return 0, nil, syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: mask, + Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE), + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for statx(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + _, err = statx.CopyOut(t, statxAddr) + return 0, nil, err + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + _, err = statx.CopyOut(t, statxAddr) + return 0, nil, err +} + +func userifyStatx(t *kernel.Task, statx *linux.Statx) { + userns := t.UserNamespace() + statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow()) + statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow()) +} + +// Readlink implements Linux syscall readlink(2). +func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + size := args[2].SizeT() + return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) +} + +// Access implements Linux syscall access(2). +func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + + return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode) +} + +// Faccessat implements Linux syscall faccessat(2). +// +// Note that the faccessat() system call does not take a flags argument: +// "The raw faccessat() system call takes only the first three arguments. The +// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within +// the glibc wrapper function for faccessat(). If either of these flags is +// specified, then the wrapper function employs fstatat(2) to determine access +// permissions." - faccessat(2) +func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + + return 0, nil, accessAt(t, dirfd, addr, mode) +} + +func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error { + const rOK = 4 + const wOK = 2 + const xOK = 1 + + // Sanity check the mode. + if mode&^(rOK|wOK|xOK) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + // access(2) and faccessat(2) check permissions using real + // UID/GID, not effective UID/GID. + // + // "access() needs to use the real uid/gid, not the effective + // uid/gid. We do this by temporarily clearing all FS-related + // capabilities and switching the fsuid/fsgid around to the + // real ones." -fs/open.c:faccessat + creds := t.Credentials().Fork() + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID + if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { + creds.EffectiveCaps = creds.PermittedCaps + } else { + creds.EffectiveCaps = 0 + } + + return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop) +} + +// Readlinkat implements Linux syscall mknodat(2). +func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + bufAddr := args[2].Pointer() + size := args[3].SizeT() + return readlinkat(t, dirfd, pathAddr, bufAddr, size) +} + +func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { + if int(size) <= 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + // "Since Linux 2.6.39, pathname can be an empty string, in which case the + // call operates on the symbolic link referred to by dirfd ..." - + // readlinkat(2) + tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + if len(target) > int(size) { + target = target[:size] + } + n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Statfs implements Linux syscall statfs(2). +func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + _, err = statfs.CopyOut(t, bufAddr) + return 0, nil, err +} + +// Fstatfs implements Linux syscall fstatfs(2). +func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufAddr := args[1].Pointer() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + _, err = statfs.CopyOut(t, bufAddr) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go new file mode 100644 index 000000000..2da538fc6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go @@ -0,0 +1,46 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// This takes both input and output as pointer arguments to avoid copying large +// structs. +func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { + // Linux just copies fields from struct kstat without regard to struct + // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. + userns := t.UserNamespace() + *stat = linux.Stat{ + Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), + Ino: statx.Ino, + Nlink: uint64(statx.Nlink), + Mode: uint32(statx.Mode), + UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), + GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), + Size: int64(statx.Size), + Blksize: int64(statx.Blksize), + Blocks: int64(statx.Blocks), + ATime: timespecFromStatxTimestamp(statx.Atime), + MTime: timespecFromStatxTimestamp(statx.Mtime), + CTime: timespecFromStatxTimestamp(statx.Ctime), + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go new file mode 100644 index 000000000..88b9c7627 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go @@ -0,0 +1,46 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// This takes both input and output as pointer arguments to avoid copying large +// structs. +func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { + // Linux just copies fields from struct kstat without regard to struct + // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. + userns := t.UserNamespace() + *stat = linux.Stat{ + Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), + Ino: statx.Ino, + Nlink: uint32(statx.Nlink), + Mode: uint32(statx.Mode), + UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), + GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), + Size: int64(statx.Size), + Blksize: int32(statx.Blksize), + Blocks: int64(statx.Blocks), + ATime: timespecFromStatxTimestamp(statx.Atime), + MTime: timespecFromStatxTimestamp(statx.Mtime), + CTime: timespecFromStatxTimestamp(statx.Ctime), + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go new file mode 100644 index 000000000..0d0ebf46a --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -0,0 +1,115 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements Linux syscall sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t) +} + +// Syncfs implements Linux syscall syncfs(2). +func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SyncFS(t) +} + +// Fsync implements Linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.Sync(t) +} + +// Fdatasync implements Linux syscall fdatasync(2). +func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata. + return Fsync(t, args) +} + +// SyncFileRange implements Linux syscall sync_file_range(2). +func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + nbytes := args[2].Int64() + flags := args[3].Uint() + + // Check for negative values and overflow. + if offset < 0 || offset+nbytes < 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support + // is a full-file sync, i.e. fsync(2). As a result, there are severe + // limitations on how much we support sync_file_range: + // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even + // if the file size is changed. We do. + // - We always sync the entire file instead of [offset, offset+nbytes). + // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For + // correctness, we would have to perform a write-out every time WAIT_BEFORE + // was used, but this would be much more expensive than expected if there + // were no write-out operations in progress. + // - Whenever WAIT_AFTER is used, we sync the file. + // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will + // be synced anyway. If this flag is used without WAIT_AFTER, then it is + // safe (and less expensive) to do nothing, because the syscall will not + // wait for the write-out to complete--we only need to make sure that the + // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes. + // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O + // errors or ENOSPC conditions and will return those to the caller, after + // clearing the EIO and ENOSPC flags in the address_space." We don't do + // this. + + if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && + flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { + if err := file.Sync(t); err != nil { + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + } + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go new file mode 100644 index 000000000..5ac79bc09 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -0,0 +1,127 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TimerfdCreate implements Linux syscall timerfd_create(2). +func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + flags := args[1].Int() + + if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { + return 0, nil, syserror.EINVAL + } + + // Timerfds aren't writable per se (their implementation of Write just + // returns EINVAL), but they are "opened for writing", which is necessary + // to actually reach said implementation of Write. + fileFlags := uint32(linux.O_RDWR) + if flags&linux.TFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + + var clock ktime.Clock + switch clockID { + case linux.CLOCK_REALTIME: + clock = t.Kernel().RealtimeClock() + case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: + clock = t.Kernel().MonotonicClock() + default: + return 0, nil, syserror.EINVAL + } + vfsObj := t.Kernel().VFS() + file, err := timerfd.New(vfsObj, clock, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.TFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// TimerfdSettime implements Linux syscall timerfd_settime(2). +func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) + if !ok { + return 0, nil, syserror.EINVAL + } + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock()) + if err != nil { + return 0, nil, err + } + tm, oldS := tfd.SetTime(newS) + if oldValAddr != 0 { + oldVal := ktime.ItimerspecFromSetting(tm, oldS) + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// TimerfdGettime implements Linux syscall timerfd_gettime(2). +func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + curValAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) + if !ok { + return 0, nil, syserror.EINVAL + } + + tm, s := tfd.GetTime() + curVal := ktime.ItimerspecFromSetting(tm, s) + _, err := t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go new file mode 100644 index 000000000..8f497ecc7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -0,0 +1,168 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs2 provides syscall implementations that use VFS2. +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +// Override syscall table to add syscalls implementations from this package. +func Override() { + // Override AMD64. + s := linux.AMD64 + s.Table[0] = syscalls.Supported("read", Read) + s.Table[1] = syscalls.Supported("write", Write) + s.Table[2] = syscalls.Supported("open", Open) + s.Table[3] = syscalls.Supported("close", Close) + s.Table[4] = syscalls.Supported("stat", Stat) + s.Table[5] = syscalls.Supported("fstat", Fstat) + s.Table[6] = syscalls.Supported("lstat", Lstat) + s.Table[7] = syscalls.Supported("poll", Poll) + s.Table[8] = syscalls.Supported("lseek", Lseek) + s.Table[9] = syscalls.Supported("mmap", Mmap) + s.Table[16] = syscalls.Supported("ioctl", Ioctl) + s.Table[17] = syscalls.Supported("pread64", Pread64) + s.Table[18] = syscalls.Supported("pwrite64", Pwrite64) + s.Table[19] = syscalls.Supported("readv", Readv) + s.Table[20] = syscalls.Supported("writev", Writev) + s.Table[21] = syscalls.Supported("access", Access) + s.Table[22] = syscalls.Supported("pipe", Pipe) + s.Table[23] = syscalls.Supported("select", Select) + s.Table[32] = syscalls.Supported("dup", Dup) + s.Table[33] = syscalls.Supported("dup2", Dup2) + delete(s.Table, 40) // sendfile + s.Table[41] = syscalls.Supported("socket", Socket) + s.Table[42] = syscalls.Supported("connect", Connect) + s.Table[43] = syscalls.Supported("accept", Accept) + s.Table[44] = syscalls.Supported("sendto", SendTo) + s.Table[45] = syscalls.Supported("recvfrom", RecvFrom) + s.Table[46] = syscalls.Supported("sendmsg", SendMsg) + s.Table[47] = syscalls.Supported("recvmsg", RecvMsg) + s.Table[48] = syscalls.Supported("shutdown", Shutdown) + s.Table[49] = syscalls.Supported("bind", Bind) + s.Table[50] = syscalls.Supported("listen", Listen) + s.Table[51] = syscalls.Supported("getsockname", GetSockName) + s.Table[52] = syscalls.Supported("getpeername", GetPeerName) + s.Table[53] = syscalls.Supported("socketpair", SocketPair) + s.Table[54] = syscalls.Supported("setsockopt", SetSockOpt) + s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) + s.Table[59] = syscalls.Supported("execve", Execve) + s.Table[72] = syscalls.Supported("fcntl", Fcntl) + s.Table[73] = syscalls.Supported("fcntl", Flock) + s.Table[74] = syscalls.Supported("fsync", Fsync) + s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) + s.Table[76] = syscalls.Supported("truncate", Truncate) + s.Table[77] = syscalls.Supported("ftruncate", Ftruncate) + s.Table[78] = syscalls.Supported("getdents", Getdents) + s.Table[79] = syscalls.Supported("getcwd", Getcwd) + s.Table[80] = syscalls.Supported("chdir", Chdir) + s.Table[81] = syscalls.Supported("fchdir", Fchdir) + s.Table[82] = syscalls.Supported("rename", Rename) + s.Table[83] = syscalls.Supported("mkdir", Mkdir) + s.Table[84] = syscalls.Supported("rmdir", Rmdir) + s.Table[85] = syscalls.Supported("creat", Creat) + s.Table[86] = syscalls.Supported("link", Link) + s.Table[87] = syscalls.Supported("unlink", Unlink) + s.Table[88] = syscalls.Supported("symlink", Symlink) + s.Table[89] = syscalls.Supported("readlink", Readlink) + s.Table[90] = syscalls.Supported("chmod", Chmod) + s.Table[91] = syscalls.Supported("fchmod", Fchmod) + s.Table[92] = syscalls.Supported("chown", Chown) + s.Table[93] = syscalls.Supported("fchown", Fchown) + s.Table[94] = syscalls.Supported("lchown", Lchown) + s.Table[132] = syscalls.Supported("utime", Utime) + s.Table[133] = syscalls.Supported("mknod", Mknod) + s.Table[137] = syscalls.Supported("statfs", Statfs) + s.Table[138] = syscalls.Supported("fstatfs", Fstatfs) + s.Table[161] = syscalls.Supported("chroot", Chroot) + s.Table[162] = syscalls.Supported("sync", Sync) + s.Table[165] = syscalls.Supported("mount", Mount) + s.Table[166] = syscalls.Supported("umount2", Umount2) + s.Table[187] = syscalls.Supported("readahead", Readahead) + s.Table[188] = syscalls.Supported("setxattr", Setxattr) + s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) + s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr) + s.Table[191] = syscalls.Supported("getxattr", Getxattr) + s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr) + s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr) + s.Table[194] = syscalls.Supported("listxattr", Listxattr) + s.Table[195] = syscalls.Supported("llistxattr", Llistxattr) + s.Table[196] = syscalls.Supported("flistxattr", Flistxattr) + s.Table[197] = syscalls.Supported("removexattr", Removexattr) + s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) + s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) + s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) + s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) + s.Table[217] = syscalls.Supported("getdents64", Getdents64) + s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil) + s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) + s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) + s.Table[235] = syscalls.Supported("utimes", Utimes) + s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil) + s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[257] = syscalls.Supported("openat", Openat) + s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) + s.Table[259] = syscalls.Supported("mknodat", Mknodat) + s.Table[260] = syscalls.Supported("fchownat", Fchownat) + s.Table[261] = syscalls.Supported("futimesat", Futimesat) + s.Table[262] = syscalls.Supported("newfstatat", Newfstatat) + s.Table[263] = syscalls.Supported("unlinkat", Unlinkat) + s.Table[264] = syscalls.Supported("renameat", Renameat) + s.Table[265] = syscalls.Supported("linkat", Linkat) + s.Table[266] = syscalls.Supported("symlinkat", Symlinkat) + s.Table[267] = syscalls.Supported("readlinkat", Readlinkat) + s.Table[268] = syscalls.Supported("fchmodat", Fchmodat) + s.Table[269] = syscalls.Supported("faccessat", Faccessat) + s.Table[270] = syscalls.Supported("pselect", Pselect) + s.Table[271] = syscalls.Supported("ppoll", Ppoll) + s.Table[275] = syscalls.Supported("splice", Splice) + s.Table[276] = syscalls.Supported("tee", Tee) + s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) + s.Table[280] = syscalls.Supported("utimensat", Utimensat) + s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) + s.Table[282] = syscalls.Supported("signalfd", Signalfd) + s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) + s.Table[284] = syscalls.Supported("eventfd", Eventfd) + s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) + s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) + s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) + s.Table[288] = syscalls.Supported("accept4", Accept4) + s.Table[289] = syscalls.Supported("signalfd4", Signalfd4) + s.Table[290] = syscalls.Supported("eventfd2", Eventfd2) + s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) + s.Table[292] = syscalls.Supported("dup3", Dup3) + s.Table[293] = syscalls.Supported("pipe2", Pipe2) + s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil) + s.Table[295] = syscalls.Supported("preadv", Preadv) + s.Table[296] = syscalls.Supported("pwritev", Pwritev) + s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg) + s.Table[306] = syscalls.Supported("syncfs", Syncfs) + s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) + s.Table[316] = syscalls.Supported("renameat2", Renameat2) + s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate) + s.Table[322] = syscalls.Supported("execveat", Execveat) + s.Table[327] = syscalls.Supported("preadv2", Preadv2) + s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) + s.Table[332] = syscalls.Supported("statx", Statx) + s.Init() + + // Override ARM64. + s = linux.ARM64 + s.Table[63] = syscalls.Supported("read", Read) + s.Init() +} diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go new file mode 100644 index 000000000..af455d5c1 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -0,0 +1,356 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "bytes" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Listxattr implements Linux syscall listxattr(2). +func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, followFinalSymlink) +} + +// Llistxattr implements Linux syscall llistxattr(2). +func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, nofollowFinalSymlink) +} + +func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Flistxattr implements Linux syscall flistxattr(2). +func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + names, err := file.Listxattr(t, uint64(size)) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Getxattr implements Linux syscall getxattr(2). +func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, followFinalSymlink) +} + +// Lgetxattr implements Linux syscall lgetxattr(2). +func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, nofollowFinalSymlink) +} + +func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{ + Name: name, + Size: uint64(size), + }) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Fgetxattr implements Linux syscall fgetxattr(2). +func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)}) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Setxattr implements Linux syscall setxattr(2). +func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, followFinalSymlink) +} + +// Lsetxattr implements Linux syscall lsetxattr(2). +func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, nofollowFinalSymlink) +} + +func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return err + } + + return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Fsetxattr implements Linux syscall fsetxattr(2). +func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Removexattr implements Linux syscall removexattr(2). +func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, followFinalSymlink) +} + +// Lremovexattr implements Linux syscall lremovexattr(2). +func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, nofollowFinalSymlink) +} + +func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name) +} + +// Fremovexattr implements Linux syscall fremovexattr(2). +func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Removexattr(t, name) +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) { + if size > linux.XATTR_LIST_MAX { + size = linux.XATTR_LIST_MAX + } + var buf bytes.Buffer + for _, name := range names { + buf.WriteString(name) + buf.WriteByte(0) + } + if size == 0 { + // Return the size that would be required to accomodate the list. + return buf.Len(), nil + } + if buf.Len() > int(size) { + if size >= linux.XATTR_LIST_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(listAddr, buf.Bytes()) +} + +func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) { + if size > linux.XATTR_SIZE_MAX { + return "", syserror.E2BIG + } + buf := make([]byte, size) + if _, err := t.CopyInBytes(valueAddr, buf); err != nil { + return "", err + } + return gohacks.StringFromImmutableBytes(buf), nil +} + +func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) { + if size > linux.XATTR_SIZE_MAX { + size = linux.XATTR_SIZE_MAX + } + if size == 0 { + // Return the size that would be required to accomodate the value. + return len(value), nil + } + if len(value) > int(size) { + if size >= linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) +} diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go new file mode 100644 index 000000000..f88055676 --- /dev/null +++ b/pkg/sentry/syscalls/syscalls.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package syscalls is the interface from the application to the kernel. +// Traditionally, syscalls is the interface that is used by applications to +// request services from the kernel of a operating system. We provide a +// user-mode kernel that needs to handle those requests coming from unmodified +// applications. Therefore, we still use the term "syscalls" to denote this +// interface. +// +// Note that the stubs in this package may merely provide the interface, not +// the actual implementation. It just makes writing syscall stubs +// straightforward. +package syscalls + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Supported returns a syscall that is fully supported. +func Supported(name string, fn kernel.SyscallFn) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportFull, + Note: "Fully Supported.", + } +} + +// PartiallySupported returns a syscall that has a partial implementation. +func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportPartial, + Note: note, + URLs: urls, + } +} + +// Error returns a syscall handler that will always give the passed error. +func Error(name string, err error, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, err + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), + URLs: urls, + } +} + +// ErrorWithEvent gives a syscall function that sends an unimplemented +// syscall event via the event channel and returns the passed error. +func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, err + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), + URLs: urls, + } +} + +// CapError gives a syscall function that checks for capability c. If the task +// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged +// tasks, it will seem like there is an implementation. +func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if !t.HasCapability(c) { + return 0, nil, syserror.EPERM + } + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS), + URLs: urls, + } +} |