summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/syscalls
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/syscalls')
-rw-r--r--pkg/sentry/syscalls/BUILD21
-rw-r--r--pkg/sentry/syscalls/epoll.go173
-rw-r--r--pkg/sentry/syscalls/linux/BUILD103
-rw-r--r--pkg/sentry/syscalls/linux/error.go157
-rw-r--r--pkg/sentry/syscalls/linux/flags.go55
-rw-r--r--pkg/sentry/syscalls/linux/linux64.go736
-rw-r--r--pkg/sentry/syscalls/linux/sigset.go71
-rw-r--r--pkg/sentry/syscalls/linux/sys_aio.go382
-rw-r--r--pkg/sentry/syscalls/linux/sys_capability.go149
-rw-r--r--pkg/sentry/syscalls/linux/sys_clone_amd64.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_clone_arm64.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_epoll.go147
-rw-r--r--pkg/sentry/syscalls/linux/sys_eventfd.go56
-rw-r--r--pkg/sentry/syscalls/linux/sys_file.go2238
-rw-r--r--pkg/sentry/syscalls/linux/sys_futex.go288
-rw-r--r--pkg/sentry/syscalls/linux/sys_getdents.go250
-rw-r--r--pkg/sentry/syscalls/linux/sys_identity.go180
-rw-r--r--pkg/sentry/syscalls/linux/sys_inotify.go133
-rw-r--r--pkg/sentry/syscalls/linux/sys_lseek.go58
-rw-r--r--pkg/sentry/syscalls/linux/sys_mempolicy.go312
-rw-r--r--pkg/sentry/syscalls/linux/sys_mmap.go332
-rw-r--r--pkg/sentry/syscalls/linux/sys_mount.go154
-rw-r--r--pkg/sentry/syscalls/linux/sys_pipe.go77
-rw-r--r--pkg/sentry/syscalls/linux/sys_poll.go545
-rw-r--r--pkg/sentry/syscalls/linux/sys_prctl.go228
-rw-r--r--pkg/sentry/syscalls/linux/sys_random.go92
-rw-r--r--pkg/sentry/syscalls/linux/sys_read.go394
-rw-r--r--pkg/sentry/syscalls/linux/sys_rlimit.go224
-rw-r--r--pkg/sentry/syscalls/linux/sys_rseq.go48
-rw-r--r--pkg/sentry/syscalls/linux/sys_rusage.go112
-rw-r--r--pkg/sentry/syscalls/linux/sys_sched.go99
-rw-r--r--pkg/sentry/syscalls/linux/sys_seccomp.go76
-rw-r--r--pkg/sentry/syscalls/linux/sys_sem.go241
-rw-r--r--pkg/sentry/syscalls/linux/sys_shm.go161
-rw-r--r--pkg/sentry/syscalls/linux/sys_signal.go590
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go1138
-rw-r--r--pkg/sentry/syscalls/linux/sys_splice.go337
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat.go290
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat_amd64.go45
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat_arm64.go45
-rw-r--r--pkg/sentry/syscalls/linux/sys_sync.go141
-rw-r--r--pkg/sentry/syscalls/linux/sys_sysinfo.go48
-rw-r--r--pkg/sentry/syscalls/linux/sys_syslog.go61
-rw-r--r--pkg/sentry/syscalls/linux/sys_thread.go769
-rw-r--r--pkg/sentry/syscalls/linux/sys_time.go342
-rw-r--r--pkg/sentry/syscalls/linux/sys_timer.go203
-rw-r--r--pkg/sentry/syscalls/linux/sys_timerfd.go121
-rw-r--r--pkg/sentry/syscalls/linux/sys_tls_amd64.go52
-rw-r--r--pkg/sentry/syscalls/linux/sys_tls_arm64.go28
-rw-r--r--pkg/sentry/syscalls/linux/sys_utsname.go95
-rw-r--r--pkg/sentry/syscalls/linux/sys_write.go364
-rw-r--r--pkg/sentry/syscalls/linux/sys_xattr.go432
-rw-r--r--pkg/sentry/syscalls/linux/timespec.go111
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/BUILD76
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/aio.go216
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/epoll.go228
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/eventfd.go61
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/execve.go137
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/fd.go355
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/filesystem.go384
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/fscontext.go131
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/getdents.go161
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/inotify.go137
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/ioctl.go107
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/lock.go64
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/memfd.go63
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/mmap.go92
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/mount.go145
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/path.go94
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/pipe.go63
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/poll.go586
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/read_write.go641
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/setstat.go428
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/signal.go100
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/socket.go1139
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/splice.go291
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/stat.go388
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/stat_amd64.go46
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/stat_arm64.go46
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/sync.go115
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/timerfd.go127
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/vfs2.go168
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/xattr.go356
-rw-r--r--pkg/sentry/syscalls/syscalls.go111
84 files changed, 20630 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD
new file mode 100644
index 000000000..b8d1bd415
--- /dev/null
+++ b/pkg/sentry/syscalls/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "syscalls",
+ srcs = [
+ "epoll.go",
+ "syscalls.go",
+ ],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/epoll",
+ "//pkg/sentry/kernel/time",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
new file mode 100644
index 000000000..d9fb808c0
--- /dev/null
+++ b/pkg/sentry/syscalls/epoll.go
@@ -0,0 +1,173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syscalls
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// CreateEpoll implements the epoll_create(2) linux syscall.
+func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) {
+ file := epoll.NewEventPoll(t)
+ defer file.DecRef()
+
+ fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: closeOnExec,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ return fd, nil
+}
+
+// AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD.
+func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+ // Get epoll from the file descriptor.
+ epollfile := t.GetFile(epfd)
+ if epollfile == nil {
+ return syserror.EBADF
+ }
+ defer epollfile.DecRef()
+
+ // Get the target file id.
+ file := t.GetFile(fd)
+ if file == nil {
+ return syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the epollPoll operations.
+ e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+ if !ok {
+ return syserror.EBADF
+ }
+
+ // Try to add the entry.
+ return e.AddEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData)
+}
+
+// UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD.
+func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+ // Get epoll from the file descriptor.
+ epollfile := t.GetFile(epfd)
+ if epollfile == nil {
+ return syserror.EBADF
+ }
+ defer epollfile.DecRef()
+
+ // Get the target file id.
+ file := t.GetFile(fd)
+ if file == nil {
+ return syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the epollPoll operations.
+ e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+ if !ok {
+ return syserror.EBADF
+ }
+
+ // Try to update the entry.
+ return e.UpdateEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData)
+}
+
+// RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL.
+func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
+ // Get epoll from the file descriptor.
+ epollfile := t.GetFile(epfd)
+ if epollfile == nil {
+ return syserror.EBADF
+ }
+ defer epollfile.DecRef()
+
+ // Get the target file id.
+ file := t.GetFile(fd)
+ if file == nil {
+ return syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the epollPoll operations.
+ e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+ if !ok {
+ return syserror.EBADF
+ }
+
+ // Try to remove the entry.
+ return e.RemoveEntry(epoll.FileIdentifier{file, fd})
+}
+
+// WaitEpoll implements the epoll_wait(2) linux syscall.
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) {
+ // Get epoll from the file descriptor.
+ epollfile := t.GetFile(fd)
+ if epollfile == nil {
+ return nil, syserror.EBADF
+ }
+ defer epollfile.DecRef()
+
+ // Extract the epollPoll operations.
+ e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+ if !ok {
+ return nil, syserror.EBADF
+ }
+
+ // Try to read events and return right away if we got them or if the
+ // caller requested a non-blocking "wait".
+ r := e.ReadEvents(max)
+ if len(r) != 0 || timeout == 0 {
+ return r, nil
+ }
+
+ // We'll have to wait. Set up the timer if a timeout was specified and
+ // and register with the epoll object for readability events.
+ var haveDeadline bool
+ var deadline ktime.Time
+ if timeout > 0 {
+ timeoutDur := time.Duration(timeout) * time.Millisecond
+ deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+ haveDeadline = true
+ }
+
+ w, ch := waiter.NewChannelEntry(nil)
+ e.EventRegister(&w, waiter.EventIn)
+ defer e.EventUnregister(&w)
+
+ // Try to read the events again until we succeed, timeout or get
+ // interrupted.
+ for {
+ r = e.ReadEvents(max)
+ if len(r) != 0 {
+ return r, nil
+ }
+
+ if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ return nil, nil
+ }
+
+ return nil, err
+ }
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
new file mode 100644
index 000000000..217fcfef2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -0,0 +1,103 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "linux",
+ srcs = [
+ "error.go",
+ "flags.go",
+ "linux64.go",
+ "sigset.go",
+ "sys_aio.go",
+ "sys_capability.go",
+ "sys_clone_amd64.go",
+ "sys_clone_arm64.go",
+ "sys_epoll.go",
+ "sys_eventfd.go",
+ "sys_file.go",
+ "sys_futex.go",
+ "sys_getdents.go",
+ "sys_identity.go",
+ "sys_inotify.go",
+ "sys_lseek.go",
+ "sys_mempolicy.go",
+ "sys_mmap.go",
+ "sys_mount.go",
+ "sys_pipe.go",
+ "sys_poll.go",
+ "sys_prctl.go",
+ "sys_random.go",
+ "sys_read.go",
+ "sys_rlimit.go",
+ "sys_rseq.go",
+ "sys_rusage.go",
+ "sys_sched.go",
+ "sys_seccomp.go",
+ "sys_sem.go",
+ "sys_shm.go",
+ "sys_signal.go",
+ "sys_socket.go",
+ "sys_splice.go",
+ "sys_stat.go",
+ "sys_stat_amd64.go",
+ "sys_stat_arm64.go",
+ "sys_sync.go",
+ "sys_sysinfo.go",
+ "sys_syslog.go",
+ "sys_thread.go",
+ "sys_time.go",
+ "sys_timer.go",
+ "sys_timerfd.go",
+ "sys_tls_amd64.go",
+ "sys_tls_arm64.go",
+ "sys_utsname.go",
+ "sys_write.go",
+ "sys_xattr.go",
+ "timespec.go",
+ ],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/abi",
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/bpf",
+ "//pkg/context",
+ "//pkg/log",
+ "//pkg/metric",
+ "//pkg/rand",
+ "//pkg/safemem",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/anon",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/fs/timerfd",
+ "//pkg/sentry/fs/tmpfs",
+ "//pkg/sentry/fsbridge",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/epoll",
+ "//pkg/sentry/kernel/eventfd",
+ "//pkg/sentry/kernel/fasync",
+ "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/kernel/sched",
+ "//pkg/sentry/kernel/shm",
+ "//pkg/sentry/kernel/signalfd",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/loader",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
+ "//pkg/sentry/socket",
+ "//pkg/sentry/socket/control",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/syscalls",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserr",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
new file mode 100644
index 000000000..64de56ac5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -0,0 +1,157 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "io"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/metric"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+var (
+ partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.")
+ partialResultOnce sync.Once
+)
+
+// HandleIOErrorVFS2 handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// op and f are used only for panics.
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
+ known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+ if err != nil {
+ return err
+ }
+ if !known {
+ // An unknown error is encountered with a partial read/write.
+ fs := f.Mount().Filesystem().VirtualFilesystem()
+ root := vfs.RootFromContext(t)
+ name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry())
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+ partialResultOnce.Do(partialResultMetric.Increment)
+ }
+ return nil
+}
+
+// handleIOError handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// op and f are used only for panics.
+func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
+ known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+ if err != nil {
+ return err
+ }
+ if !known {
+ // An unknown error is encountered with a partial read/write.
+ name, _ := f.Dirent.FullName(nil /* ignore chroot */)
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+ partialResultOnce.Do(partialResultMetric.Increment)
+ }
+ return nil
+}
+
+// handleIOError handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// Returns false if error is unknown.
+func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) {
+ switch err {
+ case nil:
+ // Typical successful syscall.
+ return true, nil
+ case io.EOF:
+ // EOF is always consumed. If this is a partial read/write
+ // (result != 0), the application will see that, otherwise
+ // they will see 0.
+ return true, nil
+ case syserror.ErrExceedsFileSizeLimit:
+ // Ignore partialResult because this error only applies to
+ // normal files, and for those files we cannot accumulate
+ // write results.
+ //
+ // Do not consume the error and return it as EFBIG.
+ // Simultaneously send a SIGXFSZ per setrlimit(2).
+ t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
+ return true, syserror.EFBIG
+ case syserror.ErrInterrupted:
+ // The syscall was interrupted. Return nil if it completed
+ // partially, otherwise return the error code that the syscall
+ // needs (to indicate to the kernel what it should do).
+ if partialResult {
+ return true, nil
+ }
+ return true, intr
+ }
+
+ if !partialResult {
+ // Typical syscall error.
+ return true, err
+ }
+
+ switch err {
+ case syserror.EINTR:
+ // Syscall interrupted, but completed a partial
+ // read/write. Like ErrWouldBlock, since we have a
+ // partial read/write, we consume the error and return
+ // the partial result.
+ return true, nil
+ case syserror.EFAULT:
+ // EFAULT is only shown the user if nothing was
+ // read/written. If we read something (this case), they see
+ // a partial read/write. They will then presumably try again
+ // with an incremented buffer, which will EFAULT with
+ // result == 0.
+ return true, nil
+ case syserror.EPIPE:
+ // Writes to a pipe or socket will return EPIPE if the other
+ // side is gone. The partial write is returned. EPIPE will be
+ // returned on the next call.
+ //
+ // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
+ // also be sent to the application.
+ return true, nil
+ case syserror.ENOSPC:
+ // Similar to EPIPE. Return what we wrote this time, and let
+ // ENOSPC be returned on the next call.
+ return true, nil
+ case syserror.ECONNRESET:
+ // For TCP sendfile connections, we may have a reset. But we
+ // should just return n as the result.
+ return true, nil
+ case syserror.ErrWouldBlock:
+ // Syscall would block, but completed a partial read/write.
+ // This case should only be returned by IssueIO for nonblocking
+ // files. Since we have a partial read/write, we consume
+ // ErrWouldBlock, returning the partial result.
+ return true, nil
+ }
+
+ switch err.(type) {
+ case kernel.SyscallRestartErrno:
+ // Identical to the EINTR case.
+ return true, nil
+ }
+
+ // Error is unknown and cannot be properly handled.
+ return false, nil
+}
diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go
new file mode 100644
index 000000000..07961dad9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/flags.go
@@ -0,0 +1,55 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+// flagsToPermissions returns a Permissions object from Linux flags.
+// This includes truncate permission if O_TRUNC is set in the mask.
+func flagsToPermissions(mask uint) (p fs.PermMask) {
+ if mask&linux.O_TRUNC != 0 {
+ p.Write = true
+ }
+ switch mask & linux.O_ACCMODE {
+ case linux.O_WRONLY:
+ p.Write = true
+ case linux.O_RDWR:
+ p.Write = true
+ p.Read = true
+ case linux.O_RDONLY:
+ p.Read = true
+ }
+ return
+}
+
+// linuxToFlags converts Linux file flags to a FileFlags object.
+func linuxToFlags(mask uint) fs.FileFlags {
+ return fs.FileFlags{
+ Direct: mask&linux.O_DIRECT != 0,
+ DSync: mask&(linux.O_DSYNC|linux.O_SYNC) != 0,
+ Sync: mask&linux.O_SYNC != 0,
+ NonBlocking: mask&linux.O_NONBLOCK != 0,
+ Read: (mask & linux.O_ACCMODE) != linux.O_WRONLY,
+ Write: (mask & linux.O_ACCMODE) != linux.O_RDONLY,
+ Append: mask&linux.O_APPEND != 0,
+ Directory: mask&linux.O_DIRECTORY != 0,
+ Async: mask&linux.O_ASYNC != 0,
+ LargeFile: mask&linux.O_LARGEFILE != 0,
+ Truncate: mask&linux.O_TRUNC != 0,
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
new file mode 100644
index 000000000..ea4f9b1a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -0,0 +1,736 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package linux provides syscall tables for amd64 Linux.
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ // LinuxSysname is the OS name advertised by gVisor.
+ LinuxSysname = "Linux"
+
+ // LinuxRelease is the Linux release version number advertised by gVisor.
+ LinuxRelease = "4.4.0"
+
+ // LinuxVersion is the version info advertised by gVisor.
+ LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016"
+)
+
+// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
+// numbers from Linux 4.4.
+var AMD64 = &kernel.SyscallTable{
+ OS: abi.Linux,
+ Arch: arch.AMD64,
+ Version: kernel.Version{
+ // Version 4.4 is chosen as a stable, longterm version of Linux, which
+ // guides the interface provided by this syscall table. The build
+ // version is that for a clean build with default kernel config, at 5
+ // minutes after v4.4 was tagged.
+ Sysname: LinuxSysname,
+ Release: LinuxRelease,
+ Version: LinuxVersion,
+ },
+ AuditNumber: linux.AUDIT_ARCH_X86_64,
+ Table: map[uintptr]kernel.Syscall{
+ 0: syscalls.Supported("read", Read),
+ 1: syscalls.Supported("write", Write),
+ 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil),
+ 3: syscalls.Supported("close", Close),
+ 4: syscalls.Supported("stat", Stat),
+ 5: syscalls.Supported("fstat", Fstat),
+ 6: syscalls.Supported("lstat", Lstat),
+ 7: syscalls.Supported("poll", Poll),
+ 8: syscalls.Supported("lseek", Lseek),
+ 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+ 10: syscalls.Supported("mprotect", Mprotect),
+ 11: syscalls.Supported("munmap", Munmap),
+ 12: syscalls.Supported("brk", Brk),
+ 13: syscalls.Supported("rt_sigaction", RtSigaction),
+ 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+ 15: syscalls.Supported("rt_sigreturn", RtSigreturn),
+ 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+ 17: syscalls.Supported("pread64", Pread64),
+ 18: syscalls.Supported("pwrite64", Pwrite64),
+ 19: syscalls.Supported("readv", Readv),
+ 20: syscalls.Supported("writev", Writev),
+ 21: syscalls.Supported("access", Access),
+ 22: syscalls.Supported("pipe", Pipe),
+ 23: syscalls.Supported("select", Select),
+ 24: syscalls.Supported("sched_yield", SchedYield),
+ 25: syscalls.Supported("mremap", Mremap),
+ 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+ 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+ 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+ 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+ 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+ 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+ 32: syscalls.Supported("dup", Dup),
+ 33: syscalls.Supported("dup2", Dup2),
+ 34: syscalls.Supported("pause", Pause),
+ 35: syscalls.Supported("nanosleep", Nanosleep),
+ 36: syscalls.Supported("getitimer", Getitimer),
+ 37: syscalls.Supported("alarm", Alarm),
+ 38: syscalls.Supported("setitimer", Setitimer),
+ 39: syscalls.Supported("getpid", Getpid),
+ 40: syscalls.Supported("sendfile", Sendfile),
+ 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+ 42: syscalls.Supported("connect", Connect),
+ 43: syscalls.Supported("accept", Accept),
+ 44: syscalls.Supported("sendto", SendTo),
+ 45: syscalls.Supported("recvfrom", RecvFrom),
+ 46: syscalls.Supported("sendmsg", SendMsg),
+ 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+ 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+ 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+ 50: syscalls.Supported("listen", Listen),
+ 51: syscalls.Supported("getsockname", GetSockName),
+ 52: syscalls.Supported("getpeername", GetPeerName),
+ 53: syscalls.Supported("socketpair", SocketPair),
+ 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+ 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+ 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+ 57: syscalls.Supported("fork", Fork),
+ 58: syscalls.Supported("vfork", Vfork),
+ 59: syscalls.Supported("execve", Execve),
+ 60: syscalls.Supported("exit", Exit),
+ 61: syscalls.Supported("wait4", Wait4),
+ 62: syscalls.Supported("kill", Kill),
+ 63: syscalls.Supported("uname", Uname),
+ 64: syscalls.Supported("semget", Semget),
+ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+ 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 67: syscalls.Supported("shmdt", Shmdt),
+ 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+ 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+ 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+ 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+ 76: syscalls.Supported("truncate", Truncate),
+ 77: syscalls.Supported("ftruncate", Ftruncate),
+ 78: syscalls.Supported("getdents", Getdents),
+ 79: syscalls.Supported("getcwd", Getcwd),
+ 80: syscalls.Supported("chdir", Chdir),
+ 81: syscalls.Supported("fchdir", Fchdir),
+ 82: syscalls.Supported("rename", Rename),
+ 83: syscalls.Supported("mkdir", Mkdir),
+ 84: syscalls.Supported("rmdir", Rmdir),
+ 85: syscalls.Supported("creat", Creat),
+ 86: syscalls.Supported("link", Link),
+ 87: syscalls.Supported("unlink", Unlink),
+ 88: syscalls.Supported("symlink", Symlink),
+ 89: syscalls.Supported("readlink", Readlink),
+ 90: syscalls.Supported("chmod", Chmod),
+ 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+ 92: syscalls.Supported("chown", Chown),
+ 93: syscalls.Supported("fchown", Fchown),
+ 94: syscalls.Supported("lchown", Lchown),
+ 95: syscalls.Supported("umask", Umask),
+ 96: syscalls.Supported("gettimeofday", Gettimeofday),
+ 97: syscalls.Supported("getrlimit", Getrlimit),
+ 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+ 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+ 100: syscalls.Supported("times", Times),
+ 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+ 102: syscalls.Supported("getuid", Getuid),
+ 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+ 104: syscalls.Supported("getgid", Getgid),
+ 105: syscalls.Supported("setuid", Setuid),
+ 106: syscalls.Supported("setgid", Setgid),
+ 107: syscalls.Supported("geteuid", Geteuid),
+ 108: syscalls.Supported("getegid", Getegid),
+ 109: syscalls.Supported("setpgid", Setpgid),
+ 110: syscalls.Supported("getppid", Getppid),
+ 111: syscalls.Supported("getpgrp", Getpgrp),
+ 112: syscalls.Supported("setsid", Setsid),
+ 113: syscalls.Supported("setreuid", Setreuid),
+ 114: syscalls.Supported("setregid", Setregid),
+ 115: syscalls.Supported("getgroups", Getgroups),
+ 116: syscalls.Supported("setgroups", Setgroups),
+ 117: syscalls.Supported("setresuid", Setresuid),
+ 118: syscalls.Supported("getresuid", Getresuid),
+ 119: syscalls.Supported("setresgid", Setresgid),
+ 120: syscalls.Supported("getresgid", Getresgid),
+ 121: syscalls.Supported("getpgid", Getpgid),
+ 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 124: syscalls.Supported("getsid", Getsid),
+ 125: syscalls.Supported("capget", Capget),
+ 126: syscalls.Supported("capset", Capset),
+ 127: syscalls.Supported("rt_sigpending", RtSigpending),
+ 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+ 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+ 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+ 131: syscalls.Supported("sigaltstack", Sigaltstack),
+ 132: syscalls.Supported("utime", Utime),
+ 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil),
+ 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil),
+ 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+ 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil),
+ 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+ 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+ 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
+ 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+ 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
+ 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+ 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+ 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+ 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+ 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+ 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+ 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+ 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil),
+ 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+ 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil),
+ 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+ 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil),
+ 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+ 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+ 161: syscalls.Supported("chroot", Chroot),
+ 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
+ 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+ 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+ 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+ 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
+ 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+ 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+ 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+ 170: syscalls.Supported("sethostname", Sethostname),
+ 171: syscalls.Supported("setdomainname", Setdomainname),
+ 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
+ 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
+ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
+ 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+ 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+ 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+ 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+ 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+ 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+ 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
+ 186: syscalls.Supported("gettid", Gettid),
+ 187: syscalls.Supported("readahead", Readahead),
+ 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+ 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+ 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+ 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+ 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+ 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+ 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+ 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+ 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+ 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+ 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+ 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
+ 200: syscalls.Supported("tkill", Tkill),
+ 201: syscalls.Supported("time", Time),
+ 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+ 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+ 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+ 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+ 213: syscalls.Supported("epoll_create", EpollCreate),
+ 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil),
+ 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil),
+ 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+ 217: syscalls.Supported("getdents64", Getdents64),
+ 218: syscalls.Supported("set_tid_address", SetTidAddress),
+ 219: syscalls.Supported("restart_syscall", RestartSyscall),
+ 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
+ 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+ 222: syscalls.Supported("timer_create", TimerCreate),
+ 223: syscalls.Supported("timer_settime", TimerSettime),
+ 224: syscalls.Supported("timer_gettime", TimerGettime),
+ 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+ 226: syscalls.Supported("timer_delete", TimerDelete),
+ 227: syscalls.Supported("clock_settime", ClockSettime),
+ 228: syscalls.Supported("clock_gettime", ClockGettime),
+ 229: syscalls.Supported("clock_getres", ClockGetres),
+ 230: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+ 231: syscalls.Supported("exit_group", ExitGroup),
+ 232: syscalls.Supported("epoll_wait", EpollWait),
+ 233: syscalls.Supported("epoll_ctl", EpollCtl),
+ 234: syscalls.Supported("tgkill", Tgkill),
+ 235: syscalls.Supported("utimes", Utimes),
+ 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil),
+ 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+ 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+ 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+ 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+ 247: syscalls.Supported("waitid", Waitid),
+ 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+ 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+ 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
+ 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
+ 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+ 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+ 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+ 257: syscalls.Supported("openat", Openat),
+ 258: syscalls.Supported("mkdirat", Mkdirat),
+ 259: syscalls.Supported("mknodat", Mknodat),
+ 260: syscalls.Supported("fchownat", Fchownat),
+ 261: syscalls.Supported("futimesat", Futimesat),
+ 262: syscalls.Supported("fstatat", Fstatat),
+ 263: syscalls.Supported("unlinkat", Unlinkat),
+ 264: syscalls.Supported("renameat", Renameat),
+ 265: syscalls.Supported("linkat", Linkat),
+ 266: syscalls.Supported("symlinkat", Symlinkat),
+ 267: syscalls.Supported("readlinkat", Readlinkat),
+ 268: syscalls.Supported("fchmodat", Fchmodat),
+ 269: syscalls.Supported("faccessat", Faccessat),
+ 270: syscalls.Supported("pselect", Pselect),
+ 271: syscalls.Supported("ppoll", Ppoll),
+ 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+ 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 275: syscalls.Supported("splice", Splice),
+ 276: syscalls.Supported("tee", Tee),
+ 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+ 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+ 280: syscalls.Supported("utimensat", Utimensat),
+ 281: syscalls.Supported("epoll_pwait", EpollPwait),
+ 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+ 283: syscalls.Supported("timerfd_create", TimerfdCreate),
+ 284: syscalls.Supported("eventfd", Eventfd),
+ 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+ 286: syscalls.Supported("timerfd_settime", TimerfdSettime),
+ 287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
+ 288: syscalls.Supported("accept4", Accept4),
+ 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+ 290: syscalls.Supported("eventfd2", Eventfd2),
+ 291: syscalls.Supported("epoll_create1", EpollCreate1),
+ 292: syscalls.Supported("dup3", Dup3),
+ 293: syscalls.Supported("pipe2", Pipe2),
+ 294: syscalls.Supported("inotify_init1", InotifyInit1),
+ 295: syscalls.Supported("preadv", Preadv),
+ 296: syscalls.Supported("pwritev", Pwritev),
+ 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+ 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+ 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+ 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 302: syscalls.Supported("prlimit64", Prlimit64),
+ 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+ 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+ 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+ 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+ 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+ 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+ 309: syscalls.Supported("getcpu", Getcpu),
+ 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+ 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+ 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
+ 317: syscalls.Supported("seccomp", Seccomp),
+ 318: syscalls.Supported("getrandom", GetRandom),
+ 319: syscalls.Supported("memfd_create", MemfdCreate),
+ 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
+ 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+ 322: syscalls.Supported("execveat", Execveat),
+ 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+ 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
+ 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+ // Syscalls implemented after 325 are "backports" from versions
+ // of Linux after 4.4.
+ 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+ 327: syscalls.Supported("preadv2", Preadv2),
+ 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+ 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+ 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+ 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
+ 332: syscalls.Supported("statx", Statx),
+ 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+ 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+ // Linux skips ahead to syscall 424 to sync numbers between arches.
+ 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+ 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+ 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+ 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+ 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+ 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+ 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+ 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+ 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+ 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+ 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+ 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+ },
+ Emulate: map[usermem.Addr]uintptr{
+ 0xffffffffff600000: 96, // vsyscall gettimeofday(2)
+ 0xffffffffff600400: 201, // vsyscall time(2)
+ 0xffffffffff600800: 309, // vsyscall getcpu(2)
+ },
+ Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, syserror.ENOSYS
+ },
+}
+
+// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall
+// numbers from Linux 4.4.
+var ARM64 = &kernel.SyscallTable{
+ OS: abi.Linux,
+ Arch: arch.ARM64,
+ Version: kernel.Version{
+ Sysname: LinuxSysname,
+ Release: LinuxRelease,
+ Version: LinuxVersion,
+ },
+ AuditNumber: linux.AUDIT_ARCH_AARCH64,
+ Table: map[uintptr]kernel.Syscall{
+ 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+ 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+ 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+ 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+ 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+ 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+ 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+ 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+ 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+ 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+ 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+ 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
+ 17: syscalls.Supported("getcwd", Getcwd),
+ 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+ 19: syscalls.Supported("eventfd2", Eventfd2),
+ 20: syscalls.Supported("epoll_create1", EpollCreate1),
+ 21: syscalls.Supported("epoll_ctl", EpollCtl),
+ 22: syscalls.Supported("epoll_pwait", EpollPwait),
+ 23: syscalls.Supported("dup", Dup),
+ 24: syscalls.Supported("dup3", Dup3),
+ 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+ 26: syscalls.Supported("inotify_init1", InotifyInit1),
+ 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+ 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+ 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+ 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+ 33: syscalls.Supported("mknodat", Mknodat),
+ 34: syscalls.Supported("mkdirat", Mkdirat),
+ 35: syscalls.Supported("unlinkat", Unlinkat),
+ 36: syscalls.Supported("symlinkat", Symlinkat),
+ 37: syscalls.Supported("linkat", Linkat),
+ 38: syscalls.Supported("renameat", Renameat),
+ 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
+ 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+ 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+ 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+ 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+ 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+ 45: syscalls.Supported("truncate", Truncate),
+ 46: syscalls.Supported("ftruncate", Ftruncate),
+ 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+ 48: syscalls.Supported("faccessat", Faccessat),
+ 49: syscalls.Supported("chdir", Chdir),
+ 50: syscalls.Supported("fchdir", Fchdir),
+ 51: syscalls.Supported("chroot", Chroot),
+ 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+ 53: syscalls.Supported("fchmodat", Fchmodat),
+ 54: syscalls.Supported("fchownat", Fchownat),
+ 55: syscalls.Supported("fchown", Fchown),
+ 56: syscalls.Supported("openat", Openat),
+ 57: syscalls.Supported("close", Close),
+ 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+ 59: syscalls.Supported("pipe2", Pipe2),
+ 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+ 61: syscalls.Supported("getdents64", Getdents64),
+ 62: syscalls.Supported("lseek", Lseek),
+ 63: syscalls.Supported("read", Read),
+ 64: syscalls.Supported("write", Write),
+ 65: syscalls.Supported("readv", Readv),
+ 66: syscalls.Supported("writev", Writev),
+ 67: syscalls.Supported("pread64", Pread64),
+ 68: syscalls.Supported("pwrite64", Pwrite64),
+ 69: syscalls.Supported("preadv", Preadv),
+ 70: syscalls.Supported("pwritev", Pwritev),
+ 71: syscalls.Supported("sendfile", Sendfile),
+ 72: syscalls.Supported("pselect", Pselect),
+ 73: syscalls.Supported("ppoll", Ppoll),
+ 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+ 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 77: syscalls.Supported("tee", Tee),
+ 78: syscalls.Supported("readlinkat", Readlinkat),
+ 79: syscalls.Supported("fstatat", Fstatat),
+ 80: syscalls.Supported("fstat", Fstat),
+ 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
+ 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+ 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+ 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+ 85: syscalls.Supported("timerfd_create", TimerfdCreate),
+ 86: syscalls.Supported("timerfd_settime", TimerfdSettime),
+ 87: syscalls.Supported("timerfd_gettime", TimerfdGettime),
+ 88: syscalls.Supported("utimensat", Utimensat),
+ 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+ 90: syscalls.Supported("capget", Capget),
+ 91: syscalls.Supported("capset", Capset),
+ 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+ 93: syscalls.Supported("exit", Exit),
+ 94: syscalls.Supported("exit_group", ExitGroup),
+ 95: syscalls.Supported("waitid", Waitid),
+ 96: syscalls.Supported("set_tid_address", SetTidAddress),
+ 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+ 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+ 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 101: syscalls.Supported("nanosleep", Nanosleep),
+ 102: syscalls.Supported("getitimer", Getitimer),
+ 103: syscalls.Supported("setitimer", Setitimer),
+ 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+ 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+ 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+ 107: syscalls.Supported("timer_create", TimerCreate),
+ 108: syscalls.Supported("timer_gettime", TimerGettime),
+ 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+ 110: syscalls.Supported("timer_settime", TimerSettime),
+ 111: syscalls.Supported("timer_delete", TimerDelete),
+ 112: syscalls.Supported("clock_settime", ClockSettime),
+ 113: syscalls.Supported("clock_gettime", ClockGettime),
+ 114: syscalls.Supported("clock_getres", ClockGetres),
+ 115: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+ 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+ 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+ 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+ 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+ 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+ 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+ 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+ 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+ 124: syscalls.Supported("sched_yield", SchedYield),
+ 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+ 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+ 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+ 128: syscalls.Supported("restart_syscall", RestartSyscall),
+ 129: syscalls.Supported("kill", Kill),
+ 130: syscalls.Supported("tkill", Tkill),
+ 131: syscalls.Supported("tgkill", Tgkill),
+ 132: syscalls.Supported("sigaltstack", Sigaltstack),
+ 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+ 134: syscalls.Supported("rt_sigaction", RtSigaction),
+ 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+ 136: syscalls.Supported("rt_sigpending", RtSigpending),
+ 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+ 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+ 139: syscalls.Supported("rt_sigreturn", RtSigreturn),
+ 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
+ 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+ 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+ 143: syscalls.Supported("setregid", Setregid),
+ 144: syscalls.Supported("setgid", Setgid),
+ 145: syscalls.Supported("setreuid", Setreuid),
+ 146: syscalls.Supported("setuid", Setuid),
+ 147: syscalls.Supported("setresuid", Setresuid),
+ 148: syscalls.Supported("getresuid", Getresuid),
+ 149: syscalls.Supported("setresgid", Setresgid),
+ 150: syscalls.Supported("getresgid", Getresgid),
+ 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 153: syscalls.Supported("times", Times),
+ 154: syscalls.Supported("setpgid", Setpgid),
+ 155: syscalls.Supported("getpgid", Getpgid),
+ 156: syscalls.Supported("getsid", Getsid),
+ 157: syscalls.Supported("setsid", Setsid),
+ 158: syscalls.Supported("getgroups", Getgroups),
+ 159: syscalls.Supported("setgroups", Setgroups),
+ 160: syscalls.Supported("uname", Uname),
+ 161: syscalls.Supported("sethostname", Sethostname),
+ 162: syscalls.Supported("setdomainname", Setdomainname),
+ 163: syscalls.Supported("getrlimit", Getrlimit),
+ 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+ 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+ 166: syscalls.Supported("umask", Umask),
+ 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+ 168: syscalls.Supported("getcpu", Getcpu),
+ 169: syscalls.Supported("gettimeofday", Gettimeofday),
+ 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+ 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+ 172: syscalls.Supported("getpid", Getpid),
+ 173: syscalls.Supported("getppid", Getppid),
+ 174: syscalls.Supported("getuid", Getuid),
+ 175: syscalls.Supported("geteuid", Geteuid),
+ 176: syscalls.Supported("getgid", Getgid),
+ 177: syscalls.Supported("getegid", Getegid),
+ 178: syscalls.Supported("gettid", Gettid),
+ 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+ 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 190: syscalls.Supported("semget", Semget),
+ 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
+ 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+ 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+ 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+ 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+ 197: syscalls.Supported("shmdt", Shmdt),
+ 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+ 199: syscalls.Supported("socketpair", SocketPair),
+ 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+ 201: syscalls.Supported("listen", Listen),
+ 202: syscalls.Supported("accept", Accept),
+ 203: syscalls.Supported("connect", Connect),
+ 204: syscalls.Supported("getsockname", GetSockName),
+ 205: syscalls.Supported("getpeername", GetPeerName),
+ 206: syscalls.Supported("sendto", SendTo),
+ 207: syscalls.Supported("recvfrom", RecvFrom),
+ 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+ 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+ 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+ 211: syscalls.Supported("sendmsg", SendMsg),
+ 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+ 213: syscalls.Supported("readahead", Readahead),
+ 214: syscalls.Supported("brk", Brk),
+ 215: syscalls.Supported("munmap", Munmap),
+ 216: syscalls.Supported("mremap", Mremap),
+ 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+ 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+ 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
+ 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+ 221: syscalls.Supported("execve", Execve),
+ 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+ 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+ 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+ 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+ 226: syscalls.Supported("mprotect", Mprotect),
+ 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+ 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+ 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+ 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+ 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+ 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+ 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+ 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+ 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+ 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+ 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+ 242: syscalls.Supported("accept4", Accept4),
+ 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+ 260: syscalls.Supported("wait4", Wait4),
+ 261: syscalls.Supported("prlimit64", Prlimit64),
+ 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+ 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+ 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+ 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+ 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+ 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+ 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+ 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+ 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
+ 277: syscalls.Supported("seccomp", Seccomp),
+ 278: syscalls.Supported("getrandom", GetRandom),
+ 279: syscalls.Supported("memfd_create", MemfdCreate),
+ 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+ 281: syscalls.Supported("execveat", Execveat),
+ 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+ 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
+ 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+ // Syscalls after 284 are "backports" from versions of Linux after 4.4.
+ 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+ 286: syscalls.Supported("preadv2", Preadv2),
+ 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+ 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+ 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+ 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
+ 291: syscalls.Supported("statx", Statx),
+ 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+ 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+ // Linux skips ahead to syscall 424 to sync numbers between arches.
+ 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+ 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+ 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+ 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+ 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+ 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+ 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+ 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+ 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+ 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+ 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+ 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+ },
+ Emulate: map[usermem.Addr]uintptr{},
+ Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, syserror.ENOSYS
+ },
+}
+
+func init() {
+ kernel.RegisterSyscallTable(AMD64)
+ kernel.RegisterSyscallTable(ARM64)
+}
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
new file mode 100644
index 000000000..434559b80
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -0,0 +1,71 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
+// STOP are clear.
+//
+// TODO(gvisor.dev/issue/1624): This is only exported because
+// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2
+// syscalls are moved into this package, then they can be unexported.
+func CopyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) {
+ if size != linux.SignalSetSize {
+ return 0, syserror.EINVAL
+ }
+ b := t.CopyScratchBuffer(8)
+ if _, err := t.CopyInBytes(sigSetAddr, b); err != nil {
+ return 0, err
+ }
+ mask := usermem.ByteOrder.Uint64(b[:])
+ return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil
+}
+
+// copyOutSigSet copies out a sigset_t.
+func copyOutSigSet(t *kernel.Task, sigSetAddr usermem.Addr, mask linux.SignalSet) error {
+ b := t.CopyScratchBuffer(8)
+ usermem.ByteOrder.PutUint64(b, uint64(mask))
+ _, err := t.CopyOutBytes(sigSetAddr, b)
+ return err
+}
+
+// copyInSigSetWithSize copies in a structure as below
+//
+// struct {
+// sigset_t* sigset_addr;
+// size_t sizeof_sigset;
+// };
+//
+// and returns sigset_addr and size.
+func copyInSigSetWithSize(t *kernel.Task, addr usermem.Addr) (usermem.Addr, uint, error) {
+ switch t.Arch().Width() {
+ case 8:
+ in := t.CopyScratchBuffer(16)
+ if _, err := t.CopyInBytes(addr, in); err != nil {
+ return 0, 0, err
+ }
+ maskAddr := usermem.Addr(usermem.ByteOrder.Uint64(in[0:]))
+ maskSize := uint(usermem.ByteOrder.Uint64(in[8:]))
+ return maskAddr, maskSize, nil
+ default:
+ return 0, 0, syserror.ENOSYS
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
new file mode 100644
index 000000000..ba2557c52
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -0,0 +1,382 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSetup implements linux syscall io_setup(2).
+func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nrEvents := args[0].Int()
+ idAddr := args[1].Pointer()
+
+ // Linux uses the native long as the aio ID.
+ //
+ // The context pointer _must_ be zero initially.
+ var idIn uint64
+ if _, err := t.CopyIn(idAddr, &idIn); err != nil {
+ return 0, nil, err
+ }
+ if idIn != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents))
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Copy out the new ID.
+ if _, err := t.CopyOut(idAddr, &id); err != nil {
+ t.MemoryManager().DestroyAIOContext(t, id)
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// IoDestroy implements linux syscall io_destroy(2).
+func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Uint64()
+
+ ctx := t.MemoryManager().DestroyAIOContext(t, id)
+ if ctx == nil {
+ // Does not exist.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Drain completed requests amd wait for pending requests until there are no
+ // more.
+ for {
+ ctx.Drain()
+
+ ch := ctx.WaitChannel()
+ if ch == nil {
+ // No more requests, we're done.
+ return 0, nil, nil
+ }
+ // The task cannot be interrupted during the wait. Equivalent to
+ // TASK_UNINTERRUPTIBLE in Linux.
+ t.UninterruptibleSleepStart(true /* deactivate */)
+ <-ch
+ t.UninterruptibleSleepFinish(true /* activate */)
+ }
+}
+
+// IoGetevents implements linux syscall io_getevents(2).
+func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Uint64()
+ minEvents := args[1].Int()
+ events := args[2].Int()
+ eventsAddr := args[3].Pointer()
+ timespecAddr := args[4].Pointer()
+
+ // Sanity check arguments.
+ if minEvents < 0 || minEvents > events {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Setup the timeout.
+ var haveDeadline bool
+ var deadline ktime.Time
+ if timespecAddr != 0 {
+ d, err := copyTimespecIn(t, timespecAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if !d.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+ deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration())
+ haveDeadline = true
+ }
+
+ // Loop over all requests.
+ for count := int32(0); count < events; count++ {
+ // Get a request, per semantics.
+ var v interface{}
+ if count >= minEvents {
+ var ok bool
+ v, ok = ctx.PopRequest()
+ if !ok {
+ return uintptr(count), nil, nil
+ }
+ } else {
+ var err error
+ v, err = waitForRequest(ctx, t, haveDeadline, deadline)
+ if err != nil {
+ if count > 0 || err == syserror.ETIMEDOUT {
+ return uintptr(count), nil, nil
+ }
+ return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+ }
+ }
+
+ ev := v.(*linux.IOEvent)
+
+ // Copy out the result.
+ if _, err := t.CopyOut(eventsAddr, ev); err != nil {
+ if count > 0 {
+ return uintptr(count), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Keep rolling.
+ eventsAddr += usermem.Addr(linux.IOEventSize)
+ }
+
+ // Everything finished.
+ return uintptr(events), nil, nil
+}
+
+func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
+ for {
+ if v, ok := ctx.PopRequest(); ok {
+ // Request was readily available. Just return it.
+ return v, nil
+ }
+
+ // Need to wait for request completion.
+ done := ctx.WaitChannel()
+ if done == nil {
+ // Context has been destroyed.
+ return nil, syserror.EINVAL
+ }
+ if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil {
+ return nil, err
+ }
+ }
+}
+
+// memoryFor returns appropriate memory for the given callback.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+ bytes := int(cb.Bytes)
+ if bytes < 0 {
+ // Linux also requires that this field fit in ssize_t.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+
+ // Since this I/O will be asynchronous with respect to t's task goroutine,
+ // we have no guarantee that t's AddressSpace will be active during the
+ // I/O.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+ return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+ return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+ return usermem.IOSequence{}, nil
+
+ default:
+ // Not a supported command.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+}
+
+// IoCancel implements linux syscall io_cancel(2).
+//
+// It is not presently supported (ENOSYS indicates no support on this
+// architecture).
+func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, syserror.ENOSYS
+}
+
+// LINT.IfChange
+
+func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback {
+ return func(ctx context.Context) {
+ if actx.Dead() {
+ actx.CancelPendingRequest()
+ return
+ }
+ ev := &linux.IOEvent{
+ Data: cb.Data,
+ Obj: uint64(cbAddr),
+ }
+
+ var err error
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+ ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset)
+ case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset)
+ case linux.IOCB_CMD_FSYNC:
+ err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ case linux.IOCB_CMD_FDSYNC:
+ err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData)
+ }
+
+ // Update the result.
+ if err != nil {
+ err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
+ ev.Result = -int64(kernel.ExtractErrno(err, 0))
+ }
+
+ file.DecRef()
+
+ // Queue the result for delivery.
+ actx.FinishRequest(ev)
+
+ // Notify the event file if one was specified. This needs to happen
+ // *after* queueing the result to avoid racing with the thread we may
+ // wake up.
+ if eventFile != nil {
+ eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
+ eventFile.DecRef()
+ }
+ }
+}
+
+// submitCallback processes a single callback.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+ file := t.GetFile(cb.FD)
+ if file == nil {
+ // File not found.
+ return syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Was there an eventFD? Extract it.
+ var eventFile *fs.File
+ if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+ eventFile = t.GetFile(cb.ResFD)
+ if eventFile == nil {
+ // Bad FD.
+ return syserror.EBADF
+ }
+ defer eventFile.DecRef()
+
+ // Check that it is an eventfd.
+ if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok {
+ // Not an event FD.
+ return syserror.EINVAL
+ }
+ }
+
+ ioseq, err := memoryFor(t, cb)
+ if err != nil {
+ return err
+ }
+
+ // Check offset for reads/writes.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ if cb.Offset < 0 {
+ return syserror.EINVAL
+ }
+ }
+
+ // Prepare the request.
+ ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ready := ctx.Prepare(); !ready {
+ // Context is busy.
+ return syserror.EAGAIN
+ }
+
+ if eventFile != nil {
+ // The request is set. Make sure there's a ref on the file.
+ //
+ // This is necessary when the callback executes on completion,
+ // which is also what will release this reference.
+ eventFile.IncRef()
+ }
+
+ // Perform the request asynchronously.
+ file.IncRef()
+ t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile))
+
+ // All set.
+ return nil
+}
+
+// IoSubmit implements linux syscall io_submit(2).
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Uint64()
+ nrEvents := args[1].Int()
+ addr := args[2].Pointer()
+
+ if nrEvents < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ for i := int32(0); i < nrEvents; i++ {
+ // Copy in the address.
+ cbAddrNative := t.Arch().Native(0)
+ if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
+ if i > 0 {
+ // Some successful.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Copy in this callback.
+ var cb linux.IOCallback
+ cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
+ if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+
+ if i > 0 {
+ // Some have been successful.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Process this callback.
+ if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+ if i > 0 {
+ // Partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Advance to the next one.
+ addr += usermem.Addr(t.Arch().Width())
+ }
+
+ return uintptr(nrEvents), nil, nil
+}
+
+// LINT.ThenChange(vfs2/aio.go)
diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go
new file mode 100644
index 000000000..adf5ea5f2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_capability.go
@@ -0,0 +1,149 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) {
+ if tid < 0 {
+ err = syserror.EINVAL
+ return
+ }
+ if tid > 0 {
+ t = t.PIDNamespace().TaskWithID(tid)
+ }
+ if t == nil {
+ err = syserror.ESRCH
+ return
+ }
+ creds := t.Credentials()
+ permitted, inheritable, effective = creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps
+ return
+}
+
+// Capget implements Linux syscall capget.
+func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ hdrAddr := args[0].Pointer()
+ dataAddr := args[1].Pointer()
+
+ var hdr linux.CapUserHeader
+ if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ return 0, nil, err
+ }
+ // hdr.Pid doesn't need to be valid if this capget() is a "version probe"
+ // (hdr.Version is unrecognized and dataAddr is null), so we can't do the
+ // lookup yet.
+ switch hdr.Version {
+ case linux.LINUX_CAPABILITY_VERSION_1:
+ if dataAddr == 0 {
+ return 0, nil, nil
+ }
+ p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid))
+ if err != nil {
+ return 0, nil, err
+ }
+ data := linux.CapUserData{
+ Effective: uint32(e),
+ Permitted: uint32(p),
+ Inheritable: uint32(i),
+ }
+ _, err = t.CopyOut(dataAddr, &data)
+ return 0, nil, err
+
+ case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
+ if dataAddr == 0 {
+ return 0, nil, nil
+ }
+ p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid))
+ if err != nil {
+ return 0, nil, err
+ }
+ data := [2]linux.CapUserData{
+ {
+ Effective: uint32(e),
+ Permitted: uint32(p),
+ Inheritable: uint32(i),
+ },
+ {
+ Effective: uint32(e >> 32),
+ Permitted: uint32(p >> 32),
+ Inheritable: uint32(i >> 32),
+ },
+ }
+ _, err = t.CopyOut(dataAddr, &data)
+ return 0, nil, err
+
+ default:
+ hdr.Version = linux.HighestCapabilityVersion
+ if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+ return 0, nil, err
+ }
+ if dataAddr != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ return 0, nil, nil
+ }
+}
+
+// Capset implements Linux syscall capset.
+func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ hdrAddr := args[0].Pointer()
+ dataAddr := args[1].Pointer()
+
+ var hdr linux.CapUserHeader
+ if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ return 0, nil, err
+ }
+ switch hdr.Version {
+ case linux.LINUX_CAPABILITY_VERSION_1:
+ if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() {
+ return 0, nil, syserror.EPERM
+ }
+ var data linux.CapUserData
+ if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ return 0, nil, err
+ }
+ p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities
+ i := auth.CapabilitySet(data.Inheritable) & auth.AllCapabilities
+ e := auth.CapabilitySet(data.Effective) & auth.AllCapabilities
+ return 0, nil, t.SetCapabilitySets(p, i, e)
+
+ case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
+ if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() {
+ return 0, nil, syserror.EPERM
+ }
+ var data [2]linux.CapUserData
+ if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ return 0, nil, err
+ }
+ p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities
+ i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities
+ e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities
+ return 0, nil, t.SetCapabilitySets(p, i, e)
+
+ default:
+ hdr.Version = linux.HighestCapabilityVersion
+ if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
new file mode 100644
index 000000000..dd43cf18d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors. We implement the default one in linux 3.11
+// x86_64:
+// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+ stack := args[1].Pointer()
+ parentTID := args[2].Pointer()
+ childTID := args[3].Pointer()
+ tls := args[4].Pointer()
+ return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
new file mode 100644
index 000000000..cf68a8949
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors, and we implement the default one in linux 3.11
+// arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file):
+// sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+ stack := args[1].Pointer()
+ parentTID := args[2].Pointer()
+ tls := args[3].Pointer()
+ childTID := args[4].Pointer()
+ return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
new file mode 100644
index 000000000..7f460d30b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -0,0 +1,147 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+// EpollCreate1 implements the epoll_create1(2) linux syscall.
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ if flags & ^linux.EPOLL_CLOEXEC != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ closeOnExec := flags&linux.EPOLL_CLOEXEC != 0
+ fd, err := syscalls.CreateEpoll(t, closeOnExec)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements the epoll_create(2) linux syscall.
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ size := args[0].Int()
+
+ if size <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ fd, err := syscalls.CreateEpoll(t, false)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements the epoll_ctl(2) linux syscall.
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ op := args[1].Int()
+ fd := args[2].Int()
+ eventAddr := args[3].Pointer()
+
+ // Capture the event state if needed.
+ flags := epoll.EntryFlags(0)
+ mask := waiter.EventMask(0)
+ var data [2]int32
+ if op != linux.EPOLL_CTL_DEL {
+ var e linux.EpollEvent
+ if _, err := e.CopyIn(t, eventAddr); err != nil {
+ return 0, nil, err
+ }
+
+ if e.Events&linux.EPOLLONESHOT != 0 {
+ flags |= epoll.OneShot
+ }
+
+ if e.Events&linux.EPOLLET != 0 {
+ flags |= epoll.EdgeTriggered
+ }
+
+ mask = waiter.EventMaskFromLinux(e.Events)
+ data = e.Data
+ }
+
+ // Perform the requested operations.
+ switch op {
+ case linux.EPOLL_CTL_ADD:
+ // See fs/eventpoll.c.
+ mask |= waiter.EventHUp | waiter.EventErr
+ return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data)
+ case linux.EPOLL_CTL_DEL:
+ return 0, nil, syscalls.RemoveEpoll(t, epfd, fd)
+ case linux.EPOLL_CTL_MOD:
+ // Same as EPOLL_CTL_ADD.
+ mask |= waiter.EventHUp | waiter.EventErr
+ return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data)
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+// EpollWait implements the epoll_wait(2) linux syscall.
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ eventsAddr := args[1].Pointer()
+ maxEvents := int(args[2].Int())
+ timeout := int(args[3].Int())
+
+ r, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout)
+ if err != nil {
+ return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+ }
+
+ if len(r) != 0 {
+ if _, err := linux.CopyEpollEventSliceOut(t, eventsAddr, r); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ return uintptr(len(r)), nil, nil
+}
+
+// EpollPwait implements the epoll_pwait(2) linux syscall.
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ maskAddr := args[4].Pointer()
+ maskSize := uint(args[5].Uint())
+
+ if maskAddr != 0 {
+ mask, err := CopyInSigSet(t, maskAddr, maskSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ oldmask := t.SignalMask()
+ t.SetSignalMask(mask)
+ t.SetSavedSignalMask(oldmask)
+ }
+
+ return EpollWait(t, args)
+}
+
+// LINT.ThenChange(vfs2/epoll.go)
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
new file mode 100644
index 000000000..ed3413ca6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Eventfd2 implements linux syscall eventfd2(2).
+func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ initVal := args[0].Int()
+ flags := uint(args[1].Uint())
+ allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)
+
+ if flags & ^allOps != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0)
+ event.SetFlags(fs.SettableFileFlags{
+ NonBlocking: flags&linux.EFD_NONBLOCK != 0,
+ })
+ defer event.DecRef()
+
+ fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
+ CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// Eventfd implements linux syscall eventfd(2).
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[1].Value = 0
+ return Eventfd2(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
new file mode 100644
index 000000000..2797c6a72
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -0,0 +1,2238 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileOpAt performs an operation on the second last component in the path.
+func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
+ // Extract the last component.
+ dir, name := fs.SplitLast(path)
+ if dir == "/" {
+ // Common case: we are accessing a file in the root.
+ root := t.FSContext().RootDirectory()
+ err := fn(root, root, name, linux.MaxSymlinkTraversals)
+ root.DecRef()
+ return err
+ } else if dir == "." && dirFD == linux.AT_FDCWD {
+ // Common case: we are accessing a file relative to the current
+ // working directory; skip the look-up.
+ wd := t.FSContext().WorkingDirectory()
+ root := t.FSContext().RootDirectory()
+ err := fn(root, wd, name, linux.MaxSymlinkTraversals)
+ wd.DecRef()
+ root.DecRef()
+ return err
+ }
+
+ return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error {
+ return fn(root, d, name, remainingTraversals)
+ })
+}
+
+// fileOpOn performs an operation on the last entry of the path.
+func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
+ var (
+ d *fs.Dirent // The file.
+ wd *fs.Dirent // The working directory (if required.)
+ rel *fs.Dirent // The relative directory for search (if required.)
+ f *fs.File // The file corresponding to dirFD (if required.)
+ err error
+ )
+
+ // Extract the working directory (maybe).
+ if len(path) > 0 && path[0] == '/' {
+ // Absolute path; rel can be nil.
+ } else if dirFD == linux.AT_FDCWD {
+ // Need to reference the working directory.
+ wd = t.FSContext().WorkingDirectory()
+ rel = wd
+ } else {
+ // Need to extract the given FD.
+ f = t.GetFile(dirFD)
+ if f == nil {
+ return syserror.EBADF
+ }
+ rel = f.Dirent
+ if !fs.IsDir(rel.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+ }
+
+ // Grab the root (always required.)
+ root := t.FSContext().RootDirectory()
+
+ // Lookup the node.
+ remainingTraversals := uint(linux.MaxSymlinkTraversals)
+ if resolve {
+ d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals)
+ } else {
+ d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
+ }
+ root.DecRef()
+ if wd != nil {
+ wd.DecRef()
+ }
+ if f != nil {
+ f.DecRef()
+ }
+ if err != nil {
+ return err
+ }
+
+ err = fn(root, d, remainingTraversals)
+ d.DecRef()
+ return err
+}
+
+// copyInPath copies a path in.
+func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) {
+ path, err = t.CopyInString(addr, linux.PATH_MAX)
+ if err != nil {
+ return "", false, err
+ }
+ if path == "" && !allowEmpty {
+ return "", false, syserror.ENOENT
+ }
+
+ // If the path ends with a /, then checks must be enforced in various
+ // ways in the different callers. We pass this back to the caller.
+ path, dirPath = fs.TrimTrailingSlashes(path)
+
+ return path, dirPath, nil
+}
+
+// LINT.IfChange
+
+func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, err
+ }
+
+ resolve := flags&linux.O_NOFOLLOW == 0
+ err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // First check a few things about the filesystem before trying to get the file
+ // reference.
+ //
+ // It's required that Check does not try to open files not that aren't backed by
+ // this dirent (e.g. pipes and sockets) because this would result in opening these
+ // files an extra time just to check permissions.
+ if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+ return err
+ }
+
+ if fs.IsSymlink(d.Inode.StableAttr) && !resolve {
+ return syserror.ELOOP
+ }
+
+ fileFlags := linuxToFlags(flags)
+ // Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+ fileFlags.LargeFile = true
+ if fs.IsDir(d.Inode.StableAttr) {
+ // Don't allow directories to be opened writable.
+ if fileFlags.Write {
+ return syserror.EISDIR
+ }
+ } else {
+ // If O_DIRECTORY is set, but the file is not a directory, then fail.
+ if fileFlags.Directory {
+ return syserror.ENOTDIR
+ }
+ // If it's a directory, then make sure.
+ if dirPath {
+ return syserror.ENOTDIR
+ }
+ }
+
+ // Truncate is called when O_TRUNC is specified for any kind of
+ // existing Dirent. Behavior is delegated to the entry's Truncate
+ // implementation.
+ if flags&linux.O_TRUNC != 0 {
+ if err := d.Inode.Truncate(t, d, 0); err != nil {
+ return err
+ }
+ }
+
+ file, err := d.Inode.GetFile(t, d, fileFlags)
+ if err != nil {
+ return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ }
+ defer file.DecRef()
+
+ // Success.
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Set return result in frame.
+ fd = uintptr(newFD)
+
+ // Generate notification for opened file.
+ d.InotifyEvent(linux.IN_OPEN, 0)
+
+ return nil
+ })
+ return fd, err // Use result in frame.
+}
+
+func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+ if dirPath {
+ return syserror.ENOENT
+ }
+
+ return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Do we have the appropriate permissions on the parent?
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+
+ // Attempt a creation.
+ perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+
+ switch mode.FileType() {
+ case 0:
+ // "Zero file type is equivalent to type S_IFREG." - mknod(2)
+ fallthrough
+ case linux.ModeRegular:
+ // We are not going to return the file, so the actual
+ // flags used don't matter, but they cannot be empty or
+ // Create will complain.
+ flags := fs.FileFlags{Read: true, Write: true}
+ file, err := d.Create(t, root, name, flags, perms)
+ if err != nil {
+ return err
+ }
+ file.DecRef()
+ return nil
+
+ case linux.ModeNamedPipe:
+ return d.CreateFifo(t, root, name, perms)
+
+ case linux.ModeSocket:
+ // While it is possible create a unix domain socket file on linux
+ // using mknod(2), in practice this is pretty useless from an
+ // application. Linux internally uses mknod() to create the socket
+ // node during bind(2), but we implement bind(2) independently. If
+ // an application explicitly creates a socket node using mknod(),
+ // you can't seem to bind() or connect() to the resulting socket.
+ //
+ // Instead of emulating this seemingly useless behaviour, we'll
+ // indicate that the filesystem doesn't support the creation of
+ // sockets.
+ return syserror.EOPNOTSUPP
+
+ case linux.ModeCharacterDevice:
+ fallthrough
+ case linux.ModeBlockDevice:
+ // TODO(b/72101894): We don't support creating block or character
+ // devices at the moment.
+ //
+ // When we start supporting block and character devices, we'll
+ // need to check for CAP_MKNOD here.
+ return syserror.EPERM
+
+ default:
+ // "EINVAL - mode requested creation of something other than a
+ // regular file, device special file, FIFO or socket." - mknod(2)
+ return syserror.EINVAL
+ }
+ })
+}
+
+// Mknod implements the linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ path := args[0].Pointer()
+ mode := linux.FileMode(args[1].ModeT())
+ // We don't need this argument until we support creation of device nodes.
+ _ = args[2].Uint() // dev
+
+ return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode)
+}
+
+// Mknodat implements the linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ path := args[1].Pointer()
+ mode := linux.FileMode(args[2].ModeT())
+ // We don't need this argument until we support creation of device nodes.
+ _ = args[3].Uint() // dev
+
+ return 0, nil, mknodAt(t, dirFD, path, mode)
+}
+
+func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, err
+ }
+ if dirPath {
+ return 0, syserror.ENOENT
+ }
+
+ fileFlags := linuxToFlags(flags)
+ // Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+ fileFlags.LargeFile = true
+
+ err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error {
+ // Resolve the name to see if it exists, and follow any
+ // symlinks along the way. We must do the symlink resolution
+ // manually because if the symlink target does not exist, we
+ // must create the target (and not the symlink itself).
+ var (
+ found *fs.Dirent
+ err error
+ )
+ for {
+ if !fs.IsDir(parent.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Start by looking up the dirent at 'name'.
+ found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals)
+ if err != nil {
+ break
+ }
+ defer found.DecRef()
+
+ // We found something (possibly a symlink). If the
+ // O_EXCL flag was passed, then we can immediately
+ // return EEXIST.
+ if flags&linux.O_EXCL != 0 {
+ return syserror.EEXIST
+ }
+
+ // If we have a non-symlink, then we can proceed.
+ if !fs.IsSymlink(found.Inode.StableAttr) {
+ break
+ }
+
+ // If O_NOFOLLOW was passed, then don't try to resolve
+ // anything.
+ if flags&linux.O_NOFOLLOW != 0 {
+ return syserror.ELOOP
+ }
+
+ // Try to resolve the symlink directly to a Dirent.
+ var resolved *fs.Dirent
+ resolved, err = found.Inode.Getlink(t)
+ if err == nil {
+ // No more resolution necessary.
+ defer resolved.DecRef()
+ break
+ }
+ if err != fs.ErrResolveViaReadlink {
+ return err
+ }
+
+ // Are we able to resolve further?
+ if remainingTraversals == 0 {
+ return syscall.ELOOP
+ }
+
+ // Resolve the symlink to a path via Readlink.
+ var path string
+ path, err = found.Inode.Readlink(t)
+ if err != nil {
+ break
+ }
+ remainingTraversals--
+
+ // Get the new parent from the target path.
+ var newParent *fs.Dirent
+ newParentPath, newName := fs.SplitLast(path)
+ newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
+ if err != nil {
+ break
+ }
+ defer newParent.DecRef()
+
+ // Repeat the process with the parent and name of the
+ // symlink target.
+ parent = newParent
+ name = newName
+ }
+
+ var newFile *fs.File
+ switch err {
+ case nil:
+ // Like sys_open, check for a few things about the
+ // filesystem before trying to get a reference to the
+ // fs.File. The same constraints on Check apply.
+ if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+ return err
+ }
+
+ // Truncate is called when O_TRUNC is specified for any kind of
+ // existing Dirent. Behavior is delegated to the entry's Truncate
+ // implementation.
+ if flags&linux.O_TRUNC != 0 {
+ if err := found.Inode.Truncate(t, found, 0); err != nil {
+ return err
+ }
+ }
+
+ // Create a new fs.File.
+ newFile, err = found.Inode.GetFile(t, found, fileFlags)
+ if err != nil {
+ return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ }
+ defer newFile.DecRef()
+ case syserror.ENOENT:
+ // File does not exist. Proceed with creation.
+
+ // Do we have write permissions on the parent?
+ if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+
+ // Attempt a creation.
+ perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+ newFile, err = parent.Create(t, root, name, fileFlags, perms)
+ if err != nil {
+ // No luck, bail.
+ return err
+ }
+ defer newFile.DecRef()
+ found = newFile.Dirent
+ default:
+ return err
+ }
+
+ // Success.
+ newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Set result in frame.
+ fd = uintptr(newFD)
+
+ // Queue the open inotify event. The creation event is
+ // automatically queued when the dirent is found. The open
+ // events are implemented at the syscall layer so we need to
+ // manually queue one here.
+ found.InotifyEvent(linux.IN_OPEN, 0)
+
+ return nil
+ })
+ return fd, err // Use result in frame.
+}
+
+// Open implements linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := uint(args[1].Uint())
+ if flags&linux.O_CREAT != 0 {
+ mode := linux.FileMode(args[2].ModeT())
+ n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode)
+ return n, nil, err
+ }
+ n, err := openAt(t, linux.AT_FDCWD, addr, flags)
+ return n, nil, err
+}
+
+// Openat implements linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ addr := args[1].Pointer()
+ flags := uint(args[2].Uint())
+ if flags&linux.O_CREAT != 0 {
+ mode := linux.FileMode(args[3].ModeT())
+ n, err := createAt(t, dirFD, addr, flags, mode)
+ return n, nil, err
+ }
+ n, err := openAt(t, dirFD, addr, flags)
+ return n, nil, err
+}
+
+// Creat implements linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := linux.FileMode(args[1].ModeT())
+ n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode)
+ return n, nil, err
+}
+
+// accessContext is a context that overrides the credentials used, but
+// otherwise carries the same values as the embedded context.
+//
+// accessContext should only be used for access(2).
+type accessContext struct {
+ context.Context
+ creds *auth.Credentials
+}
+
+// Value implements context.Context.
+func (ac accessContext) Value(key interface{}) interface{} {
+ switch key {
+ case auth.CtxCredentials:
+ return ac.creds
+ default:
+ return ac.Context.Value(key)
+ }
+}
+
+func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error {
+ const rOK = 4
+ const wOK = 2
+ const xOK = 1
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ // Sanity check the mode.
+ if mode&^(rOK|wOK|xOK) != 0 {
+ return syserror.EINVAL
+ }
+
+ return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // access(2) and faccessat(2) check permissions using real
+ // UID/GID, not effective UID/GID.
+ //
+ // "access() needs to use the real uid/gid, not the effective
+ // uid/gid. We do this by temporarily clearing all FS-related
+ // capabilities and switching the fsuid/fsgid around to the
+ // real ones." -fs/open.c:faccessat
+ creds := t.Credentials().Fork()
+ creds.EffectiveKUID = creds.RealKUID
+ creds.EffectiveKGID = creds.RealKGID
+ if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+ creds.EffectiveCaps = creds.PermittedCaps
+ } else {
+ creds.EffectiveCaps = 0
+ }
+
+ ctx := &accessContext{
+ Context: t,
+ creds: creds,
+ }
+
+ return d.Inode.CheckPermission(ctx, fs.PermMask{
+ Read: mode&rOK != 0,
+ Write: mode&wOK != 0,
+ Execute: mode&xOK != 0,
+ })
+ })
+}
+
+// Access implements linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := args[1].ModeT()
+
+ return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Faccessat implements linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat(). If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ addr := args[1].Pointer()
+ mode := args[2].ModeT()
+
+ return 0, nil, accessAt(t, dirFD, addr, mode)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
+// Ioctl implements linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ request := int(args[1].Int())
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Shared flags between file and socket.
+ switch request {
+ case linux.FIONCLEX:
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
+ CloseOnExec: false,
+ })
+ return 0, nil, nil
+ case linux.FIOCLEX:
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
+ CloseOnExec: true,
+ })
+ return 0, nil, nil
+
+ case linux.FIONBIO:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.Flags()
+ if set != 0 {
+ flags.NonBlocking = true
+ } else {
+ flags.NonBlocking = false
+ }
+ file.SetFlags(flags.Settable())
+ return 0, nil, nil
+
+ case linux.FIOASYNC:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.Flags()
+ if set != 0 {
+ flags.Async = true
+ } else {
+ flags.Async = false
+ }
+ file.SetFlags(flags.Settable())
+ return 0, nil, nil
+
+ case linux.FIOSETOWN, linux.SIOCSPGRP:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ fSetOwn(t, file, set)
+ return 0, nil, nil
+
+ case linux.FIOGETOWN, linux.SIOCGPGRP:
+ who := fGetOwn(t, file)
+ _, err := t.CopyOut(args[2].Pointer(), &who)
+ return 0, nil, err
+
+ default:
+ ret, err := file.FileOperations.Ioctl(t, file, t.MemoryManager(), args)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return ret, nil, nil
+ }
+}
+
+// LINT.ThenChange(vfs2/ioctl.go)
+
+// LINT.IfChange
+
+// Getcwd implements the linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ size := args[1].SizeT()
+ cwd := t.FSContext().WorkingDirectory()
+ defer cwd.DecRef()
+ root := t.FSContext().RootDirectory()
+ defer root.DecRef()
+
+ // Get our fullname from the root and preprend unreachable if the root was
+ // unreachable from our current dirent this is the same behavior as on linux.
+ s, reachable := cwd.FullName(root)
+ if !reachable {
+ s = "(unreachable)" + s
+ }
+
+ // Note this is >= because we need a terminator.
+ if uint(len(s)) >= size {
+ return 0, nil, syserror.ERANGE
+ }
+
+ // Copy out the path name for the node.
+ bytes, err := t.CopyOutBytes(addr, []byte(s))
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Top it off with a terminator.
+ _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
+ return uintptr(bytes + 1), nil, err
+}
+
+// Chroot implements the linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+ return 0, nil, syserror.EPERM
+ }
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // Is it a directory?
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Does it have execute permissions?
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+ return err
+ }
+
+ t.FSContext().SetRootDirectory(d)
+ return nil
+ })
+}
+
+// Chdir implements the linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // Is it a directory?
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Does it have execute permissions?
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+ return err
+ }
+
+ t.FSContext().SetWorkingDirectory(d)
+ return nil
+ })
+}
+
+// Fchdir implements the linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Is it a directory?
+ if !fs.IsDir(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.ENOTDIR
+ }
+
+ // Does it have execute permissions?
+ if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+ return 0, nil, err
+ }
+
+ t.FSContext().SetWorkingDirectory(file.Dirent)
+ return 0, nil, nil
+}
+
+// LINT.ThenChange(vfs2/fscontext.go)
+
+// LINT.IfChange
+
+// Close implements linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ // Note that Remove provides a reference on the file that we may use to
+ // flush. It is still active until we drop the final reference below
+ // (and other reference-holding operations complete).
+ file, _ := t.FDTable().Remove(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ err := file.Flush(t)
+ return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file)
+}
+
+// Dup implements linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
+ if err != nil {
+ return 0, nil, syserror.EMFILE
+ }
+ return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
+
+ // If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
+ // then dup2() does nothing, and returns newfd.
+ if oldfd == newfd {
+ oldFile := t.GetFile(oldfd)
+ if oldFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer oldFile.DecRef()
+
+ return uintptr(newfd), nil, nil
+ }
+
+ // Zero out flags arg to be used by Dup3.
+ args[2].Value = 0
+ return Dup3(t, args)
+}
+
+// Dup3 implements linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
+ flags := args[2].Uint()
+
+ if oldfd == newfd {
+ return 0, nil, syserror.EINVAL
+ }
+
+ oldFile := t.GetFile(oldfd)
+ if oldFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer oldFile.DecRef()
+
+ err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(newfd), nil, nil
+}
+
+func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
+ ma := file.Async(nil)
+ if ma == nil {
+ return linux.FOwnerEx{}
+ }
+ a := ma.(*fasync.FileAsync)
+ ot, otg, opg := a.Owner()
+ switch {
+ case ot != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_TID,
+ PID: int32(t.PIDNamespace().IDOfTask(ot)),
+ }
+ case otg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PID,
+ PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+ }
+ case opg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PGRP,
+ PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+ }
+ default:
+ return linux.FOwnerEx{}
+ }
+}
+
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+ owner := fGetOwnEx(t, file)
+ if owner.Type == linux.F_OWNER_PGRP {
+ return -owner.PID
+ }
+ return owner.PID
+}
+
+// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
+//
+// If who is positive, it represents a PID. If negative, it represents a PGID.
+// If the PID or PGID is invalid, the owner is silently unset.
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) error {
+ a := file.Async(fasync.New).(*fasync.FileAsync)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return syserror.EINVAL
+ }
+ pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
+ a.SetOwnerProcessGroup(t, pg)
+ } else {
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
+ a.SetOwnerThreadGroup(t, tg)
+ }
+ return nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ cmd := args[1].Int()
+
+ file, flags := t.FDTable().Get(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ switch cmd {
+ case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+ from := args[2].Int()
+ fd, err := t.NewFDFrom(from, file, kernel.FDFlags{
+ CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+ case linux.F_GETFD:
+ return uintptr(flags.ToLinuxFDFlags()), nil, nil
+ case linux.F_SETFD:
+ flags := args[2].Uint()
+ err := t.FDTable().SetFlags(fd, kernel.FDFlags{
+ CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+ })
+ return 0, nil, err
+ case linux.F_GETFL:
+ return uintptr(file.Flags().ToLinux()), nil, nil
+ case linux.F_SETFL:
+ flags := uint(args[2].Uint())
+ file.SetFlags(linuxToFlags(flags).Settable())
+ return 0, nil, nil
+ case linux.F_SETLK, linux.F_SETLKW:
+ // In Linux the file system can choose to provide lock operations for an inode.
+ // Normally pipe and socket types lack lock operations. We diverge and use a heavy
+ // hammer by only allowing locks on files and directories.
+ if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Copy in the lock request.
+ flockAddr := args[2].Pointer()
+ var flock linux.Flock
+ if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+ return 0, nil, err
+ }
+
+ // Compute the lock whence.
+ var sw fs.SeekWhence
+ switch flock.Whence {
+ case 0:
+ sw = fs.SeekSet
+ case 1:
+ sw = fs.SeekCurrent
+ case 2:
+ sw = fs.SeekEnd
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Compute the lock offset.
+ var off int64
+ switch sw {
+ case fs.SeekSet:
+ off = 0
+ case fs.SeekCurrent:
+ // Note that Linux does not hold any mutexes while retrieving the file offset,
+ // see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+ off = file.Offset()
+ case fs.SeekEnd:
+ uattr, err := file.Dirent.Inode.UnstableAttr(t)
+ if err != nil {
+ return 0, nil, err
+ }
+ off = uattr.Size
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Compute the lock range.
+ rng, err := lock.ComputeRange(flock.Start, flock.Len, off)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // These locks don't block; execute the non-blocking operation using the inode's lock
+ // context directly.
+ switch flock.Type {
+ case linux.F_RDLCK:
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+ if cmd == linux.F_SETLK {
+ // Non-blocking lock, provide a nil lock.Blocker.
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
+ return 0, nil, syserror.EAGAIN
+ }
+ } else {
+ // Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
+ return 0, nil, syserror.EINTR
+ }
+ }
+ return 0, nil, nil
+ case linux.F_WRLCK:
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+ if cmd == linux.F_SETLK {
+ // Non-blocking lock, provide a nil lock.Blocker.
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
+ return 0, nil, syserror.EAGAIN
+ }
+ } else {
+ // Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
+ return 0, nil, syserror.EINTR
+ }
+ }
+ return 0, nil, nil
+ case linux.F_UNLCK:
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
+ return 0, nil, nil
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+ case linux.F_GETOWN:
+ return uintptr(fGetOwn(t, file)), nil, nil
+ case linux.F_SETOWN:
+ return 0, nil, fSetOwn(t, file, args[2].Int())
+ case linux.F_GETOWN_EX:
+ addr := args[2].Pointer()
+ owner := fGetOwnEx(t, file)
+ _, err := t.CopyOut(addr, &owner)
+ return 0, nil, err
+ case linux.F_SETOWN_EX:
+ addr := args[2].Pointer()
+ var owner linux.FOwnerEx
+ n, err := t.CopyIn(addr, &owner)
+ if err != nil {
+ return 0, nil, err
+ }
+ a := file.Async(fasync.New).(*fasync.FileAsync)
+ switch owner.Type {
+ case linux.F_OWNER_TID:
+ task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
+ if task == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ a.SetOwnerTask(t, task)
+ return uintptr(n), nil, nil
+ case linux.F_OWNER_PID:
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
+ if tg == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ a.SetOwnerThreadGroup(t, tg)
+ return uintptr(n), nil, nil
+ case linux.F_OWNER_PGRP:
+ pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
+ if pg == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ a.SetOwnerProcessGroup(t, pg)
+ return uintptr(n), nil, nil
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+ case linux.F_GET_SEALS:
+ val, err := tmpfs.GetSeals(file.Dirent.Inode)
+ return uintptr(val), nil, err
+ case linux.F_ADD_SEALS:
+ if !file.Flags().Write {
+ return 0, nil, syserror.EPERM
+ }
+ err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint())
+ return 0, nil, err
+ case linux.F_GETPIPE_SZ:
+ sz, ok := file.FileOperations.(fs.FifoSizer)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+ size, err := sz.FifoSize(t, file)
+ return uintptr(size), nil, err
+ case linux.F_SETPIPE_SZ:
+ sz, ok := file.FileOperations.(fs.FifoSizer)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+ n, err := sz.SetFifoSize(int64(args[2].Int()))
+ return uintptr(n), nil, err
+ default:
+ // Everything else is not yet supported.
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+// Fadvise64 implements linux syscall fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ length := args[2].Int64()
+ advice := args[3].Int()
+
+ // Note: offset is allowed to be negative.
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // If the FD refers to a pipe or FIFO, return error.
+ if fs.IsPipe(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ switch advice {
+ case linux.POSIX_FADV_NORMAL:
+ case linux.POSIX_FADV_RANDOM:
+ case linux.POSIX_FADV_SEQUENTIAL:
+ case linux.POSIX_FADV_WILLNEED:
+ case linux.POSIX_FADV_DONTNEED:
+ case linux.POSIX_FADV_NOREUSE:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Sure, whatever.
+ return 0, nil, nil
+}
+
+func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Does this directory exist already?
+ remainingTraversals := uint(linux.MaxSymlinkTraversals)
+ f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals)
+ switch err {
+ case nil:
+ // The directory existed.
+ defer f.DecRef()
+ return syserror.EEXIST
+ case syserror.EACCES:
+ // Permission denied while walking to the directory.
+ return err
+ default:
+ // Do we have write permissions on the parent?
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+
+ // Create the directory.
+ perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+ return d.CreateDirectory(t, root, name, perms)
+ }
+ })
+}
+
+// Mkdir implements linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := linux.FileMode(args[1].ModeT())
+
+ return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ addr := args[1].Pointer()
+ mode := linux.FileMode(args[2].ModeT())
+
+ return 0, nil, mkdirAt(t, dirFD, addr, mode)
+}
+
+func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ // Special case: removing the root always returns EBUSY.
+ if path == "/" {
+ return syserror.EBUSY
+ }
+
+ return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Linux returns different ernos when the path ends in single
+ // dot vs. double dots.
+ switch name {
+ case ".":
+ return syserror.EINVAL
+ case "..":
+ return syserror.ENOTEMPTY
+ }
+
+ if err := d.MayDelete(t, root, name); err != nil {
+ return err
+ }
+
+ return d.RemoveDirectory(t, root, name)
+ })
+}
+
+// Rmdir implements linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr)
+}
+
+func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error {
+ newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+ if dirPath {
+ return syserror.ENOENT
+ }
+
+ // The oldPath is copied in verbatim. This is because the symlink
+ // will include all details, including trailing slashes.
+ oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX)
+ if err != nil {
+ return err
+ }
+ if oldPath == "" {
+ return syserror.ENOENT
+ }
+
+ return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Make sure we have write permissions on the parent directory.
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+ return d.CreateLink(t, root, oldPath, name)
+ })
+}
+
+// Symlink implements linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldAddr := args[0].Pointer()
+ newAddr := args[1].Pointer()
+
+ return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr)
+}
+
+// Symlinkat implements linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldAddr := args[0].Pointer()
+ dirFD := args[1].Int()
+ newAddr := args[2].Pointer()
+
+ return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
+}
+
+// mayLinkAt determines whether t can create a hard link to target.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
+ // Linux will impose the following restrictions on hard links only if
+ // sysctl_protected_hardlinks is enabled. The kernel disables this
+ // setting by default for backward compatibility (see commit
+ // 561ec64ae67e), but also recommends that distributions enable it (and
+ // Debian does:
+ // https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098).
+ //
+ // gVisor currently behaves as though sysctl_protected_hardlinks is
+ // always enabled, and thus imposes the following restrictions on hard
+ // links.
+
+ if target.CheckOwnership(t) {
+ // fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER)
+ // can hardlink all they like."
+ return nil
+ }
+
+ // If we are not the owner, then the file must be regular and have
+ // Read+Write permissions.
+ if !fs.IsRegular(target.StableAttr) {
+ return syserror.EPERM
+ }
+ if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil {
+ return syserror.EPERM
+ }
+
+ return nil
+}
+
+// linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
+// specified by newDirFD and newAddr. If resolve is true, then the symlinks
+// will be followed when evaluating the target.
+func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error {
+ oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
+ if err != nil {
+ return err
+ }
+ newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+ if dirPath {
+ return syserror.ENOENT
+ }
+
+ if allowEmpty && oldPath == "" {
+ target := t.GetFile(oldDirFD)
+ if target == nil {
+ return syserror.EBADF
+ }
+ defer target.DecRef()
+ if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
+ return err
+ }
+
+ // Resolve the target directory.
+ return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
+ if !fs.IsDir(newParent.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Make sure we have write permissions on the parent directory.
+ if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+ return newParent.CreateHardLink(t, root, target.Dirent, newName)
+ })
+ }
+
+ // Resolve oldDirFD and oldAddr to a dirent. The "resolve" argument
+ // only applies to this name.
+ return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent, _ uint) error {
+ if err := mayLinkAt(t, target.Inode); err != nil {
+ return err
+ }
+
+ // Next resolve newDirFD and newAddr to the parent dirent and name.
+ return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
+ if !fs.IsDir(newParent.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Make sure we have write permissions on the parent directory.
+ if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+ return newParent.CreateHardLink(t, root, target, newName)
+ })
+ })
+}
+
+// Link implements linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldAddr := args[0].Pointer()
+ newAddr := args[1].Pointer()
+
+ // man link(2):
+ // POSIX.1-2001 says that link() should dereference oldpath if it is a
+ // symbolic link. However, since kernel 2.0, Linux does not do so: if
+ // oldpath is a symbolic link, then newpath is created as a (hard) link
+ // to the same symbolic link file (i.e., newpath becomes a symbolic
+ // link to the same file that oldpath refers to).
+ resolve := false
+ return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
+}
+
+// Linkat implements linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldDirFD := args[0].Int()
+ oldAddr := args[1].Pointer()
+ newDirFD := args[2].Int()
+ newAddr := args[3].Pointer()
+
+ // man linkat(2):
+ // By default, linkat(), does not dereference oldpath if it is a
+ // symbolic link (like link(2)). Since Linux 2.6.18, the flag
+ // AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
+ // dereferenced if it is a symbolic link.
+ flags := args[4].Int()
+
+ // Sanity check flags.
+ if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
+ allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH
+
+ if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
+ return 0, nil, syserror.ENOENT
+ }
+
+ return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
+func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, err
+ }
+ if dirPath {
+ return 0, syserror.ENOENT
+ }
+
+ err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // Check for Read permission.
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
+ return err
+ }
+
+ s, err := d.Inode.Readlink(t)
+ if err == syserror.ENOLINK {
+ return syserror.EINVAL
+ }
+ if err != nil {
+ return err
+ }
+
+ buffer := []byte(s)
+ if uint(len(buffer)) > size {
+ buffer = buffer[:size]
+ }
+
+ n, err := t.CopyOutBytes(bufAddr, buffer)
+
+ // Update frame return value.
+ copied = uintptr(n)
+
+ return err
+ })
+ return copied, err // Return frame value.
+}
+
+// Readlink implements linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ bufAddr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
+ return n, nil, err
+}
+
+// Readlinkat implements linux syscall readlinkat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ addr := args[1].Pointer()
+ bufAddr := args[2].Pointer()
+ size := args[3].SizeT()
+
+ n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
+ return n, nil, err
+}
+
+// LINT.ThenChange(vfs2/stat.go)
+
+// LINT.IfChange
+
+func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
+ if !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ if err := d.MayDelete(t, root, name); err != nil {
+ return err
+ }
+
+ return d.Remove(t, root, name, dirPath)
+ })
+}
+
+// Unlink implements linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr)
+}
+
+// Unlinkat implements linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ addr := args[1].Pointer()
+ flags := args[2].Uint()
+ if flags&linux.AT_REMOVEDIR != 0 {
+ return 0, nil, rmdirAt(t, dirFD, addr)
+ }
+ return 0, nil, unlinkAt(t, dirFD, addr)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
+// Truncate implements linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Int64()
+
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+ if dirPath {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
+ t.SendSignal(&arch.SignalInfo{
+ Signo: int32(linux.SIGXFSZ),
+ Code: arch.SignalInfoUser,
+ })
+ return 0, nil, syserror.EFBIG
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if fs.IsDir(d.Inode.StableAttr) {
+ return syserror.EISDIR
+ }
+ // In contrast to open(O_TRUNC), truncate(2) is only valid for file
+ // types.
+ if !fs.IsFile(d.Inode.StableAttr) {
+ return syserror.EINVAL
+ }
+
+ // Reject truncation if the access permissions do not allow truncation.
+ // This is different from the behavior of sys_ftruncate, see below.
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+ return err
+ }
+
+ if err := d.Inode.Truncate(t, d, length); err != nil {
+ return err
+ }
+
+ // File length modified, generate notification.
+ d.InotifyEvent(linux.IN_MODIFY, 0)
+
+ return nil
+ })
+}
+
+// Ftruncate implements linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ length := args[1].Int64()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Reject truncation if the file flags do not permit this operation.
+ // This is different from truncate(2) above.
+ if !file.Flags().Write {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // In contrast to open(O_TRUNC), truncate(2) is only valid for file
+ // types. Note that this is different from truncate(2) above, where a
+ // directory returns EISDIR.
+ if !fs.IsFile(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
+ t.SendSignal(&arch.SignalInfo{
+ Signo: int32(linux.SIGXFSZ),
+ Code: arch.SignalInfoUser,
+ })
+ return 0, nil, syserror.EFBIG
+ }
+
+ if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil {
+ return 0, nil, err
+ }
+
+ // File length modified, generate notification.
+ file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+
+ return 0, nil, nil
+}
+
+// LINT.ThenChange(vfs2/setstat.go)
+
+// Umask implements linux syscall umask(2).
+func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ mask := args[0].ModeT()
+ mask = t.FSContext().SwapUmask(mask & 0777)
+ return uintptr(mask), nil, nil
+}
+
+// LINT.IfChange
+
+// Change ownership of a file.
+//
+// uid and gid may be -1, in which case they will not be changed.
+func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
+ owner := fs.FileOwner{
+ UID: auth.NoID,
+ GID: auth.NoID,
+ }
+
+ uattr, err := d.Inode.UnstableAttr(t)
+ if err != nil {
+ return err
+ }
+ c := t.Credentials()
+ hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
+ isOwner := uattr.Owner.UID == c.EffectiveKUID
+ if uid.Ok() {
+ kuid := c.UserNamespace.MapToKUID(uid)
+ // Valid UID must be supplied if UID is to be changed.
+ if !kuid.Ok() {
+ return syserror.EINVAL
+ }
+
+ // "Only a privileged process (CAP_CHOWN) may change the owner
+ // of a file." -chown(2)
+ //
+ // Linux also allows chown if you own the file and are
+ // explicitly not changing its UID.
+ isNoop := uattr.Owner.UID == kuid
+ if !(hasCap || (isOwner && isNoop)) {
+ return syserror.EPERM
+ }
+
+ owner.UID = kuid
+ }
+ if gid.Ok() {
+ kgid := c.UserNamespace.MapToKGID(gid)
+ // Valid GID must be supplied if GID is to be changed.
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+
+ // "The owner of a file may change the group of the file to any
+ // group of which that owner is a member. A privileged process
+ // (CAP_CHOWN) may change the group arbitrarily." -chown(2)
+ isNoop := uattr.Owner.GID == kgid
+ isMemberGroup := c.InGroup(kgid)
+ if !(hasCap || (isOwner && (isNoop || isMemberGroup))) {
+ return syserror.EPERM
+ }
+
+ owner.GID = kgid
+ }
+
+ // FIXME(b/62949101): This is racy; the inode's owner may have changed in
+ // the meantime. (Linux holds i_mutex while calling
+ // fs/attr.c:notify_change() => inode_operations::setattr =>
+ // inode_change_ok().)
+ if err := d.Inode.SetOwner(t, d, owner); err != nil {
+ return err
+ }
+
+ // When the owner or group are changed by an unprivileged user,
+ // chown(2) also clears the set-user-ID and set-group-ID bits, but
+ // we do not support them.
+ return nil
+}
+
+func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
+ path, _, err := copyInPath(t, addr, allowEmpty)
+ if err != nil {
+ return err
+ }
+
+ if path == "" {
+ // Annoying. What's wrong with fchown?
+ file := t.GetFile(fd)
+ if file == nil {
+ return syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return chown(t, file.Dirent, uid, gid)
+ }
+
+ return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return chown(t, d, uid, gid)
+ })
+}
+
+// Chown implements linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ uid := auth.UID(args[1].Uint())
+ gid := auth.GID(args[2].Uint())
+
+ return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Lchown implements linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ uid := auth.UID(args[1].Uint())
+ gid := auth.GID(args[2].Uint())
+
+ return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Fchown implements linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ uid := auth.UID(args[1].Uint())
+ gid := auth.GID(args[2].Uint())
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, chown(t, file.Dirent, uid, gid)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ addr := args[1].Pointer()
+ uid := auth.UID(args[2].Uint())
+ gid := auth.GID(args[3].Uint())
+ flags := args[4].Int()
+
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid)
+}
+
+func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
+ // Must own file to change mode.
+ if !d.Inode.CheckOwnership(t) {
+ return syserror.EPERM
+ }
+
+ p := fs.FilePermsFromMode(mode)
+ if !d.Inode.SetPermissions(t, d, p) {
+ return syserror.EPERM
+ }
+
+ // File attribute changed, generate notification.
+ d.InotifyEvent(linux.IN_ATTRIB, 0)
+
+ return nil
+}
+
+func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error {
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return chmod(t, d, mode)
+ })
+}
+
+// Chmod implements linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := linux.FileMode(args[1].ModeT())
+
+ return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Fchmod implements linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ mode := linux.FileMode(args[1].ModeT())
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, chmod(t, file.Dirent, mode)
+}
+
+// Fchmodat implements linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ mode := linux.FileMode(args[2].ModeT())
+
+ return 0, nil, chmodAt(t, fd, addr, mode)
+}
+
+// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime
+// to the system time.
+func defaultSetToSystemTimeSpec() fs.TimeSpec {
+ return fs.TimeSpec{
+ ATimeSetSystemTime: true,
+ MTimeSetSystemTime: true,
+ }
+}
+
+func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
+ setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // Does the task own the file?
+ if !d.Inode.CheckOwnership(t) {
+ // Trying to set a specific time? Must be owner.
+ if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) {
+ return syserror.EPERM
+ }
+
+ // Trying to set to current system time? Must have write access.
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+ return err
+ }
+ }
+
+ if err := d.Inode.SetTimestamps(t, d, ts); err != nil {
+ return err
+ }
+
+ // File attribute changed, generate notification.
+ d.InotifyEvent(linux.IN_ATTRIB, 0)
+ return nil
+ }
+
+ // From utimes.c:
+ // "If filename is NULL and dfd refers to an open file, then operate on
+ // the file. Otherwise look up filename, possibly using dfd as a
+ // starting point."
+ if addr == 0 && dirFD != linux.AT_FDCWD {
+ if !resolve {
+ // Linux returns EINVAL in this case. See utimes.c.
+ return syserror.EINVAL
+ }
+ f := t.GetFile(dirFD)
+ if f == nil {
+ return syserror.EBADF
+ }
+ defer f.DecRef()
+
+ root := t.FSContext().RootDirectory()
+ defer root.DecRef()
+
+ return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals)
+ }
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ return fileOpOn(t, dirFD, path, resolve, setTimestamp)
+}
+
+// Utime implements linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ filenameAddr := args[0].Pointer()
+ timesAddr := args[1].Pointer()
+
+ // No timesAddr argument will be interpreted as current system time.
+ ts := defaultSetToSystemTimeSpec()
+ if timesAddr != 0 {
+ var times linux.Utime
+ if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ return 0, nil, err
+ }
+ ts = fs.TimeSpec{
+ ATime: ktime.FromSeconds(times.Actime),
+ MTime: ktime.FromSeconds(times.Modtime),
+ }
+ }
+ return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// Utimes implements linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ filenameAddr := args[0].Pointer()
+ timesAddr := args[1].Pointer()
+
+ // No timesAddr argument will be interpreted as current system time.
+ ts := defaultSetToSystemTimeSpec()
+ if timesAddr != 0 {
+ var times [2]linux.Timeval
+ if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ return 0, nil, err
+ }
+ ts = fs.TimeSpec{
+ ATime: ktime.FromTimeval(times[0]),
+ MTime: ktime.FromTimeval(times[1]),
+ }
+ }
+ return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// timespecIsValid checks that the timespec is valid for use in utimensat.
+func timespecIsValid(ts linux.Timespec) bool {
+ // Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9.
+ return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9
+}
+
+// Utimensat implements linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ pathnameAddr := args[1].Pointer()
+ timesAddr := args[2].Pointer()
+ flags := args[3].Int()
+
+ // No timesAddr argument will be interpreted as current system time.
+ ts := defaultSetToSystemTimeSpec()
+ if timesAddr != 0 {
+ var times [2]linux.Timespec
+ if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ return 0, nil, err
+ }
+ if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // If both are UTIME_OMIT, this is a noop.
+ if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT {
+ return 0, nil, nil
+ }
+
+ ts = fs.TimeSpec{
+ ATime: ktime.FromTimespec(times[0]),
+ ATimeOmit: times[0].Nsec == linux.UTIME_OMIT,
+ ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
+ MTime: ktime.FromTimespec(times[1]),
+ MTimeOmit: times[1].Nsec == linux.UTIME_OMIT,
+ MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
+ }
+ }
+ return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
+}
+
+// Futimesat implements linux syscall futimesat(2).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ pathnameAddr := args[1].Pointer()
+ timesAddr := args[2].Pointer()
+
+ // No timesAddr argument will be interpreted as current system time.
+ ts := defaultSetToSystemTimeSpec()
+ if timesAddr != 0 {
+ var times [2]linux.Timeval
+ if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ return 0, nil, err
+ }
+ if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
+ times[1].Usec >= 1e6 || times[1].Usec < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ts = fs.TimeSpec{
+ ATime: ktime.FromTimeval(times[0]),
+ MTime: ktime.FromTimeval(times[1]),
+ }
+ }
+ return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
+}
+
+// LINT.ThenChange(vfs2/setstat.go)
+
+// LINT.IfChange
+
+func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
+ newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+ oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */)
+ if err != nil {
+ return err
+ }
+
+ return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error {
+ if !fs.IsDir(oldParent.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Rename rejects paths that end in ".", "..", or empty (i.e.
+ // the root) with EBUSY.
+ switch oldName {
+ case "", ".", "..":
+ return syserror.EBUSY
+ }
+
+ return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
+ if !fs.IsDir(newParent.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Rename rejects paths that end in ".", "..", or empty
+ // (i.e. the root) with EBUSY.
+ switch newName {
+ case "", ".", "..":
+ return syserror.EBUSY
+ }
+
+ return fs.Rename(t, root, oldParent, oldName, newParent, newName)
+ })
+ })
+}
+
+// Rename implements linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldPathAddr := args[0].Pointer()
+ newPathAddr := args[1].Pointer()
+ return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr)
+}
+
+// Renameat implements linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldDirFD := args[0].Int()
+ oldPathAddr := args[1].Pointer()
+ newDirFD := args[2].Int()
+ newPathAddr := args[3].Pointer()
+ return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// Fallocate implements linux system call fallocate(2).
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ mode := args[1].Int64()
+ offset := args[2].Int64()
+ length := args[3].Int64()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ if offset < 0 || length <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if mode != 0 {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOTSUP
+ }
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+ if fs.IsPipe(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.ESPIPE
+ }
+ if fs.IsDir(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.EISDIR
+ }
+ if !fs.IsRegular(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.ENODEV
+ }
+ size := offset + length
+ if size < 0 {
+ return 0, nil, syserror.EFBIG
+ }
+ if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
+ t.SendSignal(&arch.SignalInfo{
+ Signo: int32(linux.SIGXFSZ),
+ Code: arch.SignalInfoUser,
+ })
+ return 0, nil, syserror.EFBIG
+ }
+
+ if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil {
+ return 0, nil, err
+ }
+
+ // File length modified, generate notification.
+ file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+
+ return 0, nil, nil
+}
+
+// Flock implements linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ operation := args[1].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ // flock(2): EBADF fd is not an open file descriptor.
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ nonblocking := operation&linux.LOCK_NB != 0
+ operation &^= linux.LOCK_NB
+
+ // A BSD style lock spans the entire file.
+ rng := lock.LockRange{
+ Start: 0,
+ End: lock.LockEOF,
+ }
+
+ switch operation {
+ case linux.LOCK_EX:
+ if nonblocking {
+ // Since we're nonblocking we pass a nil lock.Blocker implementation.
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
+ return 0, nil, syserror.EWOULDBLOCK
+ }
+ } else {
+ // Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
+ return 0, nil, syserror.EINTR
+ }
+ }
+ case linux.LOCK_SH:
+ if nonblocking {
+ // Since we're nonblocking we pass a nil lock.Blocker implementation.
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
+ return 0, nil, syserror.EWOULDBLOCK
+ }
+ } else {
+ // Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
+ return 0, nil, syserror.EINTR
+ }
+ }
+ case linux.LOCK_UN:
+ file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
+ default:
+ // flock(2): EINVAL operation is invalid.
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
+
+const (
+ memfdPrefix = "/memfd:"
+ memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+ memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Uint()
+
+ if flags&^memfdAllFlags != 0 {
+ // Unknown bits in flags.
+ return 0, nil, syserror.EINVAL
+ }
+
+ allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+ cloExec := flags&linux.MFD_CLOEXEC != 0
+
+ name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix))
+ if err != nil {
+ return 0, nil, err
+ }
+ if len(name) > memfdMaxNameLen {
+ return 0, nil, syserror.EINVAL
+ }
+ name = memfdPrefix + name
+
+ inode := tmpfs.NewMemfdInode(t, allowSeals)
+ dirent := fs.NewDirent(t, inode, name)
+ // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+ // FMODE_READ | FMODE_WRITE.
+ file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true})
+ if err != nil {
+ return 0, nil, err
+ }
+
+ defer dirent.DecRef()
+ defer file.DecRef()
+
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: cloExec,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(newFD), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
new file mode 100644
index 000000000..b68261f72
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -0,0 +1,288 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// futexWaitRestartBlock encapsulates the state required to restart futex(2)
+// via restart_syscall(2).
+//
+// +stateify savable
+type futexWaitRestartBlock struct {
+ duration time.Duration
+
+ // addr stored as uint64 since uintptr is not save-able.
+ addr uint64
+ private bool
+ val uint32
+ mask uint32
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+ return futexWaitDuration(t, f.duration, false, usermem.Addr(f.addr), f.private, f.val, f.mask)
+}
+
+// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
+// complete.
+//
+// The wait blocks forever if forever is true, otherwise it blocks until ts.
+//
+// If blocking is interrupted, the syscall is restarted with the original
+// arguments.
+func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) {
+ w := t.FutexWaiter()
+ err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
+ if err != nil {
+ return 0, err
+ }
+
+ if forever {
+ err = t.Block(w.C)
+ } else if clockRealtime {
+ notifier, tchan := ktime.NewChannelNotifier()
+ timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
+ timer.Swap(ktime.Setting{
+ Enabled: true,
+ Next: ktime.FromTimespec(ts),
+ })
+ err = t.BlockWithTimer(w.C, tchan)
+ timer.Destroy()
+ } else {
+ err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
+ }
+
+ t.Futex().WaitComplete(w)
+ return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
+// complete.
+//
+// The wait blocks forever if forever is true, otherwise is blocks for
+// duration.
+//
+// If blocking is interrupted, forever determines how to restart the
+// syscall. If forever is true, the syscall is restarted with the original
+// arguments. If forever is false, duration is a relative timeout and the
+// syscall is restarted with the remaining timeout.
+func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) {
+ w := t.FutexWaiter()
+ err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
+ if err != nil {
+ return 0, err
+ }
+
+ remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
+ t.Futex().WaitComplete(w)
+ if err == nil {
+ return 0, nil
+ }
+
+ // The wait was unsuccessful for some reason other than interruption. Simply
+ // forward the error.
+ if err != syserror.ErrInterrupted {
+ return 0, err
+ }
+
+ // The wait was interrupted and we need to restart. Decide how.
+
+ // The wait duration was absolute, restart with the original arguments.
+ if forever {
+ return 0, kernel.ERESTARTSYS
+ }
+
+ // The wait duration was relative, restart with the remaining duration.
+ t.SetSyscallRestartBlock(&futexWaitRestartBlock{
+ duration: remaining,
+ addr: uint64(addr),
+ private: private,
+ val: val,
+ mask: mask,
+ })
+ return 0, kernel.ERESTART_RESTARTBLOCK
+}
+
+func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error {
+ w := t.FutexWaiter()
+ locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false)
+ if err != nil {
+ return err
+ }
+ if locked {
+ // Futex acquired, we're done!
+ return nil
+ }
+
+ if forever {
+ err = t.Block(w.C)
+ } else {
+ notifier, tchan := ktime.NewChannelNotifier()
+ timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
+ timer.Swap(ktime.Setting{
+ Enabled: true,
+ Next: ktime.FromTimespec(ts),
+ })
+ err = t.BlockWithTimer(w.C, tchan)
+ timer.Destroy()
+ }
+
+ t.Futex().WaitComplete(w)
+ return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error {
+ w := t.FutexWaiter()
+ locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true)
+ if err != nil {
+ return err
+ }
+ if !locked {
+ return syserror.EWOULDBLOCK
+ }
+ return nil
+}
+
+// Futex implements linux syscall futex(2).
+// It provides a method for a program to wait for a value at a given address to
+// change, and a method to wake up anyone waiting on a particular address.
+func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ futexOp := args[1].Int()
+ val := int(args[2].Int())
+ nreq := int(args[3].Int())
+ timeout := args[3].Pointer()
+ naddr := args[4].Pointer()
+ val3 := args[5].Int()
+
+ cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
+ private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0
+ clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
+ mask := uint32(val3)
+
+ switch cmd {
+ case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
+ // WAIT{_BITSET} wait forever if the timeout isn't passed.
+ forever := (timeout == 0)
+
+ var timespec linux.Timespec
+ if !forever {
+ var err error
+ timespec, err = copyTimespecIn(t, timeout)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ switch cmd {
+ case linux.FUTEX_WAIT:
+ // WAIT uses a relative timeout.
+ mask = ^uint32(0)
+ var timeoutDur time.Duration
+ if !forever {
+ timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
+ }
+ n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask)
+ return n, nil, err
+
+ case linux.FUTEX_WAIT_BITSET:
+ // WAIT_BITSET uses an absolute timeout which is either
+ // CLOCK_MONOTONIC or CLOCK_REALTIME.
+ if mask == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask)
+ return n, nil, err
+ default:
+ panic("unreachable")
+ }
+
+ case linux.FUTEX_WAKE:
+ mask = ^uint32(0)
+ fallthrough
+
+ case linux.FUTEX_WAKE_BITSET:
+ if mask == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if val <= 0 {
+ // The Linux kernel wakes one waiter even if val is
+ // non-positive.
+ val = 1
+ }
+ n, err := t.Futex().Wake(t, addr, private, mask, val)
+ return uintptr(n), nil, err
+
+ case linux.FUTEX_REQUEUE:
+ n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq)
+ return uintptr(n), nil, err
+
+ case linux.FUTEX_CMP_REQUEUE:
+ // 'val3' contains the value to be checked at 'addr' and
+ // 'val' is the number of waiters that should be woken up.
+ nval := uint32(val3)
+ n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq)
+ return uintptr(n), nil, err
+
+ case linux.FUTEX_WAKE_OP:
+ op := uint32(val3)
+ if val <= 0 {
+ // The Linux kernel wakes one waiter even if val is
+ // non-positive.
+ val = 1
+ }
+ n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
+ return uintptr(n), nil, err
+
+ case linux.FUTEX_LOCK_PI:
+ forever := (timeout == 0)
+
+ var timespec linux.Timespec
+ if !forever {
+ var err error
+ timespec, err = copyTimespecIn(t, timeout)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+ err := futexLockPI(t, timespec, forever, addr, private)
+ return 0, nil, err
+
+ case linux.FUTEX_TRYLOCK_PI:
+ err := tryLockPI(t, addr, private)
+ return 0, nil, err
+
+ case linux.FUTEX_UNLOCK_PI:
+ err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
+ return 0, nil, err
+
+ case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+
+ default:
+ // We don't even know about this command.
+ return 0, nil, syserror.ENOSYS
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
new file mode 100644
index 000000000..b126fecc0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -0,0 +1,250 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "bytes"
+ "io"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// Getdents implements linux syscall getdents(2) for 64bit systems.
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := int(args[2].Uint())
+
+ minSize := int(smallestDirent(t.Arch()))
+ if size < minSize {
+ // size is smaller than smallest possible dirent.
+ return 0, nil, syserror.EINVAL
+ }
+
+ n, err := getdents(t, fd, addr, size, (*dirent).Serialize)
+ return n, nil, err
+}
+
+// Getdents64 implements linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := int(args[2].Uint())
+
+ minSize := int(smallestDirent64(t.Arch()))
+ if size < minSize {
+ // size is smaller than smallest possible dirent.
+ return 0, nil, syserror.EINVAL
+ }
+
+ n, err := getdents(t, fd, addr, size, (*dirent).Serialize64)
+ return n, nil, err
+}
+
+// getdents implements the core of getdents(2)/getdents64(2).
+// f is the syscall implementation dirent serialization function.
+func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
+ dir := t.GetFile(fd)
+ if dir == nil {
+ return 0, syserror.EBADF
+ }
+ defer dir.DecRef()
+
+ w := &usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: addr,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }
+
+ ds := newDirentSerializer(f, w, t.Arch(), size)
+ rerr := dir.Readdir(t, ds)
+
+ switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err {
+ case nil:
+ dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+ return uintptr(ds.Written()), nil
+ case io.EOF:
+ return 0, nil
+ default:
+ return 0, err
+ }
+}
+
+// oldDirentHdr is a fixed sized header matching the fixed size
+// fields found in the old linux dirent struct.
+type oldDirentHdr struct {
+ Ino uint64
+ Off uint64
+ Reclen uint16
+}
+
+// direntHdr is a fixed sized header matching the fixed size
+// fields found in the new linux dirent struct.
+type direntHdr struct {
+ OldHdr oldDirentHdr
+ Typ uint8
+}
+
+// dirent contains the data pointed to by a new linux dirent struct.
+type dirent struct {
+ Hdr direntHdr
+ Name []byte
+}
+
+// newDirent returns a dirent from an fs.InodeOperationsInfo.
+func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent {
+ d := &dirent{
+ Hdr: direntHdr{
+ OldHdr: oldDirentHdr{
+ Ino: attr.InodeID,
+ Off: offset,
+ },
+ Typ: fs.ToDirentType(attr.Type),
+ },
+ Name: []byte(name),
+ }
+ d.Hdr.OldHdr.Reclen = d.padRec(int(width))
+ return d
+}
+
+// smallestDirent returns the size of the smallest possible dirent using
+// the old linux dirent format.
+func smallestDirent(a arch.Context) uint {
+ d := dirent{}
+ return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1
+}
+
+// smallestDirent64 returns the size of the smallest possible dirent using
+// the new linux dirent format.
+func smallestDirent64(a arch.Context) uint {
+ d := dirent{}
+ return uint(binary.Size(d.Hdr)) + a.Width()
+}
+
+// padRec pads the name field until the rec length is a multiple of the width,
+// which must be a power of 2. It returns the padded rec length.
+func (d *dirent) padRec(width int) uint16 {
+ a := int(binary.Size(d.Hdr)) + len(d.Name)
+ r := (a + width) &^ (width - 1)
+ padding := r - a
+ d.Name = append(d.Name, make([]byte, padding)...)
+ return uint16(r)
+}
+
+// Serialize64 serializes a Dirent struct to a byte slice, keeping the new
+// linux dirent format. Returns the number of bytes serialized or an error.
+func (d *dirent) Serialize64(w io.Writer) (int, error) {
+ n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr))
+ if err != nil {
+ return 0, err
+ }
+ n2, err := w.Write(d.Name)
+ if err != nil {
+ return 0, err
+ }
+ return n1 + n2, nil
+}
+
+// Serialize serializes a Dirent struct to a byte slice, using the old linux
+// dirent format.
+// Returns the number of bytes serialized or an error.
+func (d *dirent) Serialize(w io.Writer) (int, error) {
+ n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr))
+ if err != nil {
+ return 0, err
+ }
+ n2, err := w.Write(d.Name)
+ if err != nil {
+ return 0, err
+ }
+ n3, err := w.Write([]byte{d.Hdr.Typ})
+ if err != nil {
+ return 0, err
+ }
+ return n1 + n2 + n3, nil
+}
+
+// direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an
+// io.Writer.
+type direntSerializer struct {
+ serialize func(*dirent, io.Writer) (int, error)
+ w io.Writer
+ // width is the arch native value width.
+ width uint
+ // offset is the current dirent offset.
+ offset uint64
+ // written is the total bytes serialized.
+ written int
+ // size is the size of the buffer to serialize into.
+ size int
+}
+
+func newDirentSerializer(f func(d *dirent, w io.Writer) (int, error), w io.Writer, ac arch.Context, size int) *direntSerializer {
+ return &direntSerializer{
+ serialize: f,
+ w: w,
+ width: ac.Width(),
+ size: size,
+ }
+}
+
+// CopyOut implements fs.InodeOperationsInfoSerializer.CopyOut.
+// It serializes and writes the fs.DentAttr to the direntSerializer io.Writer.
+func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
+ ds.offset++
+
+ d := newDirent(ds.width, name, attr, ds.offset)
+
+ // Serialize dirent into a temp buffer.
+ var b bytes.Buffer
+ n, err := ds.serialize(d, &b)
+ if err != nil {
+ ds.offset--
+ return err
+ }
+
+ // Check that we have enough room remaining to write the dirent.
+ if n > (ds.size - ds.written) {
+ ds.offset--
+ return io.EOF
+ }
+
+ // Write out the temp buffer.
+ if _, err := b.WriteTo(ds.w); err != nil {
+ ds.offset--
+ return err
+ }
+
+ ds.written += n
+ return nil
+}
+
+// Written returns the total number of bytes written.
+func (ds *direntSerializer) Written() int {
+ return ds.written
+}
+
+// LINT.ThenChange(vfs2/getdents.go)
diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go
new file mode 100644
index 000000000..715ac45e6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_identity.go
@@ -0,0 +1,180 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ // As NGROUPS_MAX in include/uapi/linux/limits.h.
+ maxNGroups = 65536
+)
+
+// Getuid implements the Linux syscall getuid.
+func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ c := t.Credentials()
+ ruid := c.RealKUID.In(c.UserNamespace).OrOverflow()
+ return uintptr(ruid), nil, nil
+}
+
+// Geteuid implements the Linux syscall geteuid.
+func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ c := t.Credentials()
+ euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow()
+ return uintptr(euid), nil, nil
+}
+
+// Getresuid implements the Linux syscall getresuid.
+func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ ruidAddr := args[0].Pointer()
+ euidAddr := args[1].Pointer()
+ suidAddr := args[2].Pointer()
+ c := t.Credentials()
+ ruid := c.RealKUID.In(c.UserNamespace).OrOverflow()
+ euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow()
+ suid := c.SavedKUID.In(c.UserNamespace).OrOverflow()
+ if _, err := t.CopyOut(ruidAddr, ruid); err != nil {
+ return 0, nil, err
+ }
+ if _, err := t.CopyOut(euidAddr, euid); err != nil {
+ return 0, nil, err
+ }
+ if _, err := t.CopyOut(suidAddr, suid); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, nil
+}
+
+// Getgid implements the Linux syscall getgid.
+func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ c := t.Credentials()
+ rgid := c.RealKGID.In(c.UserNamespace).OrOverflow()
+ return uintptr(rgid), nil, nil
+}
+
+// Getegid implements the Linux syscall getegid.
+func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ c := t.Credentials()
+ egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow()
+ return uintptr(egid), nil, nil
+}
+
+// Getresgid implements the Linux syscall getresgid.
+func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ rgidAddr := args[0].Pointer()
+ egidAddr := args[1].Pointer()
+ sgidAddr := args[2].Pointer()
+ c := t.Credentials()
+ rgid := c.RealKGID.In(c.UserNamespace).OrOverflow()
+ egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow()
+ sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow()
+ if _, err := t.CopyOut(rgidAddr, rgid); err != nil {
+ return 0, nil, err
+ }
+ if _, err := t.CopyOut(egidAddr, egid); err != nil {
+ return 0, nil, err
+ }
+ if _, err := t.CopyOut(sgidAddr, sgid); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, nil
+}
+
+// Setuid implements the Linux syscall setuid.
+func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ uid := auth.UID(args[0].Int())
+ return 0, nil, t.SetUID(uid)
+}
+
+// Setreuid implements the Linux syscall setreuid.
+func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ ruid := auth.UID(args[0].Int())
+ euid := auth.UID(args[1].Int())
+ return 0, nil, t.SetREUID(ruid, euid)
+}
+
+// Setresuid implements the Linux syscall setreuid.
+func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ ruid := auth.UID(args[0].Int())
+ euid := auth.UID(args[1].Int())
+ suid := auth.UID(args[2].Int())
+ return 0, nil, t.SetRESUID(ruid, euid, suid)
+}
+
+// Setgid implements the Linux syscall setgid.
+func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ gid := auth.GID(args[0].Int())
+ return 0, nil, t.SetGID(gid)
+}
+
+// Setregid implements the Linux syscall setregid.
+func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ rgid := auth.GID(args[0].Int())
+ egid := auth.GID(args[1].Int())
+ return 0, nil, t.SetREGID(rgid, egid)
+}
+
+// Setresgid implements the Linux syscall setregid.
+func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ rgid := auth.GID(args[0].Int())
+ egid := auth.GID(args[1].Int())
+ sgid := auth.GID(args[2].Int())
+ return 0, nil, t.SetRESGID(rgid, egid, sgid)
+}
+
+// Getgroups implements the Linux syscall getgroups.
+func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ size := int(args[0].Int())
+ if size < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ kgids := t.Credentials().ExtraKGIDs
+ // "If size is zero, list is not modified, but the total number of
+ // supplementary group IDs for the process is returned." - getgroups(2)
+ if size == 0 {
+ return uintptr(len(kgids)), nil, nil
+ }
+ if size < len(kgids) {
+ return 0, nil, syserror.EINVAL
+ }
+ gids := make([]auth.GID, len(kgids))
+ for i, kgid := range kgids {
+ gids[i] = kgid.In(t.UserNamespace()).OrOverflow()
+ }
+ if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+ return 0, nil, err
+ }
+ return uintptr(len(gids)), nil, nil
+}
+
+// Setgroups implements the Linux syscall setgroups.
+func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ size := args[0].Int()
+ if size < 0 || size > maxNGroups {
+ return 0, nil, syserror.EINVAL
+ }
+ if size == 0 {
+ return 0, nil, t.SetExtraGIDs(nil)
+ }
+ gids := make([]auth.GID, size)
+ if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, t.SetExtraGIDs(gids)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
new file mode 100644
index 000000000..b2c7b3444
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -0,0 +1,133 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/anon"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC)
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+
+ if flags&^allFlags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ dirent := fs.NewDirent(t, anon.NewInode(t), "inotify")
+ fileFlags := fs.FileFlags{
+ Read: true,
+ Write: true,
+ NonBlocking: flags&linux.IN_NONBLOCK != 0,
+ }
+ n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t))
+ defer n.DecRef()
+
+ fd, err := t.NewFDFrom(0, n, kernel.FDFlags{
+ CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+ })
+
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[0].Value = 0
+ return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) {
+ file := t.GetFile(fd)
+ if file == nil {
+ // Invalid fd.
+ return nil, nil, syserror.EBADF
+ }
+
+ ino, ok := file.FileOperations.(*fs.Inotify)
+ if !ok {
+ // Not an inotify fd.
+ file.DecRef()
+ return nil, nil, syserror.EINVAL
+ }
+
+ return ino, file, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ mask := args[2].Uint()
+
+ // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+ // -- inotify(7)
+ resolve := mask&linux.IN_DONT_FOLLOW == 0
+
+ // "EINVAL: The given event mask contains no valid events."
+ // -- inotify_add_watch(2)
+ if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ino, file, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error {
+ // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7)
+ if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ // Copy out to the return frame.
+ fd = ino.AddWatch(dirent, mask)
+
+ return nil
+ })
+ return uintptr(fd), nil, err // Return from the existing value.
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ wd := args[1].Int()
+
+ ino, file, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+ return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
new file mode 100644
index 000000000..3f7691eae
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -0,0 +1,58 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// LINT.IfChange
+
+// Lseek implements linux syscall lseek(2).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ whence := args[2].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ var sw fs.SeekWhence
+ switch whence {
+ case 0:
+ sw = fs.SeekSet
+ case 1:
+ sw = fs.SeekCurrent
+ case 2:
+ sw = fs.SeekEnd
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ offset, serr := file.Seek(t, sw, offset)
+ err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(offset), nil, err
+}
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
new file mode 100644
index 000000000..9b4a5c3f1
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -0,0 +1,312 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// We unconditionally report a single NUMA node. This also means that our
+// "nodemask_t" is a single unsigned long (uint64).
+const (
+ maxNodes = 1
+ allowedNodemask = (1 << maxNodes) - 1
+)
+
+func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) {
+ // "nodemask points to a bit mask of node IDs that contains up to maxnode
+ // bits. The bit mask size is rounded to the next multiple of
+ // sizeof(unsigned long), but the kernel will use bits only up to maxnode.
+ // A NULL value of nodemask or a maxnode value of zero specifies the empty
+ // set of nodes. If the value of maxnode is zero, the nodemask argument is
+ // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
+ // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
+ // maxnode-1, not maxnode, as the number of bits.
+ bits := maxnode - 1
+ if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+ return 0, syserror.EINVAL
+ }
+ if bits == 0 {
+ return 0, nil
+ }
+ // Copy in the whole nodemask.
+ numUint64 := (bits + 63) / 64
+ buf := t.CopyScratchBuffer(int(numUint64) * 8)
+ if _, err := t.CopyInBytes(addr, buf); err != nil {
+ return 0, err
+ }
+ val := usermem.ByteOrder.Uint64(buf)
+ // Check that only allowed bits in the first unsigned long in the nodemask
+ // are set.
+ if val&^allowedNodemask != 0 {
+ return 0, syserror.EINVAL
+ }
+ // Check that all remaining bits in the nodemask are 0.
+ for i := 8; i < len(buf); i++ {
+ if buf[i] != 0 {
+ return 0, syserror.EINVAL
+ }
+ }
+ return val, nil
+}
+
+func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error {
+ // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of
+ // bits.
+ bits := maxnode - 1
+ if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+ return syserror.EINVAL
+ }
+ if bits == 0 {
+ return nil
+ }
+ // Copy out the first unsigned long in the nodemask.
+ buf := t.CopyScratchBuffer(8)
+ usermem.ByteOrder.PutUint64(buf, val)
+ if _, err := t.CopyOutBytes(addr, buf); err != nil {
+ return err
+ }
+ // Zero out remaining unsigned longs in the nodemask.
+ if bits > 64 {
+ remAddr, ok := addr.AddLength(8)
+ if !ok {
+ return syserror.EFAULT
+ }
+ remUint64 := (bits - 1) / 64
+ if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// GetMempolicy implements the syscall get_mempolicy(2).
+func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ mode := args[0].Pointer()
+ nodemask := args[1].Pointer()
+ maxnode := args[2].Uint()
+ addr := args[3].Pointer()
+ flags := args[4].Uint()
+
+ if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ nodeFlag := flags&linux.MPOL_F_NODE != 0
+ addrFlag := flags&linux.MPOL_F_ADDR != 0
+ memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
+
+ // "EINVAL: The value specified by maxnode is less than the number of node
+ // IDs supported by the system." - get_mempolicy(2)
+ if nodemask != 0 && maxnode < maxNodes {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
+ // ignored and the set of nodes (memories) that the thread is allowed to
+ // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
+ // absence of any mode flags) is returned in nodemask."
+ if memsAllowed {
+ // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
+ // MPOL_F_ADDR or MPOL_F_NODE."
+ if nodeFlag || addrFlag {
+ return 0, nil, syserror.EINVAL
+ }
+ if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, nil
+ }
+
+ // "If flags specifies MPOL_F_ADDR, then information is returned about the
+ // policy governing the memory address given in addr. ... If the mode
+ // argument is not NULL, then get_mempolicy() will store the policy mode
+ // and any optional mode flags of the requested NUMA policy in the location
+ // pointed to by this argument. If nodemask is not NULL, then the nodemask
+ // associated with the policy will be stored in the location pointed to by
+ // this argument."
+ if addrFlag {
+ policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if nodeFlag {
+ // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
+ // get_mempolicy() will return the node ID of the node on which the
+ // address addr is allocated into the location pointed to by mode.
+ // If no page has yet been allocated for the specified address,
+ // get_mempolicy() will allocate a page as if the thread had
+ // performed a read (load) access to that address, and return the
+ // ID of the node where that page was allocated."
+ buf := t.CopyScratchBuffer(1)
+ _, err := t.CopyInBytes(addr, buf)
+ if err != nil {
+ return 0, nil, err
+ }
+ policy = linux.MPOL_DEFAULT // maxNodes == 1
+ }
+ if mode != 0 {
+ if _, err := policy.CopyOut(t, mode); err != nil {
+ return 0, nil, err
+ }
+ }
+ if nodemask != 0 {
+ if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+ }
+
+ // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
+ // not specify MPOL_F_ADDR and addr is not NULL." This is partially
+ // inaccurate: if flags specifies MPOL_F_ADDR,
+ // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
+ // just (usually) fail to find a VMA at address 0 and return EFAULT.
+ if addr != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If flags is specified as 0, then information about the calling thread's
+ // default policy (as set by set_mempolicy(2)) is returned, in the buffers
+ // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
+ // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
+ // then get_mempolicy() will return in the location pointed to by a
+ // non-NULL mode argument, the node ID of the next node that will be used
+ // for interleaving of internal kernel pages allocated on behalf of the
+ // thread."
+ policy, nodemaskVal := t.NumaPolicy()
+ if nodeFlag {
+ if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
+ return 0, nil, syserror.EINVAL
+ }
+ policy = linux.MPOL_DEFAULT // maxNodes == 1
+ }
+ if mode != 0 {
+ if _, err := policy.CopyOut(t, mode); err != nil {
+ return 0, nil, err
+ }
+ }
+ if nodemask != 0 {
+ if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// SetMempolicy implements the syscall set_mempolicy(2).
+func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ modeWithFlags := linux.NumaPolicy(args[0].Int())
+ nodemask := args[1].Pointer()
+ maxnode := args[2].Uint()
+
+ modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ t.SetNumaPolicy(modeWithFlags, nodemaskVal)
+ return 0, nil, nil
+}
+
+// Mbind implements the syscall mbind(2).
+func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Uint64()
+ mode := linux.NumaPolicy(args[2].Int())
+ nodemask := args[3].Pointer()
+ maxnode := args[4].Uint()
+ flags := args[5].Uint()
+
+ if flags&^linux.MPOL_MF_VALID != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be
+ // privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
+ if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) {
+ return 0, nil, syserror.EPERM
+ }
+
+ mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Since we claim to have only a single node, all flags can be ignored
+ // (since all pages must already be on that single node).
+ err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal)
+ return 0, nil, err
+}
+
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) {
+ flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS)
+ mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS)
+ if flags == linux.MPOL_MODE_FLAGS {
+ // Can't specify both mode flags simultaneously.
+ return 0, 0, syserror.EINVAL
+ }
+ if mode < 0 || mode >= linux.MPOL_MAX {
+ // Must specify a valid mode.
+ return 0, 0, syserror.EINVAL
+ }
+
+ var nodemaskVal uint64
+ if nodemask != 0 {
+ var err error
+ nodemaskVal, err = copyInNodemask(t, nodemask, maxnode)
+ if err != nil {
+ return 0, 0, err
+ }
+ }
+
+ switch mode {
+ case linux.MPOL_DEFAULT:
+ // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate;
+ // Linux allows a nodemask to be specified, as long as it is empty.
+ if nodemaskVal != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
+ // These require a non-empty nodemask.
+ if nodemaskVal == 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_PREFERRED:
+ // This permits an empty nodemask, as long as no flags are set.
+ if nodemaskVal == 0 && flags != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_LOCAL:
+ // This requires an empty nodemask and no flags set ...
+ if nodemaskVal != 0 || flags != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ // ... and is implemented as MPOL_PREFERRED.
+ mode = linux.MPOL_PREFERRED
+ default:
+ // Unknown mode, which we should have rejected above.
+ panic(fmt.Sprintf("unknown mode: %v", mode))
+ }
+
+ return mode | flags, nodemaskVal, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
new file mode 100644
index 000000000..91694d374
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -0,0 +1,332 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "bytes"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Brk implements linux syscall brk(2).
+func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr, _ := t.MemoryManager().Brk(t, args[0].Pointer())
+ // "However, the actual Linux system call returns the new program break on
+ // success. On failure, the system call returns the current break." -
+ // brk(2)
+ return uintptr(addr), nil, nil
+}
+
+// LINT.IfChange
+
+// Mmap implements linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ prot := args[2].Int()
+ flags := args[3].Int()
+ fd := args[4].Int()
+ fixed := flags&linux.MAP_FIXED != 0
+ private := flags&linux.MAP_PRIVATE != 0
+ shared := flags&linux.MAP_SHARED != 0
+ anon := flags&linux.MAP_ANONYMOUS != 0
+ map32bit := flags&linux.MAP_32BIT != 0
+
+ // Require exactly one of MAP_PRIVATE and MAP_SHARED.
+ if private == shared {
+ return 0, nil, syserror.EINVAL
+ }
+
+ opts := memmap.MMapOpts{
+ Length: args[1].Uint64(),
+ Offset: args[5].Uint64(),
+ Addr: args[0].Pointer(),
+ Fixed: fixed,
+ Unmap: fixed,
+ Map32Bit: map32bit,
+ Private: private,
+ Perms: usermem.AccessType{
+ Read: linux.PROT_READ&prot != 0,
+ Write: linux.PROT_WRITE&prot != 0,
+ Execute: linux.PROT_EXEC&prot != 0,
+ },
+ MaxPerms: usermem.AnyAccess,
+ GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+ Precommit: linux.MAP_POPULATE&flags != 0,
+ }
+ if linux.MAP_LOCKED&flags != 0 {
+ opts.MLockMode = memmap.MLockEager
+ }
+ defer func() {
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef()
+ }
+ }()
+
+ if !anon {
+ // Convert the passed FD to a file reference.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ flags := file.Flags()
+ // mmap unconditionally requires that the FD is readable.
+ if !flags.Read {
+ return 0, nil, syserror.EACCES
+ }
+ // MAP_SHARED requires that the FD be writable for PROT_WRITE.
+ if shared && !flags.Write {
+ opts.MaxPerms.Write = false
+ }
+
+ if err := file.ConfigureMMap(t, &opts); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ rv, err := t.MemoryManager().MMap(t, opts)
+ return uintptr(rv), nil, err
+}
+
+// LINT.ThenChange(vfs2/mmap.go)
+
+// Munmap implements linux syscall munmap(2).
+func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
+}
+
+// Mremap implements linux syscall mremap(2).
+func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldAddr := args[0].Pointer()
+ oldSize := args[1].Uint64()
+ newSize := args[2].Uint64()
+ flags := args[3].Uint64()
+ newAddr := args[4].Pointer()
+
+ if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ mayMove := flags&linux.MREMAP_MAYMOVE != 0
+ fixed := flags&linux.MREMAP_FIXED != 0
+ var moveMode mm.MRemapMoveMode
+ switch {
+ case !mayMove && !fixed:
+ moveMode = mm.MRemapNoMove
+ case mayMove && !fixed:
+ moveMode = mm.MRemapMayMove
+ case mayMove && fixed:
+ moveMode = mm.MRemapMustMove
+ case !mayMove && fixed:
+ // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
+ // specified." - mremap(2)
+ return 0, nil, syserror.EINVAL
+ }
+
+ rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
+ Move: moveMode,
+ NewAddr: newAddr,
+ })
+ return uintptr(rv), nil, err
+}
+
+// Mprotect implements linux syscall mprotect(2).
+func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ length := args[1].Uint64()
+ prot := args[2].Int()
+ err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{
+ Read: linux.PROT_READ&prot != 0,
+ Write: linux.PROT_WRITE&prot != 0,
+ Execute: linux.PROT_EXEC&prot != 0,
+ }, linux.PROT_GROWSDOWN&prot != 0)
+ return 0, nil, err
+}
+
+// Madvise implements linux syscall madvise(2).
+func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := uint64(args[1].SizeT())
+ adv := args[2].Int()
+
+ // "The Linux implementation requires that the address addr be
+ // page-aligned, and allows length to be zero." - madvise(2)
+ if addr.RoundDown() != addr {
+ return 0, nil, syserror.EINVAL
+ }
+ if length == 0 {
+ return 0, nil, nil
+ }
+ // Not explicitly stated: length need not be page-aligned.
+ lenAddr, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+ length = uint64(lenAddr)
+
+ switch adv {
+ case linux.MADV_DONTNEED:
+ return 0, nil, t.MemoryManager().Decommit(addr, length)
+ case linux.MADV_DOFORK:
+ return 0, nil, t.MemoryManager().SetDontFork(addr, length, false)
+ case linux.MADV_DONTFORK:
+ return 0, nil, t.MemoryManager().SetDontFork(addr, length, true)
+ case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
+ fallthrough
+ case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
+ fallthrough
+ case linux.MADV_DONTDUMP, linux.MADV_DODUMP:
+ // TODO(b/72045799): Core dumping isn't implemented, so these are
+ // no-ops.
+ fallthrough
+ case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
+ // Do nothing, we totally ignore the suggestions above.
+ return 0, nil, nil
+ case linux.MADV_REMOVE:
+ // These "suggestions" have application-visible side effects, so we
+ // have to indicate that we don't support them.
+ return 0, nil, syserror.ENOSYS
+ case linux.MADV_HWPOISON:
+ // Only privileged processes are allowed to poison pages.
+ return 0, nil, syserror.EPERM
+ default:
+ // If adv is not a valid value tell the caller.
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+// Mincore implements the syscall mincore(2).
+func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+ vec := args[2].Pointer()
+
+ if addr != addr.RoundDown() {
+ return 0, nil, syserror.EINVAL
+ }
+ // "The length argument need not be a multiple of the page size, but since
+ // residency information is returned for whole pages, length is effectively
+ // rounded up to the next multiple of the page size." - mincore(2)
+ la, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return 0, nil, syserror.ENOMEM
+ }
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return 0, nil, syserror.ENOMEM
+ }
+
+ // Pretend that all mapped pages are "resident in core".
+ mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
+ // "ENOMEM: addr to addr + length contained unmapped memory."
+ if mapped != uint64(la) {
+ return 0, nil, syserror.ENOMEM
+ }
+ resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize))
+ _, err := t.CopyOut(vec, resident)
+ return 0, nil, err
+}
+
+// Msync implements Linux syscall msync(2).
+func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+ flags := args[2].Int()
+
+ // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
+ // and may additionally include the MS_INVALIDATE bit. ... However, Linux
+ // permits a call to msync() that specifies neither of these flags, with
+ // semantics that are (currently) equivalent to specifying MS_ASYNC." -
+ // msync(2)
+ if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ sync := flags&linux.MS_SYNC != 0
+ if sync && flags&linux.MS_ASYNC != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
+ Sync: sync,
+ Invalidate: flags&linux.MS_INVALIDATE != 0,
+ })
+ // MSync calls fsync, the same interrupt conversion rules apply, see
+ // mm/msync.c, fsync POSIX.1-2008.
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// Mlock implements linux syscall mlock(2).
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+
+ return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
+}
+
+// Mlock2 implements linux syscall mlock2(2).
+func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+ flags := args[2].Int()
+
+ if flags&^(linux.MLOCK_ONFAULT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ mode := memmap.MLockEager
+ if flags&linux.MLOCK_ONFAULT != 0 {
+ mode = memmap.MLockLazy
+ }
+ return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
+}
+
+// Munlock implements linux syscall munlock(2).
+func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+
+ return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
+}
+
+// Mlockall implements linux syscall mlockall(2).
+func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+
+ if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ mode := memmap.MLockEager
+ if flags&linux.MCL_ONFAULT != 0 {
+ mode = memmap.MLockLazy
+ }
+ return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
+ Current: flags&linux.MCL_CURRENT != 0,
+ Future: flags&linux.MCL_FUTURE != 0,
+ Mode: mode,
+ })
+}
+
+// Munlockall implements linux syscall munlockall(2).
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
+ Current: true,
+ Future: true,
+ Mode: memmap.MLockNone,
+ })
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
new file mode 100644
index 000000000..eb5ff48f5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sourceAddr := args[0].Pointer()
+ targetAddr := args[1].Pointer()
+ typeAddr := args[2].Pointer()
+ flags := args[3].Uint64()
+ dataAddr := args[4].Pointer()
+
+ fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ sourcePath, _, err := copyInPath(t, sourceAddr, true /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ targetPath, _, err := copyInPath(t, targetAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ data := ""
+ if dataAddr != 0 {
+ // In Linux, a full page is always copied in regardless of null
+ // character placement, and the address is passed to each file system.
+ // Most file systems always treat this data as a string, though, and so
+ // do all of the ones we implement.
+ data, err = t.CopyInString(dataAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ // Ignore magic value that was required before Linux 2.4.
+ if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+ flags = flags &^ linux.MS_MGC_MSK
+ }
+
+ // Must have CAP_SYS_ADMIN in the mount namespace's associated user
+ // namespace.
+ if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+ linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+ linux.MS_UNBINDABLE | linux.MS_MOVE
+
+ // Silently allow MS_NOSUID, since we don't implement set-id bits
+ // anyway.
+ const unsupportedFlags = linux.MS_NODEV |
+ linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+ // Linux just allows passing any flags to mount(2) - it won't fail when
+ // unknown or unsupported flags are passed. Since we don't implement
+ // everything, we fail explicitly on flags that are unimplemented.
+ if flags&(unsupportedOps|unsupportedFlags) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ rsys, ok := fs.FindFilesystem(fsType)
+ if !ok {
+ return 0, nil, syserror.ENODEV
+ }
+ if !rsys.AllowUserMount() {
+ return 0, nil, syserror.EPERM
+ }
+
+ var superFlags fs.MountSourceFlags
+ if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+ superFlags.NoAtime = true
+ }
+ if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+ superFlags.ReadOnly = true
+ }
+ if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+ superFlags.NoExec = true
+ }
+
+ rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil)
+ if err != nil {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // Mount will take a reference on rootInode if successful.
+ return t.MountNamespace().Mount(t, d, rootInode)
+ }); err != nil {
+ // Something went wrong. Drop our ref on rootInode before
+ // returning the error.
+ rootInode.DecRef()
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Int()
+
+ const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+ if flags&unsupported != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Must have CAP_SYS_ADMIN in the mount namespace's associated user
+ // namespace.
+ //
+ // Currently, this is always the init task's user namespace.
+ if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) {
+ return 0, nil, syserror.EPERM
+ }
+
+ resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW
+ detachOnly := flags&linux.MNT_DETACH == linux.MNT_DETACH
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return t.MountNamespace().Unmount(t, d, detachOnly)
+ })
+}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
new file mode 100644
index 000000000..43c510930
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// pipe2 implements the actual system call with flags.
+func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
+ if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+ return 0, syserror.EINVAL
+ }
+ r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
+
+ r.SetFlags(linuxToFlags(flags).Settable())
+ defer r.DecRef()
+
+ w.SetFlags(linuxToFlags(flags).Settable())
+ defer w.DecRef()
+
+ fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ if _, err := t.CopyOut(addr, fds); err != nil {
+ for _, fd := range fds {
+ if file, _ := t.FDTable().Remove(fd); file != nil {
+ file.DecRef()
+ }
+ }
+ return 0, err
+ }
+ return 0, nil
+}
+
+// Pipe implements linux syscall pipe(2).
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ n, err := pipe2(t, addr, 0)
+ return n, nil, err
+}
+
+// Pipe2 implements linux syscall pipe2(2).
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := uint(args[1].Uint())
+
+ n, err := pipe2(t, addr, flags)
+ return n, nil, err
+}
+
+// LINT.ThenChange(vfs2/pipe.go)
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
new file mode 100644
index 000000000..f0198141c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -0,0 +1,545 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+const (
+ // selectReadEvents is analogous to the Linux kernel's
+ // fs/select.c:POLLIN_SET.
+ selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+ // selectWriteEvents is analogous to the Linux kernel's
+ // fs/select.c:POLLOUT_SET.
+ selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+ // selectExceptEvents is analogous to the Linux kernel's
+ // fs/select.c:POLLEX_SET.
+ selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file descriptor and waiter of a PollFD.
+type pollState struct {
+ file *fs.File
+ waiter waiter.Entry
+}
+
+// initReadiness gets the current ready mask for the file represented by the FD
+// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
+// used to register with the file for event notifications, and a reference to
+// the file is stored in "state".
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+ if pfd.FD < 0 {
+ pfd.REvents = 0
+ return
+ }
+
+ file := t.GetFile(pfd.FD)
+ if file == nil {
+ pfd.REvents = linux.POLLNVAL
+ return
+ }
+
+ if ch == nil {
+ defer file.DecRef()
+ } else {
+ state.file = file
+ state.waiter, _ = waiter.NewChannelEntry(ch)
+ file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
+ }
+
+ r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
+ pfd.REvents = int16(r.ToLinux()) & pfd.Events
+}
+
+// releaseState releases all the pollState in "state".
+func releaseState(state []pollState) {
+ for i := range state {
+ if state[i].file != nil {
+ state[i].file.EventUnregister(&state[i].waiter)
+ state[i].file.DecRef()
+ }
+ }
+}
+
+// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
+// when "timeout" is greater than zero.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
+// positive if interrupted by a signal.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+ var ch chan struct{}
+ if timeout != 0 {
+ ch = make(chan struct{}, 1)
+ }
+
+ // Register for event notification in the files involved if we may
+ // block (timeout not zero). Once we find a file that has a non-zero
+ // result, we stop registering for events but still go through all files
+ // to get their ready masks.
+ state := make([]pollState, len(pfd))
+ defer releaseState(state)
+ n := uintptr(0)
+ for i := range pfd {
+ initReadiness(t, &pfd[i], &state[i], ch)
+ if pfd[i].REvents != 0 {
+ n++
+ ch = nil
+ }
+ }
+
+ if timeout == 0 {
+ return timeout, n, nil
+ }
+
+ forever := timeout < 0
+
+ for n == 0 {
+ var err error
+ // Wait for a notification.
+ timeout, err = t.BlockWithTimeout(ch, !forever, timeout)
+ if err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = nil
+ }
+ return timeout, 0, err
+ }
+
+ // We got notified, count how many files are ready. If none,
+ // then this was a spurious notification, and we just go back
+ // to sleep with the remaining timeout.
+ for i := range state {
+ if state[i].file == nil {
+ continue
+ }
+
+ r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+ rl := int16(r.ToLinux()) & pfd[i].Events
+ if rl != 0 {
+ pfd[i].REvents = rl
+ n++
+ }
+ }
+ }
+
+ return timeout, n, nil
+}
+
+// CopyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
+func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+ if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+ return nil, syserror.EINVAL
+ }
+
+ pfd := make([]linux.PollFD, nfds)
+ if nfds > 0 {
+ if _, err := t.CopyIn(addr, &pfd); err != nil {
+ return nil, err
+ }
+ }
+
+ return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+ pfd, err := CopyInPollFDs(t, addr, nfds)
+ if err != nil {
+ return timeout, 0, err
+ }
+
+ // Compatibility warning: Linux adds POLLHUP and POLLERR just before
+ // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+ // polling, changing event masks here is an application-visible difference.
+ // (Linux also doesn't copy out event masks at all, only revents.)
+ for i := range pfd {
+ pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+ }
+ remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+ err = syserror.ConvertIntr(err, syserror.EINTR)
+
+ // The poll entries are copied out regardless of whether
+ // any are set or not. This aligns with the Linux behavior.
+ if nfds > 0 && err == nil {
+ if _, err := t.CopyOut(addr, pfd); err != nil {
+ return remainingTimeout, 0, err
+ }
+ }
+
+ return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+ set := make([]byte, nBytes)
+
+ if addr != 0 {
+ if _, err := t.CopyIn(addr, &set); err != nil {
+ return nil, err
+ }
+ // If we only use part of the last byte, mask out the extraneous bits.
+ //
+ // N.B. This only works on little-endian architectures.
+ if nBitsInLastPartialByte != 0 {
+ set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+ }
+ }
+ return set, nil
+}
+
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+ if nfds < 0 || nfds > fileCap {
+ return 0, syserror.EINVAL
+ }
+
+ // Calculate the size of the fd sets (one bit per fd).
+ nBytes := (nfds + 7) / 8
+ nBitsInLastPartialByte := nfds % 8
+
+ // Capture all the provided input vectors.
+ r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
+ }
+ w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
+ }
+ e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
+ }
+
+ // Count how many FDs are actually being requested so that we can build
+ // a PollFD array.
+ fdCount := 0
+ for i := 0; i < nBytes; i++ {
+ v := r[i] | w[i] | e[i]
+ for v != 0 {
+ v &= (v - 1)
+ fdCount++
+ }
+ }
+
+ // Build the PollFD array.
+ pfd := make([]linux.PollFD, 0, fdCount)
+ var fd int32
+ for i := 0; i < nBytes; i++ {
+ rV, wV, eV := r[i], w[i], e[i]
+ v := rV | wV | eV
+ m := byte(1)
+ for j := 0; j < 8; j++ {
+ if (v & m) != 0 {
+ // Make sure the fd is valid and decrement the reference
+ // immediately to ensure we don't leak. Note, another thread
+ // might be about to close fd. This is racy, but that's
+ // OK. Linux is racy in the same way.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ file.DecRef()
+
+ var mask int16
+ if (rV & m) != 0 {
+ mask |= selectReadEvents
+ }
+
+ if (wV & m) != 0 {
+ mask |= selectWriteEvents
+ }
+
+ if (eV & m) != 0 {
+ mask |= selectExceptEvents
+ }
+
+ pfd = append(pfd, linux.PollFD{
+ FD: fd,
+ Events: mask,
+ })
+ }
+
+ fd++
+ m <<= 1
+ }
+ }
+
+ // Do the syscall, then count the number of bits set.
+ if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+ return 0, syserror.ConvertIntr(err, syserror.EINTR)
+ }
+
+ // r, w, and e are currently event mask bitsets; unset bits corresponding
+ // to events that *didn't* occur.
+ bitSetCount := uintptr(0)
+ for idx := range pfd {
+ events := pfd[idx].REvents
+ i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+ m := byte(1) << j
+ if r[i]&m != 0 {
+ if (events & selectReadEvents) != 0 {
+ bitSetCount++
+ } else {
+ r[i] &^= m
+ }
+ }
+ if w[i]&m != 0 {
+ if (events & selectWriteEvents) != 0 {
+ bitSetCount++
+ } else {
+ w[i] &^= m
+ }
+ }
+ if e[i]&m != 0 {
+ if (events & selectExceptEvents) != 0 {
+ bitSetCount++
+ } else {
+ e[i] &^= m
+ }
+ }
+ }
+
+ // Copy updated vectors back.
+ if readFDs != 0 {
+ if _, err := t.CopyOut(readFDs, r); err != nil {
+ return 0, err
+ }
+ }
+
+ if writeFDs != 0 {
+ if _, err := t.CopyOut(writeFDs, w); err != nil {
+ return 0, err
+ }
+ }
+
+ if exceptFDs != 0 {
+ if _, err := t.CopyOut(exceptFDs, e); err != nil {
+ return 0, err
+ }
+ }
+
+ return bitSetCount, nil
+}
+
+// timeoutRemaining returns the amount of time remaining for the specified
+// timeout or 0 if it has elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+ now := t.Kernel().MonotonicClock().Now()
+ remaining := timeout - now.Sub(startNs)
+ if remaining < 0 {
+ remaining = 0
+ }
+ return remaining
+}
+
+// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+ if timeout <= 0 {
+ return nil
+ }
+ remaining := timeoutRemaining(t, startNs, timeout)
+ tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
+ return copyTimespecOut(t, timespecAddr, &tsRemaining)
+}
+
+// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+ if timeout <= 0 {
+ return nil
+ }
+ remaining := timeoutRemaining(t, startNs, timeout)
+ tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
+ return copyTimevalOut(t, timevalAddr, &tvRemaining)
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// +stateify savable
+type pollRestartBlock struct {
+ pfdAddr usermem.Addr
+ nfds uint
+ timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+ return poll(t, p.pfdAddr, p.nfds, p.timeout)
+}
+
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+ remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+ // On an interrupt poll(2) is restarted with the remaining timeout.
+ if err == syserror.EINTR {
+ t.SetSyscallRestartBlock(&pollRestartBlock{
+ pfdAddr: pfdAddr,
+ nfds: nfds,
+ timeout: remainingTimeout,
+ })
+ return 0, kernel.ERESTART_RESTARTBLOCK
+ }
+ return n, err
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pfdAddr := args[0].Pointer()
+ nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+ timeout := time.Duration(args[2].Int()) * time.Millisecond
+ n, err := poll(t, pfdAddr, nfds, timeout)
+ return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pfdAddr := args[0].Pointer()
+ nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+ timespecAddr := args[2].Pointer()
+ maskAddr := args[3].Pointer()
+ maskSize := uint(args[4].Uint())
+
+ timeout, err := copyTimespecInToDuration(t, timespecAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var startNs ktime.Time
+ if timeout > 0 {
+ startNs = t.Kernel().MonotonicClock().Now()
+ }
+
+ if maskAddr != 0 {
+ mask, err := CopyInSigSet(t, maskAddr, maskSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ oldmask := t.SignalMask()
+ t.SetSignalMask(mask)
+ t.SetSavedSignalMask(oldmask)
+ }
+
+ _, n, err := doPoll(t, pfdAddr, nfds, timeout)
+ copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+ // doPoll returns EINTR if interrupted, but ppoll is normally restartable
+ // if interrupted by something other than a signal handled by the
+ // application (i.e. returns ERESTARTNOHAND). However, if
+ // copyOutTimespecRemaining failed, then the restarted ppoll would use the
+ // wrong timeout, so the error should be left as EINTR.
+ //
+ // Note that this means that if err is nil but copyErr is not, copyErr is
+ // ignored. This is consistent with Linux.
+ if err == syserror.EINTR && copyErr == nil {
+ err = kernel.ERESTARTNOHAND
+ }
+ return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nfds := int(args[0].Int()) // select(2) uses an int.
+ readFDs := args[1].Pointer()
+ writeFDs := args[2].Pointer()
+ exceptFDs := args[3].Pointer()
+ timevalAddr := args[4].Pointer()
+
+ // Use a negative Duration to indicate "no timeout".
+ timeout := time.Duration(-1)
+ if timevalAddr != 0 {
+ timeval, err := copyTimevalIn(t, timevalAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if timeval.Sec < 0 || timeval.Usec < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ timeout = time.Duration(timeval.ToNsecCapped())
+ }
+ startNs := t.Kernel().MonotonicClock().Now()
+ n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+ copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+ // See comment in Ppoll.
+ if err == syserror.EINTR && copyErr == nil {
+ err = kernel.ERESTARTNOHAND
+ }
+ return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nfds := int(args[0].Int()) // select(2) uses an int.
+ readFDs := args[1].Pointer()
+ writeFDs := args[2].Pointer()
+ exceptFDs := args[3].Pointer()
+ timespecAddr := args[4].Pointer()
+ maskWithSizeAddr := args[5].Pointer()
+
+ timeout, err := copyTimespecInToDuration(t, timespecAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var startNs ktime.Time
+ if timeout > 0 {
+ startNs = t.Kernel().MonotonicClock().Now()
+ }
+
+ if maskWithSizeAddr != 0 {
+ maskAddr, size, err := copyInSigSetWithSize(t, maskWithSizeAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if maskAddr != 0 {
+ mask, err := CopyInSigSet(t, maskAddr, size)
+ if err != nil {
+ return 0, nil, err
+ }
+ oldmask := t.SignalMask()
+ t.SetSignalMask(mask)
+ t.SetSavedSignalMask(oldmask)
+ }
+ }
+
+ n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+ copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+ // See comment in Ppoll.
+ if err == syserror.EINTR && copyErr == nil {
+ err = kernel.ERESTARTNOHAND
+ }
+ return n, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
new file mode 100644
index 000000000..f92bf8096
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -0,0 +1,228 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Prctl implements linux syscall prctl(2).
+// It has a list of subfunctions which operate on the process. The arguments are
+// all based on each subfunction.
+func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ option := args[0].Int()
+
+ switch option {
+ case linux.PR_SET_PDEATHSIG:
+ sig := linux.Signal(args[1].Int())
+ if sig != 0 && !sig.IsValid() {
+ return 0, nil, syserror.EINVAL
+ }
+ t.SetParentDeathSignal(sig)
+ return 0, nil, nil
+
+ case linux.PR_GET_PDEATHSIG:
+ _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
+ return 0, nil, err
+
+ case linux.PR_GET_DUMPABLE:
+ d := t.MemoryManager().Dumpability()
+ switch d {
+ case mm.NotDumpable:
+ return linux.SUID_DUMP_DISABLE, nil, nil
+ case mm.UserDumpable:
+ return linux.SUID_DUMP_USER, nil, nil
+ case mm.RootDumpable:
+ return linux.SUID_DUMP_ROOT, nil, nil
+ default:
+ panic(fmt.Sprintf("Unknown dumpability %v", d))
+ }
+
+ case linux.PR_SET_DUMPABLE:
+ var d mm.Dumpability
+ switch args[1].Int() {
+ case linux.SUID_DUMP_DISABLE:
+ d = mm.NotDumpable
+ case linux.SUID_DUMP_USER:
+ d = mm.UserDumpable
+ default:
+ // N.B. Userspace may not pass SUID_DUMP_ROOT.
+ return 0, nil, syserror.EINVAL
+ }
+ t.MemoryManager().SetDumpability(d)
+ return 0, nil, nil
+
+ case linux.PR_GET_KEEPCAPS:
+ if t.Credentials().KeepCaps {
+ return 1, nil, nil
+ }
+
+ return 0, nil, nil
+
+ case linux.PR_SET_KEEPCAPS:
+ val := args[1].Int()
+ // prctl(2): arg2 must be either 0 (permitted capabilities are cleared)
+ // or 1 (permitted capabilities are kept).
+ if val == 0 {
+ t.SetKeepCaps(false)
+ } else if val == 1 {
+ t.SetKeepCaps(true)
+ } else {
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+
+ case linux.PR_SET_NAME:
+ addr := args[1].Pointer()
+ name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1)
+ if err != nil && err != syserror.ENAMETOOLONG {
+ return 0, nil, err
+ }
+ t.SetName(name)
+
+ case linux.PR_GET_NAME:
+ addr := args[1].Pointer()
+ buf := t.CopyScratchBuffer(linux.TASK_COMM_LEN)
+ len := copy(buf, t.Name())
+ if len < linux.TASK_COMM_LEN {
+ buf[len] = 0
+ len++
+ }
+ _, err := t.CopyOut(addr, buf[:len])
+ if err != nil {
+ return 0, nil, err
+ }
+
+ case linux.PR_SET_MM:
+ if !t.HasCapability(linux.CAP_SYS_RESOURCE) {
+ return 0, nil, syserror.EPERM
+ }
+
+ switch args[1].Int() {
+ case linux.PR_SET_MM_EXE_FILE:
+ fd := args[2].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // They trying to set exe to a non-file?
+ if !fs.IsFile(file.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Set the underlying executable.
+ t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
+
+ case linux.PR_SET_MM_AUXV,
+ linux.PR_SET_MM_START_CODE,
+ linux.PR_SET_MM_END_CODE,
+ linux.PR_SET_MM_START_DATA,
+ linux.PR_SET_MM_END_DATA,
+ linux.PR_SET_MM_START_STACK,
+ linux.PR_SET_MM_START_BRK,
+ linux.PR_SET_MM_BRK,
+ linux.PR_SET_MM_ARG_START,
+ linux.PR_SET_MM_ARG_END,
+ linux.PR_SET_MM_ENV_START,
+ linux.PR_SET_MM_ENV_END:
+
+ t.Kernel().EmitUnimplementedEvent(t)
+ fallthrough
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ case linux.PR_SET_NO_NEW_PRIVS:
+ if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ // PR_SET_NO_NEW_PRIVS is assumed to always be set.
+ // See kernel.Task.updateCredsForExecLocked.
+ return 0, nil, nil
+
+ case linux.PR_GET_NO_NEW_PRIVS:
+ if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ return 1, nil, nil
+
+ case linux.PR_SET_SECCOMP:
+ if args[1].Int() != linux.SECCOMP_MODE_FILTER {
+ // Unsupported mode.
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer())
+
+ case linux.PR_GET_SECCOMP:
+ return uintptr(t.SeccompMode()), nil, nil
+
+ case linux.PR_CAPBSET_READ:
+ cp := linux.Capability(args[1].Uint64())
+ if !cp.Ok() {
+ return 0, nil, syserror.EINVAL
+ }
+ var rv uintptr
+ if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 {
+ rv = 1
+ }
+ return rv, nil, nil
+
+ case linux.PR_CAPBSET_DROP:
+ cp := linux.Capability(args[1].Uint64())
+ if !cp.Ok() {
+ return 0, nil, syserror.EINVAL
+ }
+ return 0, nil, t.DropBoundingCapability(cp)
+
+ case linux.PR_GET_TIMING,
+ linux.PR_SET_TIMING,
+ linux.PR_GET_TSC,
+ linux.PR_SET_TSC,
+ linux.PR_TASK_PERF_EVENTS_DISABLE,
+ linux.PR_TASK_PERF_EVENTS_ENABLE,
+ linux.PR_GET_TIMERSLACK,
+ linux.PR_SET_TIMERSLACK,
+ linux.PR_MCE_KILL,
+ linux.PR_MCE_KILL_GET,
+ linux.PR_GET_TID_ADDRESS,
+ linux.PR_SET_CHILD_SUBREAPER,
+ linux.PR_GET_CHILD_SUBREAPER,
+ linux.PR_GET_THP_DISABLE,
+ linux.PR_SET_THP_DISABLE,
+ linux.PR_MPX_ENABLE_MANAGEMENT,
+ linux.PR_MPX_DISABLE_MANAGEMENT:
+
+ t.Kernel().EmitUnimplementedEvent(t)
+ fallthrough
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go
new file mode 100644
index 000000000..c0aa0fd60
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_random.go
@@ -0,0 +1,92 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "io"
+ "math"
+
+ "gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ _GRND_NONBLOCK = 0x1
+ _GRND_RANDOM = 0x2
+)
+
+// GetRandom implements the linux syscall getrandom(2).
+//
+// In a multi-tenant/shared environment, the only valid implementation is to
+// fetch data from the urandom pool, otherwise starvation attacks become
+// possible. The urandom pool is also expected to have plenty of entropy, thus
+// the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as
+// the pool will already be initialized.
+func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].SizeT()
+ flags := args[2].Int()
+
+ // Flags are checked for validity but otherwise ignored. See above.
+ if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if length > math.MaxInt32 {
+ length = math.MaxInt32
+ }
+ ar, ok := addr.ToRange(uint64(length))
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+
+ // "If the urandom source has been initialized, reads of up to 256 bytes
+ // will always return as many bytes as requested and will not be
+ // interrupted by signals. No such guarantees apply for larger buffer
+ // sizes." - getrandom(2)
+ min := int(length)
+ if min > 256 {
+ min = 256
+ }
+ n, err := t.MemoryManager().CopyOutFrom(t, usermem.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if n >= int64(min) {
+ return uintptr(n), nil, nil
+ }
+ return 0, nil, err
+}
+
+// randReader is a io.Reader that handles partial reads from rand.Reader.
+type randReader struct {
+ done int
+ min int
+}
+
+// Read implements io.Reader.Read.
+func (r *randReader) Read(dst []byte) (int, error) {
+ if r.done >= r.min {
+ return rand.Reader.Read(dst)
+ }
+ min := r.min - r.done
+ if min > len(dst) {
+ min = len(dst)
+ }
+ return io.ReadAtLeast(rand.Reader, dst, min)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
new file mode 100644
index 000000000..071b4bacc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -0,0 +1,394 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+const (
+ // EventMaskRead contains events that can be triggered on reads.
+ EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements linux syscall read(2). Note that we try to get a buffer that
+// is exactly the size requested because some applications like qemu expect
+// they can do large reads all at once. Bug for bug. Same for other read
+// calls below.
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is readable.
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := readv(t, file, dst)
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ size := args[2].SizeT()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is readable.
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is valid.
+ if int(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Return EINVAL; if the underlying file type does not support readahead,
+ // then Linux will return EINVAL to indicate as much. In the future, we
+ // may extend this function to actually support readahead hints.
+ return 0, nil, syserror.EINVAL
+}
+
+// Pread64 implements linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+ offset := args[3].Int64()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is reading at an offset supported?
+ if !file.Flags().Pread {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Check that the file is readable.
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := preadv(t, file, dst, offset)
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Readv implements linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is readable.
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Read the iovecs that specify the destination of the read.
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := readv(t, file, dst)
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+// Preadv implements linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is reading at an offset supported?
+ if !file.Flags().Pread {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Check that the file is readable.
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Read the iovecs that specify the destination of the read.
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := preadv(t, file, dst, offset)
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // While the syscall is
+ // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+ // the linux internal call
+ // (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1248)
+ // splits the offset argument into a high/low value for compatibility with
+ // 32-bit architectures. The flags argument is the 5th argument.
+
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+ flags := int(args[5].Int())
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < -1 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is reading at an offset supported?
+ if offset > -1 && !file.Flags().Pread {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Check that the file is readable.
+ if !file.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check flags field.
+ // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+ // accepted as a valid flag argument for preadv2.
+ if flags&^linux.RWF_VALID != 0 {
+ return 0, nil, syserror.EOPNOTSUPP
+ }
+
+ // Read the iovecs that specify the destination of the read.
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // If preadv2 is called with an offset of -1, readv is called.
+ if offset == -1 {
+ n, err := readv(t, file, dst)
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+ }
+
+ n, err := preadv(t, file, dst, offset)
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) {
+ n, err := f.Readv(t, dst)
+ if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+ if n > 0 {
+ // Queue notification if we read anything.
+ f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+ }
+ return n, err
+ }
+
+ // Sockets support read timeouts.
+ var haveDeadline bool
+ var deadline ktime.Time
+ if s, ok := f.FileOperations.(socket.Socket); ok {
+ dl := s.RecvTimeout()
+ if dl < 0 && err == syserror.ErrWouldBlock {
+ return n, err
+ }
+ if dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ }
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ f.EventRegister(&w, EventMaskRead)
+
+ total := n
+ for {
+ // Shorten dst to reflect bytes previously read.
+ dst = dst.DropFirst64(n)
+
+ // Issue the request and break out if it completes with anything
+ // other than "would block".
+ n, err = f.Readv(t, dst)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = syserror.ErrWouldBlock
+ }
+ break
+ }
+ }
+
+ f.EventUnregister(&w)
+
+ if total > 0 {
+ // Queue notification if we read anything.
+ f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+ }
+
+ return total, err
+}
+
+func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ n, err := f.Preadv(t, dst, offset)
+ if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+ if n > 0 {
+ // Queue notification if we read anything.
+ f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+ }
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ f.EventRegister(&w, EventMaskRead)
+
+ total := n
+ for {
+ // Shorten dst to reflect bytes previously read.
+ dst = dst.DropFirst64(n)
+
+ // Issue the request and break out if it completes with anything
+ // other than "would block".
+ n, err = f.Preadv(t, dst, offset+total)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.Block(ch); err != nil {
+ break
+ }
+ }
+
+ f.EventUnregister(&w)
+
+ if total > 0 {
+ // Queue notification if we read anything.
+ f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+ }
+
+ return total, err
+}
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
new file mode 100644
index 000000000..d5d5b6959
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -0,0 +1,224 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rlimit describes an implementation of 'struct rlimit', which may vary from
+// system-to-system.
+type rlimit interface {
+ // toLimit converts an rlimit to a limits.Limit.
+ toLimit() *limits.Limit
+
+ // fromLimit converts a limits.Limit to an rlimit.
+ fromLimit(lim limits.Limit)
+
+ // copyIn copies an rlimit from the untrusted app to the kernel.
+ copyIn(t *kernel.Task, addr usermem.Addr) error
+
+ // copyOut copies an rlimit from the kernel to the untrusted app.
+ copyOut(t *kernel.Task, addr usermem.Addr) error
+}
+
+// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system.
+func newRlimit(t *kernel.Task) (rlimit, error) {
+ switch t.Arch().Width() {
+ case 8:
+ // On 64-bit system, struct rlimit and struct rlimit64 are identical.
+ return &rlimit64{}, nil
+ default:
+ return nil, syserror.ENOSYS
+ }
+}
+
+type rlimit64 struct {
+ Cur uint64
+ Max uint64
+}
+
+func (r *rlimit64) toLimit() *limits.Limit {
+ return &limits.Limit{
+ Cur: limits.FromLinux(r.Cur),
+ Max: limits.FromLinux(r.Max),
+ }
+}
+
+func (r *rlimit64) fromLimit(lim limits.Limit) {
+ *r = rlimit64{
+ Cur: limits.ToLinux(lim.Cur),
+ Max: limits.ToLinux(lim.Max),
+ }
+}
+
+func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error {
+ _, err := t.CopyIn(addr, r)
+ return err
+}
+
+func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error {
+ _, err := t.CopyOut(addr, *r)
+ return err
+}
+
+func makeRlimit64(lim limits.Limit) *rlimit64 {
+ return &rlimit64{Cur: lim.Cur, Max: lim.Max}
+}
+
+// setableLimits is the set of supported setable limits.
+var setableLimits = map[limits.LimitType]struct{}{
+ limits.NumberOfFiles: {},
+ limits.AS: {},
+ limits.CPU: {},
+ limits.Data: {},
+ limits.FileSize: {},
+ limits.MemoryLocked: {},
+ limits.Stack: {},
+ // These are not enforced, but we include them here to avoid returning
+ // EPERM, since some apps expect them to succeed.
+ limits.Core: {},
+ limits.ProcessCount: {},
+}
+
+func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) {
+ if newLim == nil {
+ return t.ThreadGroup().Limits().Get(resource), nil
+ }
+
+ if _, ok := setableLimits[resource]; !ok {
+ return limits.Limit{}, syserror.EPERM
+ }
+
+ // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE
+ // capability in the initial user namespace) may make arbitrary changes
+ // to either limit value."
+ privileged := t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.Kernel().RootUserNamespace())
+
+ oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim, privileged)
+ if err != nil {
+ return limits.Limit{}, err
+ }
+
+ if resource == limits.CPU {
+ t.NotifyRlimitCPUUpdated()
+ }
+ return oldLim, nil
+}
+
+// Getrlimit implements linux syscall getrlimit(2).
+func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ resource, ok := limits.FromLinuxResource[int(args[0].Int())]
+ if !ok {
+ // Return err; unknown limit.
+ return 0, nil, syserror.EINVAL
+ }
+ addr := args[1].Pointer()
+ rlim, err := newRlimit(t)
+ if err != nil {
+ return 0, nil, err
+ }
+ lim, err := prlimit64(t, resource, nil)
+ if err != nil {
+ return 0, nil, err
+ }
+ rlim.fromLimit(lim)
+ return 0, nil, rlim.copyOut(t, addr)
+}
+
+// Setrlimit implements linux syscall setrlimit(2).
+func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ resource, ok := limits.FromLinuxResource[int(args[0].Int())]
+ if !ok {
+ // Return err; unknown limit.
+ return 0, nil, syserror.EINVAL
+ }
+ addr := args[1].Pointer()
+ rlim, err := newRlimit(t)
+ if err != nil {
+ return 0, nil, err
+ }
+ if err := rlim.copyIn(t, addr); err != nil {
+ return 0, nil, syserror.EFAULT
+ }
+ _, err = prlimit64(t, resource, rlim.toLimit())
+ return 0, nil, err
+}
+
+// Prlimit64 implements linux syscall prlimit64(2).
+func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tid := kernel.ThreadID(args[0].Int())
+ resource, ok := limits.FromLinuxResource[int(args[1].Int())]
+ if !ok {
+ // Return err; unknown limit.
+ return 0, nil, syserror.EINVAL
+ }
+ newRlimAddr := args[2].Pointer()
+ oldRlimAddr := args[3].Pointer()
+
+ var newLim *limits.Limit
+ if newRlimAddr != 0 {
+ var nrl rlimit64
+ if err := nrl.copyIn(t, newRlimAddr); err != nil {
+ return 0, nil, syserror.EFAULT
+ }
+ newLim = nrl.toLimit()
+ }
+
+ if tid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ ot := t
+ if tid > 0 {
+ if ot = t.PIDNamespace().TaskWithID(tid); ot == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ }
+
+ // "To set or get the resources of a process other than itself, the caller
+ // must have the CAP_SYS_RESOURCE capability, or the real, effective, and
+ // saved set user IDs of the target process must match the real user ID of
+ // the caller and the real, effective, and saved set group IDs of the
+ // target process must match the real group ID of the caller."
+ if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
+ cred, tcred := t.Credentials(), ot.Credentials()
+ if cred.RealKUID != tcred.RealKUID ||
+ cred.RealKUID != tcred.EffectiveKUID ||
+ cred.RealKUID != tcred.SavedKUID ||
+ cred.RealKGID != tcred.RealKGID ||
+ cred.RealKGID != tcred.EffectiveKGID ||
+ cred.RealKGID != tcred.SavedKGID {
+ return 0, nil, syserror.EPERM
+ }
+ }
+
+ oldLim, err := prlimit64(ot, resource, newLim)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if oldRlimAddr != 0 {
+ if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil {
+ return 0, nil, syserror.EFAULT
+ }
+ }
+
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go
new file mode 100644
index 000000000..90db10ea6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rseq.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// RSeq implements syscall rseq(2).
+func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Uint()
+ flags := args[2].Int()
+ signature := args[3].Uint()
+
+ if !t.RSeqAvailable() {
+ // Event for applications that want rseq on a configuration
+ // that doesn't support them.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ switch flags {
+ case 0:
+ // Register.
+ return 0, nil, t.SetRSeq(addr, length, signature)
+ case linux.RSEQ_FLAG_UNREGISTER:
+ return 0, nil, t.ClearRSeq(addr, length, signature)
+ default:
+ // Unknown flag.
+ return 0, nil, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go
new file mode 100644
index 000000000..1674c7445
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rusage.go
@@ -0,0 +1,112 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+func getrusage(t *kernel.Task, which int32) linux.Rusage {
+ var cs usage.CPUStats
+
+ switch which {
+ case linux.RUSAGE_SELF:
+ cs = t.ThreadGroup().CPUStats()
+
+ case linux.RUSAGE_CHILDREN:
+ cs = t.ThreadGroup().JoinedChildCPUStats()
+
+ case linux.RUSAGE_THREAD:
+ cs = t.CPUStats()
+
+ case linux.RUSAGE_BOTH:
+ tg := t.ThreadGroup()
+ cs = tg.CPUStats()
+ cs.Accumulate(tg.JoinedChildCPUStats())
+ }
+
+ return linux.Rusage{
+ UTime: linux.NsecToTimeval(cs.UserTime.Nanoseconds()),
+ STime: linux.NsecToTimeval(cs.SysTime.Nanoseconds()),
+ NVCSw: int64(cs.VoluntarySwitches),
+ MaxRSS: int64(t.MaxRSS(which) / 1024),
+ }
+}
+
+// Getrusage implements linux syscall getrusage(2).
+// marked "y" are supported now
+// marked "*" are not used on Linux
+// marked "p" are pending for support
+//
+// y struct timeval ru_utime; /* user CPU time used */
+// y struct timeval ru_stime; /* system CPU time used */
+// p long ru_maxrss; /* maximum resident set size */
+// * long ru_ixrss; /* integral shared memory size */
+// * long ru_idrss; /* integral unshared data size */
+// * long ru_isrss; /* integral unshared stack size */
+// p long ru_minflt; /* page reclaims (soft page faults) */
+// p long ru_majflt; /* page faults (hard page faults) */
+// * long ru_nswap; /* swaps */
+// p long ru_inblock; /* block input operations */
+// p long ru_oublock; /* block output operations */
+// * long ru_msgsnd; /* IPC messages sent */
+// * long ru_msgrcv; /* IPC messages received */
+// * long ru_nsignals; /* signals received */
+// y long ru_nvcsw; /* voluntary context switches */
+// y long ru_nivcsw; /* involuntary context switches */
+func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ which := args[0].Int()
+ addr := args[1].Pointer()
+
+ if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ru := getrusage(t, which)
+ _, err := t.CopyOut(addr, &ru)
+ return 0, nil, err
+}
+
+// Times implements linux syscall times(2).
+func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ // Calculate the ticks first, and figure out if any additional work is
+ // necessary. Linux allows for a NULL addr, in which case only the
+ // return value is meaningful. We don't need to do anything else.
+ ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds())
+ if addr == 0 {
+ return ticks, nil, nil
+ }
+
+ cs1 := t.ThreadGroup().CPUStats()
+ cs2 := t.ThreadGroup().JoinedChildCPUStats()
+ r := linux.Tms{
+ UTime: linux.ClockTFromDuration(cs1.UserTime),
+ STime: linux.ClockTFromDuration(cs1.SysTime),
+ CUTime: linux.ClockTFromDuration(cs2.UserTime),
+ CSTime: linux.ClockTFromDuration(cs2.SysTime),
+ }
+ if _, err := t.CopyOut(addr, &r); err != nil {
+ return 0, nil, err
+ }
+
+ return ticks, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
new file mode 100644
index 000000000..99f6993f5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -0,0 +1,99 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ onlyScheduler = linux.SCHED_NORMAL
+ onlyPriority = 0
+)
+
+// SchedParam replicates struct sched_param in sched.h.
+type SchedParam struct {
+ schedPriority int64
+}
+
+// SchedGetparam implements linux syscall sched_getparam(2).
+func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := args[0].Int()
+ param := args[1].Pointer()
+ if param == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if pid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ r := SchedParam{schedPriority: onlyPriority}
+ if _, err := t.CopyOut(param, r); err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// SchedGetscheduler implements linux syscall sched_getscheduler(2).
+func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := args[0].Int()
+ if pid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ return onlyScheduler, nil, nil
+}
+
+// SchedSetscheduler implements linux syscall sched_setscheduler(2).
+func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := args[0].Int()
+ policy := args[1].Int()
+ param := args[2].Pointer()
+ if pid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if policy != onlyScheduler {
+ return 0, nil, syserror.EINVAL
+ }
+ if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ var r SchedParam
+ if _, err := t.CopyIn(param, &r); err != nil {
+ return 0, nil, syserror.EINVAL
+ }
+ if r.schedPriority != onlyPriority {
+ return 0, nil, syserror.EINVAL
+ }
+ return 0, nil, nil
+}
+
+// SchedGetPriorityMax implements linux syscall sched_get_priority_max(2).
+func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return onlyPriority, nil, nil
+}
+
+// SchedGetPriorityMin implements linux syscall sched_get_priority_min(2).
+func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return onlyPriority, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
new file mode 100644
index 000000000..5b7a66f4d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
+type userSockFprog struct {
+ // Len is the length of the filter in BPF instructions.
+ Len uint16
+
+ _ [6]byte // padding for alignment
+
+ // Filter is a user pointer to the struct sock_filter array that makes up
+ // the filter program. Filter is a uint64 rather than a usermem.Addr
+ // because usermem.Addr is actually uintptr, which is not a fixed-size
+ // type, and encoding/binary.Read objects to this.
+ Filter uint64
+}
+
+// seccomp applies a seccomp policy to the current task.
+func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
+ // We only support SECCOMP_SET_MODE_FILTER at the moment.
+ if mode != linux.SECCOMP_SET_MODE_FILTER {
+ // Unsupported mode.
+ return syserror.EINVAL
+ }
+
+ tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0
+
+ // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC.
+ if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 {
+ // Unsupported flag.
+ return syserror.EINVAL
+ }
+
+ var fprog userSockFprog
+ if _, err := t.CopyIn(addr, &fprog); err != nil {
+ return err
+ }
+ filter := make([]linux.BPFInstruction, int(fprog.Len))
+ if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
+ return err
+ }
+ compiledFilter, err := bpf.Compile(filter)
+ if err != nil {
+ t.Debugf("Invalid seccomp-bpf filter: %v", err)
+ return syserror.EINVAL
+ }
+
+ return t.AppendSyscallFilter(compiledFilter, tsync)
+}
+
+// Seccomp implements linux syscall seccomp(2).
+func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer())
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
new file mode 100644
index 000000000..5f54f2456
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -0,0 +1,241 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "math"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const opsMax = 500 // SEMOPM
+
+// Semget handles: semget(key_t key, int nsems, int semflg)
+func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ key := args[0].Int()
+ nsems := args[1].Int()
+ flag := args[2].Int()
+
+ private := key == linux.IPC_PRIVATE
+ create := flag&linux.IPC_CREAT == linux.IPC_CREAT
+ exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL
+ mode := linux.FileMode(flag & 0777)
+
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(set.ID), nil, nil
+}
+
+// Semop handles: semop(int semid, struct sembuf *sops, size_t nsops)
+func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Int()
+ sembufAddr := args[1].Pointer()
+ nsops := args[2].SizeT()
+
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return 0, nil, syserror.EINVAL
+ }
+ if nsops <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if nsops > opsMax {
+ return 0, nil, syserror.E2BIG
+ }
+
+ ops := make([]linux.Sembuf, nsops)
+ if _, err := t.CopyIn(sembufAddr, ops); err != nil {
+ return 0, nil, err
+ }
+
+ creds := auth.CredentialsFromContext(t)
+ pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
+ for {
+ ch, num, err := set.ExecuteOps(t, ops, creds, int32(pid))
+ if ch == nil || err != nil {
+ // We're done (either on success or a failure).
+ return 0, nil, err
+ }
+ if err = t.Block(ch); err != nil {
+ set.AbortWait(num, ch)
+ return 0, nil, err
+ }
+ }
+}
+
+// Semctl handles: semctl(int semid, int semnum, int cmd, ...)
+func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Int()
+ num := args[1].Int()
+ cmd := args[2].Int()
+
+ switch cmd {
+ case linux.SETVAL:
+ val := args[3].Int()
+ if val > math.MaxInt16 {
+ return 0, nil, syserror.ERANGE
+ }
+ return 0, nil, setVal(t, id, num, int16(val))
+
+ case linux.SETALL:
+ array := args[3].Pointer()
+ return 0, nil, setValAll(t, id, array)
+
+ case linux.GETVAL:
+ v, err := getVal(t, id, num)
+ return uintptr(v), nil, err
+
+ case linux.GETALL:
+ array := args[3].Pointer()
+ return 0, nil, getValAll(t, id, array)
+
+ case linux.IPC_RMID:
+ return 0, nil, remove(t, id)
+
+ case linux.IPC_SET:
+ arg := args[3].Pointer()
+ s := linux.SemidDS{}
+ if _, err := t.CopyIn(arg, &s); err != nil {
+ return 0, nil, err
+ }
+
+ perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777))
+ return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms)
+
+ case linux.GETPID:
+ v, err := getPID(t, id, num)
+ return uintptr(v), nil, err
+
+ case linux.IPC_INFO,
+ linux.SEM_INFO,
+ linux.IPC_STAT,
+ linux.SEM_STAT,
+ linux.SEM_STAT_ANY,
+ linux.GETNCNT,
+ linux.GETZCNT:
+
+ t.Kernel().EmitUnimplementedEvent(t)
+ fallthrough
+
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+func remove(t *kernel.Task, id int32) error {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ creds := auth.CredentialsFromContext(t)
+ return r.RemoveID(id, creds)
+}
+
+func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return syserror.EINVAL
+ }
+
+ creds := auth.CredentialsFromContext(t)
+ kuid := creds.UserNamespace.MapToKUID(uid)
+ if !kuid.Ok() {
+ return syserror.EINVAL
+ }
+ kgid := creds.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+ owner := fs.FileOwner{UID: kuid, GID: kgid}
+ return set.Change(t, creds, owner, perms)
+}
+
+func setVal(t *kernel.Task, id int32, num int32, val int16) error {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
+ return set.SetVal(t, num, val, creds, int32(pid))
+}
+
+func setValAll(t *kernel.Task, id int32, array usermem.Addr) error {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return syserror.EINVAL
+ }
+ vals := make([]uint16, set.Size())
+ if _, err := t.CopyIn(array, vals); err != nil {
+ return err
+ }
+ creds := auth.CredentialsFromContext(t)
+ pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
+ return set.SetValAll(t, vals, creds, int32(pid))
+}
+
+func getVal(t *kernel.Task, id int32, num int32) (int16, error) {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return 0, syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ return set.GetVal(num, creds)
+}
+
+func getValAll(t *kernel.Task, id int32, array usermem.Addr) error {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ vals, err := set.GetValAll(creds)
+ if err != nil {
+ return err
+ }
+ _, err = t.CopyOut(array, vals)
+ return err
+}
+
+func getPID(t *kernel.Task, id int32, num int32) (int32, error) {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return 0, syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ gpid, err := set.GetPID(num, creds)
+ if err != nil {
+ return 0, err
+ }
+ // Convert pid from init namespace to the caller's namespace.
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid))
+ if tg == nil {
+ return 0, nil
+ }
+ return int32(tg.ID()), nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
new file mode 100644
index 000000000..4a8bc24a2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/shm"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Shmget implements shmget(2).
+func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ key := shm.Key(args[0].Int())
+ size := uint64(args[1].SizeT())
+ flag := args[2].Int()
+
+ private := key == linux.IPC_PRIVATE
+ create := flag&linux.IPC_CREAT == linux.IPC_CREAT
+ exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL
+ mode := linux.FileMode(flag & 0777)
+
+ pid := int32(t.ThreadGroup().ID())
+ r := t.IPCNamespace().ShmRegistry()
+ segment, err := r.FindOrCreate(t, pid, key, size, mode, private, create, exclusive)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer segment.DecRef()
+ return uintptr(segment.ID), nil, nil
+}
+
+// findSegment retrives a shm segment by the given id.
+//
+// findSegment returns a reference on Shm.
+func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) {
+ r := t.IPCNamespace().ShmRegistry()
+ segment := r.FindByID(id)
+ if segment == nil {
+ // No segment with provided id.
+ return nil, syserror.EINVAL
+ }
+ return segment, nil
+}
+
+// Shmat implements shmat(2).
+func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := shm.ID(args[0].Int())
+ addr := args[1].Pointer()
+ flag := args[2].Int()
+
+ segment, err := findSegment(t, id)
+ if err != nil {
+ return 0, nil, syserror.EINVAL
+ }
+ defer segment.DecRef()
+
+ opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
+ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC,
+ Readonly: flag&linux.SHM_RDONLY == linux.SHM_RDONLY,
+ Remap: flag&linux.SHM_REMAP == linux.SHM_REMAP,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ addr, err = t.MemoryManager().MMap(t, opts)
+ return uintptr(addr), nil, err
+}
+
+// Shmdt implements shmdt(2).
+func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ err := t.MemoryManager().DetachShm(t, addr)
+ return 0, nil, err
+}
+
+// Shmctl implements shmctl(2).
+func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := shm.ID(args[0].Int())
+ cmd := args[1].Int()
+ buf := args[2].Pointer()
+
+ r := t.IPCNamespace().ShmRegistry()
+
+ switch cmd {
+ case linux.SHM_STAT:
+ // Technically, we should be treating id as "an index into the kernel's
+ // internal array that maintains information about all shared memory
+ // segments on the system". Since we don't track segments in an array,
+ // we'll just pretend the shmid is the index and do the same thing as
+ // IPC_STAT. Linux also uses the index as the shmid.
+ fallthrough
+ case linux.IPC_STAT:
+ segment, err := findSegment(t, id)
+ if err != nil {
+ return 0, nil, syserror.EINVAL
+ }
+ defer segment.DecRef()
+
+ stat, err := segment.IPCStat(t)
+ if err == nil {
+ _, err = t.CopyOut(buf, stat)
+ }
+ return 0, nil, err
+
+ case linux.IPC_INFO:
+ params := r.IPCInfo()
+ _, err := t.CopyOut(buf, params)
+ return 0, nil, err
+
+ case linux.SHM_INFO:
+ info := r.ShmInfo()
+ _, err := t.CopyOut(buf, info)
+ return 0, nil, err
+ }
+
+ // Remaining commands refer to a specific segment.
+ segment, err := findSegment(t, id)
+ if err != nil {
+ return 0, nil, syserror.EINVAL
+ }
+ defer segment.DecRef()
+
+ switch cmd {
+ case linux.IPC_SET:
+ var ds linux.ShmidDS
+ _, err = t.CopyIn(buf, &ds)
+ if err != nil {
+ return 0, nil, err
+ }
+ err = segment.Set(t, &ds)
+ return 0, nil, err
+
+ case linux.IPC_RMID:
+ segment.MarkDestroyed()
+ return 0, nil, nil
+
+ case linux.SHM_LOCK, linux.SHM_UNLOCK:
+ // We currently do not support memory locking anywhere.
+ // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the
+ // same here.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, nil
+
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
new file mode 100644
index 000000000..d2b0012ae
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -0,0 +1,590 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "math"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// "For a process to have permission to send a signal it must
+// - either be privileged (CAP_KILL), or
+// - the real or effective user ID of the sending process must be equal to the
+// real or saved set-user-ID of the target process.
+//
+// In the case of SIGCONT it suffices when the sending and receiving processes
+// belong to the same session." - kill(2)
+//
+// Equivalent to kernel/signal.c:check_kill_permission.
+func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool {
+ // kernel/signal.c:check_kill_permission also allows a signal if the
+ // sending and receiving tasks share a thread group, which is not
+ // mentioned in kill(2) since kill does not allow task-level
+ // granularity in signal sending.
+ if t.ThreadGroup() == target.ThreadGroup() {
+ return true
+ }
+
+ if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) {
+ return true
+ }
+
+ creds := t.Credentials()
+ tcreds := target.Credentials()
+ if creds.EffectiveKUID == tcreds.SavedKUID ||
+ creds.EffectiveKUID == tcreds.RealKUID ||
+ creds.RealKUID == tcreds.SavedKUID ||
+ creds.RealKUID == tcreds.RealKUID {
+ return true
+ }
+
+ if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() {
+ return true
+ }
+ return false
+}
+
+// Kill implements linux syscall kill(2).
+func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := kernel.ThreadID(args[0].Int())
+ sig := linux.Signal(args[1].Int())
+
+ switch {
+ case pid > 0:
+ // "If pid is positive, then signal sig is sent to the process with the
+ // ID specified by pid." - kill(2)
+ // This loops to handle races with execve where target dies between
+ // TaskWithID and SendGroupSignal. Compare Linux's
+ // kernel/signal.c:kill_pid_info().
+ for {
+ target := t.PIDNamespace().TaskWithID(pid)
+ if target == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ if !mayKill(t, target, sig) {
+ return 0, nil, syserror.EPERM
+ }
+ info := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoUser,
+ }
+ info.SetPid(int32(target.PIDNamespace().IDOfTask(t)))
+ info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow()))
+ if err := target.SendGroupSignal(info); err != syserror.ESRCH {
+ return 0, nil, err
+ }
+ }
+ case pid == -1:
+ // "If pid equals -1, then sig is sent to every process for which the
+ // calling process has permission to send signals, except for process 1
+ // (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig)
+ // send sig to all processes that the calling process may send signals
+ // to, except possibly for some implementation-defined system
+ // processes. Linux allows a process to signal itself, but on Linux the
+ // call kill(-1,sig) does not signal the calling process."
+ var (
+ lastErr error
+ delivered int
+ )
+ for _, tg := range t.PIDNamespace().ThreadGroups() {
+ if tg == t.ThreadGroup() {
+ continue
+ }
+ if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID {
+ continue
+ }
+
+ // If pid == -1, the returned error is the last non-EPERM error
+ // from any call to group_send_sig_info.
+ if !mayKill(t, tg.Leader(), sig) {
+ continue
+ }
+ // Here and below, whether or not kill returns an error may
+ // depend on the iteration order. We at least implement the
+ // semantics documented by the man page: "On success (at least
+ // one signal was sent), zero is returned."
+ info := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoUser,
+ }
+ info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
+ info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
+ err := tg.SendSignal(info)
+ if err == syserror.ESRCH {
+ // ESRCH is ignored because it means the task
+ // exited while we were iterating. This is a
+ // race which would not normally exist on
+ // Linux, so we suppress it.
+ continue
+ }
+ delivered++
+ if err != nil {
+ lastErr = err
+ }
+ }
+ if delivered > 0 {
+ return 0, nil, lastErr
+ }
+ return 0, nil, syserror.ESRCH
+ default:
+ // "If pid equals 0, then sig is sent to every process in the process
+ // group of the calling process."
+ //
+ // "If pid is less than -1, then sig is sent to every process
+ // in the process group whose ID is -pid."
+ pgid := kernel.ProcessGroupID(-pid)
+ if pgid == 0 {
+ pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
+ }
+
+ // If pid != -1 (i.e. signalling a process group), the returned error
+ // is the last error from any call to group_send_sig_info.
+ lastErr := syserror.ESRCH
+ for _, tg := range t.PIDNamespace().ThreadGroups() {
+ if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
+ if !mayKill(t, tg.Leader(), sig) {
+ lastErr = syserror.EPERM
+ continue
+ }
+
+ info := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoUser,
+ }
+ info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
+ info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
+ // See note above regarding ESRCH race above.
+ if err := tg.SendSignal(info); err != syserror.ESRCH {
+ lastErr = err
+ }
+ }
+ }
+
+ return 0, nil, lastErr
+ }
+}
+
+func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo {
+ info := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoTkill,
+ }
+ info.SetPid(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup())))
+ info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+ return info
+}
+
+// Tkill implements linux syscall tkill(2).
+func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tid := kernel.ThreadID(args[0].Int())
+ sig := linux.Signal(args[1].Int())
+
+ // N.B. Inconsistent with man page, linux actually rejects calls with
+ // tid <=0 by EINVAL. This isn't the same for all signal calls.
+ if tid <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ target := t.PIDNamespace().TaskWithID(tid)
+ if target == nil {
+ return 0, nil, syserror.ESRCH
+ }
+
+ if !mayKill(t, target, sig) {
+ return 0, nil, syserror.EPERM
+ }
+ return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig))
+}
+
+// Tgkill implements linux syscall tgkill(2).
+func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tgid := kernel.ThreadID(args[0].Int())
+ tid := kernel.ThreadID(args[1].Int())
+ sig := linux.Signal(args[2].Int())
+
+ // N.B. Inconsistent with man page, linux actually rejects calls with
+ // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls.
+ if tgid <= 0 || tid <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ targetTG := t.PIDNamespace().ThreadGroupWithID(tgid)
+ target := t.PIDNamespace().TaskWithID(tid)
+ if targetTG == nil || target == nil || target.ThreadGroup() != targetTG {
+ return 0, nil, syserror.ESRCH
+ }
+
+ if !mayKill(t, target, sig) {
+ return 0, nil, syserror.EPERM
+ }
+ return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig))
+}
+
+// RtSigaction implements linux syscall rt_sigaction(2).
+func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sig := linux.Signal(args[0].Int())
+ newactarg := args[1].Pointer()
+ oldactarg := args[2].Pointer()
+ sigsetsize := args[3].SizeT()
+
+ if sigsetsize != linux.SignalSetSize {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var newactptr *arch.SignalAct
+ if newactarg != 0 {
+ newact, err := t.CopyInSignalAct(newactarg)
+ if err != nil {
+ return 0, nil, err
+ }
+ newactptr = &newact
+ }
+ oldact, err := t.ThreadGroup().SetSignalAct(sig, newactptr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if oldactarg != 0 {
+ if err := t.CopyOutSignalAct(oldactarg, &oldact); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// Sigreturn implements linux syscall sigreturn(2).
+func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ ctrl, err := t.SignalReturn(false)
+ return 0, ctrl, err
+}
+
+// RtSigreturn implements linux syscall rt_sigreturn(2).
+func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ ctrl, err := t.SignalReturn(true)
+ return 0, ctrl, err
+}
+
+// RtSigprocmask implements linux syscall rt_sigprocmask(2).
+func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ how := args[0].Int()
+ setaddr := args[1].Pointer()
+ oldaddr := args[2].Pointer()
+ sigsetsize := args[3].SizeT()
+
+ if sigsetsize != linux.SignalSetSize {
+ return 0, nil, syserror.EINVAL
+ }
+ oldmask := t.SignalMask()
+ if setaddr != 0 {
+ mask, err := CopyInSigSet(t, setaddr, sigsetsize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ switch how {
+ case linux.SIG_BLOCK:
+ t.SetSignalMask(oldmask | mask)
+ case linux.SIG_UNBLOCK:
+ t.SetSignalMask(oldmask &^ mask)
+ case linux.SIG_SETMASK:
+ t.SetSignalMask(mask)
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+ }
+ if oldaddr != 0 {
+ return 0, nil, copyOutSigSet(t, oldaddr, oldmask)
+ }
+
+ return 0, nil, nil
+}
+
+// Sigaltstack implements linux syscall sigaltstack(2).
+func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ setaddr := args[0].Pointer()
+ oldaddr := args[1].Pointer()
+
+ alt := t.SignalStack()
+ if oldaddr != 0 {
+ if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil {
+ return 0, nil, err
+ }
+ }
+ if setaddr != 0 {
+ alt, err := t.CopyInSignalStack(setaddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ // The signal stack cannot be changed if the task is currently
+ // on the stack. This is enforced at the lowest level because
+ // these semantics apply to changing the signal stack via a
+ // ucontext during a signal handler.
+ if !t.SetSignalStack(alt) {
+ return 0, nil, syserror.EPERM
+ }
+ }
+
+ return 0, nil, nil
+}
+
+// Pause implements linux syscall pause(2).
+func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+}
+
+// RtSigpending implements linux syscall rt_sigpending(2).
+func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ pending := t.PendingSignals()
+ _, err := pending.CopyOut(t, addr)
+ return 0, nil, err
+}
+
+// RtSigtimedwait implements linux syscall rt_sigtimedwait(2).
+func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sigset := args[0].Pointer()
+ siginfo := args[1].Pointer()
+ timespec := args[2].Pointer()
+ sigsetsize := args[3].SizeT()
+
+ mask, err := CopyInSigSet(t, sigset, sigsetsize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var timeout time.Duration
+ if timespec != 0 {
+ d, err := copyTimespecIn(t, timespec)
+ if err != nil {
+ return 0, nil, err
+ }
+ if !d.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+ timeout = time.Duration(d.ToNsecCapped())
+ } else {
+ timeout = time.Duration(math.MaxInt64)
+ }
+
+ si, err := t.Sigtimedwait(mask, timeout)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if siginfo != 0 {
+ si.FixSignalCodeForUser()
+ if _, err := si.CopyOut(t, siginfo); err != nil {
+ return 0, nil, err
+ }
+ }
+ return uintptr(si.Signo), nil, nil
+}
+
+// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2).
+func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := kernel.ThreadID(args[0].Int())
+ sig := linux.Signal(args[1].Int())
+ infoAddr := args[2].Pointer()
+
+ // Copy in the info.
+ //
+ // We must ensure that the Signo is set (Linux overrides this in the
+ // same way), and that the code is in the allowed set. This same logic
+ // appears below in RtSigtgqueueinfo and should be kept in sync.
+ var info arch.SignalInfo
+ if _, err := info.CopyIn(t, infoAddr); err != nil {
+ return 0, nil, err
+ }
+ info.Signo = int32(sig)
+
+ // This must loop to handle the race with execve described in Kill.
+ for {
+ // Deliver to the given task's thread group.
+ target := t.PIDNamespace().TaskWithID(pid)
+ if target == nil {
+ return 0, nil, syserror.ESRCH
+ }
+
+ // If the sender is not the receiver, it can't use si_codes used by the
+ // kernel or SI_TKILL.
+ if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t {
+ return 0, nil, syserror.EPERM
+ }
+
+ if !mayKill(t, target, sig) {
+ return 0, nil, syserror.EPERM
+ }
+
+ if err := target.SendGroupSignal(&info); err != syserror.ESRCH {
+ return 0, nil, err
+ }
+ }
+}
+
+// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2).
+func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tgid := kernel.ThreadID(args[0].Int())
+ tid := kernel.ThreadID(args[1].Int())
+ sig := linux.Signal(args[2].Int())
+ infoAddr := args[3].Pointer()
+
+ // N.B. Inconsistent with man page, linux actually rejects calls with
+ // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls.
+ if tgid <= 0 || tid <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy in the info. See RtSigqueueinfo above.
+ var info arch.SignalInfo
+ if _, err := info.CopyIn(t, infoAddr); err != nil {
+ return 0, nil, err
+ }
+ info.Signo = int32(sig)
+
+ // Deliver to the given task.
+ targetTG := t.PIDNamespace().ThreadGroupWithID(tgid)
+ target := t.PIDNamespace().TaskWithID(tid)
+ if targetTG == nil || target == nil || target.ThreadGroup() != targetTG {
+ return 0, nil, syserror.ESRCH
+ }
+
+ // If the sender is not the receiver, it can't use si_codes used by the
+ // kernel or SI_TKILL.
+ if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t {
+ return 0, nil, syserror.EPERM
+ }
+
+ if !mayKill(t, target, sig) {
+ return 0, nil, syserror.EPERM
+ }
+ return 0, nil, target.SendSignal(&info)
+}
+
+// RtSigsuspend implements linux syscall rt_sigsuspend(2).
+func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sigset := args[0].Pointer()
+
+ // Copy in the signal mask.
+ var mask linux.SignalSet
+ if _, err := mask.CopyIn(t, sigset); err != nil {
+ return 0, nil, err
+ }
+ mask &^= kernel.UnblockableSignals
+
+ // Swap the mask.
+ oldmask := t.SignalMask()
+ t.SetSignalMask(mask)
+ t.SetSavedSignalMask(oldmask)
+
+ // Perform the wait.
+ return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+}
+
+// RestartSyscall implements the linux syscall restart_syscall(2).
+func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ if r := t.SyscallRestartBlock(); r != nil {
+ n, err := r.Restart(t)
+ return n, nil, err
+ }
+ // The restart block should never be nil here, but it's possible
+ // ERESTART_RESTARTBLOCK was set by ptrace without the current syscall
+ // setting up a restart block. If ptrace didn't manipulate the return value,
+ // finding a nil restart block is a bug. Linux ensures that the restart
+ // function is never null by (re)initializing it with one that translates
+ // the restart into EINTR. We'll emulate that behaviour.
+ t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?")
+ return 0, nil, syserror.EINTR
+}
+
+// sharedSignalfd is shared between the two calls.
+func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
+ // Copy in the signal mask.
+ mask, err := CopyInSigSet(t, sigset, sigsetsize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Always check for valid flags, even if not creating.
+ if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is this a change to an existing signalfd?
+ //
+ // The spec indicates that this should adjust the mask.
+ if fd != -1 {
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Is this a signalfd?
+ if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok {
+ s.SetMask(mask)
+ return 0, nil, nil
+ }
+
+ // Not a signalfd.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Create a new file.
+ file, err := signalfd.New(t, mask)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+
+ // Set appropriate flags.
+ file.SetFlags(fs.SettableFileFlags{
+ NonBlocking: flags&linux.SFD_NONBLOCK != 0,
+ })
+
+ // Create a new descriptor.
+ fd, err = t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Done.
+ return uintptr(fd), nil, nil
+}
+
+// Signalfd implements the linux syscall signalfd(2).
+func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ sigset := args[1].Pointer()
+ sigsetsize := args[2].SizeT()
+ return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
+}
+
+// Signalfd4 implements the linux syscall signalfd4(2).
+func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ sigset := args[1].Pointer()
+ sigsetsize := args[2].SizeT()
+ flags := args[3].Int()
+ return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
new file mode 100644
index 000000000..0760af77b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -0,0 +1,1138 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/control"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers upto INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset form the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset form the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+ // Name is the optional pointer to a network address buffer.
+ Name uint64
+
+ // NameLen is the length of the buffer pointed to by Name.
+ NameLen uint32
+ _ uint32
+
+ // Iov is a pointer to an array of io vectors that describe the memory
+ // locations involved in the io operation.
+ Iov uint64
+
+ // IovLen is the length of the array pointed to by Iov.
+ IovLen uint64
+
+ // Control is the optional pointer to ancillary control data.
+ Control uint64
+
+ // ControlLen is the length of the data pointed to by Control.
+ ControlLen uint64
+
+ // Flags on the sent/received message.
+ Flags int32
+ _ int32
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+ msgHdr MessageHeader64
+ msgLen uint32
+ _ int32
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+ b := t.CopyScratchBuffer(52)
+ if _, err := t.CopyInBytes(addr, b); err != nil {
+ return err
+ }
+
+ msg.Name = usermem.ByteOrder.Uint64(b[0:])
+ msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
+ msg.Iov = usermem.ByteOrder.Uint64(b[16:])
+ msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
+ msg.Control = usermem.ByteOrder.Uint64(b[32:])
+ msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
+ msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
+
+ return nil
+}
+
+// CaptureAddress allocates memory for and copies a socket address structure
+// from the untrusted address space range.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+ if addrlen > maxAddrLen {
+ return nil, syserror.EINVAL
+ }
+
+ addrBuf := make([]byte, addrlen)
+ if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
+ return nil, err
+ }
+
+ return addrBuf, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the unstrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+ // Get the buffer length.
+ var bufLen uint32
+ if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ return err
+ }
+
+ if int32(bufLen) < 0 {
+ return syserror.EINVAL
+ }
+
+ // Write the length unconditionally.
+ if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ return err
+ }
+
+ if addr == nil {
+ return nil
+ }
+
+ if bufLen > addrLen {
+ bufLen = addrLen
+ }
+
+ // Copy as much of the address as will fit in the buffer.
+ encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ if bufLen > uint32(len(encodedAddr)) {
+ bufLen = uint32(len(encodedAddr))
+ }
+ _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
+ return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ domain := int(args[0].Int())
+ stype := args[1].Int()
+ protocol := int(args[2].Int())
+
+ // Check and initialize the flags.
+ if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Create the new socket.
+ s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol)
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+ s.SetFlags(fs.SettableFileFlags{
+ NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
+ })
+ defer s.DecRef()
+
+ fd, err := t.NewFDFrom(0, s, kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ domain := int(args[0].Int())
+ stype := args[1].Int()
+ protocol := int(args[2].Int())
+ socks := args[3].Pointer()
+
+ // Check and initialize the flags.
+ if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ fileFlags := fs.SettableFileFlags{
+ NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
+ }
+
+ // Create the socket pair.
+ s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol)
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+ s1.SetFlags(fileFlags)
+ s2.SetFlags(fileFlags)
+ defer s1.DecRef()
+ defer s2.DecRef()
+
+ // Create the FDs for the sockets.
+ fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Copy the file descriptors out.
+ if _, err := t.CopyOut(socks, fds); err != nil {
+ for _, fd := range fds {
+ if file, _ := t.FDTable().Remove(fd); file != nil {
+ file.DecRef()
+ }
+ }
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Uint()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Capture address and call syscall implementation.
+ a, err := CaptureAddress(t, addr, addrlen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ blocking := !file.Flags().NonBlocking
+ return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+ // Check that no unsupported flags are passed in.
+ if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ // Call the syscall implementation for this socket, then copy the
+ // output address if one is specified.
+ blocking := !file.Flags().NonBlocking
+
+ peerRequested := addrLen != 0
+ nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+ if peerRequested {
+ // NOTE(magi): Linux does not give you an error if it can't
+ // write the data back out so neither do we.
+ if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+ return 0, err
+ }
+ }
+ return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+ flags := int(args[3].Int())
+
+ n, err := accept(t, fd, addr, addrlen, flags)
+ return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ n, err := accept(t, fd, addr, addrlen, 0)
+ return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Uint()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Capture address and call syscall implementation.
+ a, err := CaptureAddress(t, addr, addrlen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ backlog := args[1].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Per Linux, the backlog is silently capped to reasonable values.
+ if backlog <= 0 {
+ backlog = minListenBacklog
+ }
+ if backlog > maxListenBacklog {
+ backlog = maxListenBacklog
+ }
+
+ return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ how := args[1].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Validate how, then call syscall implementation.
+ switch how {
+ case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ level := args[1].Int()
+ name := args[2].Int()
+ optValAddr := args[3].Pointer()
+ optLenAddr := args[4].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Read the length. Reject negative values.
+ optLen := int32(0)
+ if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ return 0, nil, err
+ }
+ if optLen < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Call syscall implementation then copy both value and value len out.
+ v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+
+ vLen := int32(binary.Size(v))
+ if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ return 0, nil, err
+ }
+
+ if v != nil {
+ if _, err := t.CopyOut(optValAddr, v); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ return 0, nil, nil
+}
+
+// getSockOpt tries to handle common socket options, or dispatches to a specific
+// socket implementation.
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+ if level == linux.SOL_SOCKET {
+ switch name {
+ case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+ if len < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+ }
+
+ switch name {
+ case linux.SO_TYPE:
+ _, skType, _ := s.Type()
+ return int32(skType), nil
+ case linux.SO_DOMAIN:
+ family, _, _ := s.Type()
+ return int32(family), nil
+ case linux.SO_PROTOCOL:
+ _, _, protocol := s.Type()
+ return int32(protocol), nil
+ }
+ }
+
+ return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ level := args[1].Int()
+ name := args[2].Int()
+ optValAddr := args[3].Pointer()
+ optLen := args[4].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ if optLen < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if optLen > maxOptLen {
+ return 0, nil, syserror.EINVAL
+ }
+ buf := t.CopyScratchBuffer(int(optLen))
+ if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ return 0, nil, err
+ }
+
+ // Call syscall implementation.
+ if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Get the socket name and copy it to the caller.
+ v, vl, err := s.GetSockName(t)
+ if err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Get the socket peer name and copy it to the caller.
+ v, vl, err := s.GetPeerName(t)
+ if err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if file.Flags().NonBlocking {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline)
+ return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ vlen := args[2].Uint()
+ flags := args[3].Int()
+ toPtr := args[4].Pointer()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ if file.Flags().NonBlocking {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if toPtr != 0 {
+ ts, err := copyTimespecIn(t, toPtr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if !ts.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+ deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+ haveDeadline = true
+ }
+
+ if !haveDeadline {
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+ }
+
+ var count uint32
+ var err error
+ for i := uint64(0); i < uint64(vlen); i++ {
+ mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ var n uintptr
+ if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
+ break
+ }
+
+ // Copy the received length to the caller.
+ lp, ok := mp.AddLength(messageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ break
+ }
+ count++
+ }
+
+ if count == 0 {
+ return 0, nil, err
+ }
+ return uintptr(count), nil, nil
+}
+
+func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+ // Capture the message header and io vectors.
+ var msg MessageHeader64
+ if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ return 0, err
+ }
+
+ if msg.IovLen > linux.UIO_MAXIOV {
+ return 0, syserror.EMSGSIZE
+ }
+ dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ // FIXME(b/63594852): Pretend we have an empty error queue.
+ if flags&linux.MSG_ERRQUEUE != 0 {
+ return 0, syserror.EAGAIN
+ }
+
+ // Fast path when no control message nor name buffers are provided.
+ if msg.ControlLen == 0 && msg.NameLen == 0 {
+ n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+ if err != nil {
+ return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+ }
+ if !cms.Unix.Empty() {
+ mflags |= linux.MSG_CTRUNC
+ cms.Release()
+ }
+
+ if int(msg.Flags) != mflags {
+ // Copy out the flags to the caller.
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ return 0, err
+ }
+ }
+
+ return uintptr(n), nil
+ }
+
+ if msg.ControlLen > maxControlLen {
+ return 0, syserror.ENOBUFS
+ }
+ n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+ defer cms.Release()
+
+ controlData := make([]byte, 0, msg.ControlLen)
+ controlData = control.PackControlMessages(t, cms, controlData)
+
+ if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+ creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+ controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+ }
+
+ if cms.Unix.Rights != nil {
+ controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+ }
+
+ // Copy the address to the caller.
+ if msg.NameLen != 0 {
+ if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+ return 0, err
+ }
+ }
+
+ // Copy the control data to the caller.
+ if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ return 0, err
+ }
+ if len(controlData) > 0 {
+ if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ return 0, err
+ }
+ }
+
+ // Copy out the flags to the caller.
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ return 0, err
+ }
+
+ return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+ if int(bufLen) < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ if file.Flags().NonBlocking {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+ cm.Release()
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+
+ // Copy the address to the caller.
+ if nameLenPtr != 0 {
+ if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+ return 0, err
+ }
+ }
+
+ return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufPtr := args[1].Pointer()
+ bufLen := args[2].Uint64()
+ flags := args[3].Int()
+ namePtr := args[4].Pointer()
+ nameLenPtr := args[5].Pointer()
+
+ n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+ return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if file.Flags().NonBlocking {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+ return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ vlen := args[2].Uint()
+ flags := args[3].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if file.Flags().NonBlocking {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var count uint32
+ var err error
+ for i := uint64(0); i < uint64(vlen); i++ {
+ mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ var n uintptr
+ if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+ break
+ }
+
+ // Copy the received length to the caller.
+ lp, ok := mp.AddLength(messageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ break
+ }
+ count++
+ }
+
+ if count == 0 {
+ return 0, nil, err
+ }
+ return uintptr(count), nil, nil
+}
+
+func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+ // Capture the message header.
+ var msg MessageHeader64
+ if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ return 0, err
+ }
+
+ var controlData []byte
+ if msg.ControlLen > 0 {
+ // Put an upper bound to prevent large allocations.
+ if msg.ControlLen > maxControlLen {
+ return 0, syserror.ENOBUFS
+ }
+ controlData = make([]byte, msg.ControlLen)
+ if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ return 0, err
+ }
+ }
+
+ // Read the destination address if one is specified.
+ var to []byte
+ if msg.NameLen != 0 {
+ var err error
+ to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+ if err != nil {
+ return 0, err
+ }
+ }
+
+ // Read data then call the sendmsg implementation.
+ if msg.IovLen > linux.UIO_MAXIOV {
+ return 0, syserror.EMSGSIZE
+ }
+ src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ controlMessages, err := control.Parse(t, s, controlData)
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.SendTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Call the syscall implementation.
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+ err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+ if err != nil {
+ controlMessages.Release()
+ }
+ return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+ bl := int(bufLen)
+ if bl < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.FileOperations.(socket.Socket)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ if file.Flags().NonBlocking {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Read the destination address if one is specified.
+ var to []byte
+ var err error
+ if namePtr != 0 {
+ to, err = CaptureAddress(t, namePtr, nameLen)
+ if err != nil {
+ return 0, err
+ }
+ }
+
+ src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.SendTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Call the syscall implementation.
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
+ return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufPtr := args[1].Pointer()
+ bufLen := args[2].Uint64()
+ flags := args[3].Int()
+ namePtr := args[4].Pointer()
+ nameLen := args[5].Uint()
+
+ n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+ return n, nil, err
+}
+
+// LINT.ThenChange(./vfs2/socket.go)
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
new file mode 100644
index 000000000..77c78889d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -0,0 +1,337 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// doSplice implements a blocking splice operation.
+func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+ if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
+ return 0, syserror.EINVAL
+ }
+
+ if opts.Length > int64(kernel.MAX_RW_COUNT) {
+ opts.Length = int64(kernel.MAX_RW_COUNT)
+ }
+
+ var (
+ total int64
+ n int64
+ err error
+ inCh chan struct{}
+ outCh chan struct{}
+ )
+ for opts.Length > 0 {
+ n, err = fs.Splice(t, outFile, inFile, opts)
+ opts.Length -= n
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ } else if err == syserror.ErrWouldBlock && nonBlocking {
+ break
+ }
+
+ // Note that the blocking behavior here is a bit different than the
+ // normal pattern. Because we need to have both data to read and data
+ // to write simultaneously, we actually explicitly block on both of
+ // these cases in turn before returning to the splice operation.
+ if inFile.Readiness(EventMaskRead) == 0 {
+ if inCh == nil {
+ inCh = make(chan struct{}, 1)
+ inW, _ := waiter.NewChannelEntry(inCh)
+ inFile.EventRegister(&inW, EventMaskRead)
+ defer inFile.EventUnregister(&inW)
+ continue // Need to refresh readiness.
+ }
+ if err = t.Block(inCh); err != nil {
+ break
+ }
+ }
+ if outFile.Readiness(EventMaskWrite) == 0 {
+ if outCh == nil {
+ outCh = make(chan struct{}, 1)
+ outW, _ := waiter.NewChannelEntry(outCh)
+ outFile.EventRegister(&outW, EventMaskWrite)
+ defer outFile.EventUnregister(&outW)
+ continue // Need to refresh readiness.
+ }
+ if err = t.Block(outCh); err != nil {
+ break
+ }
+ }
+ }
+
+ if total > 0 {
+ // On Linux, inotify behavior is not very consistent with splice(2). We try
+ // our best to emulate Linux for very basic calls to splice, where for some
+ // reason, events are generated for output files, but not input files.
+ outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+ }
+ return total, err
+}
+
+// Sendfile implements linux system call sendfile(2).
+func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ outFD := args[0].Int()
+ inFD := args[1].Int()
+ offsetAddr := args[2].Pointer()
+ count := int64(args[3].SizeT())
+
+ // Get files.
+ inFile := t.GetFile(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+
+ if !inFile.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
+ outFile := t.GetFile(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ if !outFile.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Verify that the outfile Append flag is not set.
+ if outFile.Flags().Append {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Verify that we have a regular infile. This is a requirement; the
+ // same check appears in Linux (fs/splice.c:splice_direct_to_actor).
+ if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var (
+ n int64
+ err error
+ )
+ if offsetAddr != 0 {
+ // Verify that when offset address is not null, infile must be
+ // seekable. The fs.Splice routine itself validates basic read.
+ if !inFile.Flags().Pread {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Copy in the offset.
+ var offset int64
+ if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+ return 0, nil, err
+ }
+
+ // Do the splice.
+ n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
+ Length: count,
+ SrcOffset: true,
+ SrcStart: offset,
+ }, outFile.Flags().NonBlocking)
+
+ // Copy out the new offset.
+ if _, err := t.CopyOut(offsetAddr, n+offset); err != nil {
+ return 0, nil, err
+ }
+ } else {
+ // Send data using splice.
+ n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
+ Length: count,
+ }, outFile.Flags().NonBlocking)
+ }
+
+ // Sendfile can't lose any data because inFD is always a regual file.
+ if n != 0 {
+ err = nil
+ }
+
+ // We can only pass a single file to handleIOError, so pick inFile
+ // arbitrarily. This is used only for debugging purposes.
+ return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile)
+}
+
+// Splice implements splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ inOffset := args[1].Pointer()
+ outFD := args[2].Int()
+ outOffset := args[3].Pointer()
+ count := int64(args[4].SizeT())
+ flags := args[5].Int()
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get files.
+ outFile := t.GetFile(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ inFile := t.GetFile(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // Construct our options.
+ //
+ // Note that exactly one of the underlying buffers must be a pipe. We
+ // don't actually have this constraint internally, but we enforce it
+ // for the semantics of the call.
+ opts := fs.SpliceOpts{
+ Length: count,
+ }
+ inFileAttr := inFile.Dirent.Inode.StableAttr
+ outFileAttr := outFile.Dirent.Inode.StableAttr
+ switch {
+ case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr):
+ if inOffset != 0 {
+ return 0, nil, syserror.ESPIPE
+ }
+ if outOffset != 0 {
+ if !outFile.Flags().Pwrite {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var offset int64
+ if _, err := t.CopyIn(outOffset, &offset); err != nil {
+ return 0, nil, err
+ }
+
+ // Use the destination offset.
+ opts.DstOffset = true
+ opts.DstStart = offset
+ }
+ case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
+ if outOffset != 0 {
+ return 0, nil, syserror.ESPIPE
+ }
+ if inOffset != 0 {
+ if !inFile.Flags().Pread {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var offset int64
+ if _, err := t.CopyIn(inOffset, &offset); err != nil {
+ return 0, nil, err
+ }
+
+ // Use the source offset.
+ opts.SrcOffset = true
+ opts.SrcStart = offset
+ }
+ case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
+ if inOffset != 0 || outOffset != 0 {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // We may not refer to the same pipe; otherwise it's a continuous loop.
+ if inFileAttr.InodeID == outFileAttr.InodeID {
+ return 0, nil, syserror.EINVAL
+ }
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Splice data.
+ n, err := doSplice(t, outFile, inFile, opts, nonBlock)
+
+ // Special files can have additional requirements for granularity. For
+ // example, read from eventfd returns EINVAL if a size is less 8 bytes.
+ // Inotify is another example. read will return EINVAL is a buffer is
+ // too small to return the next event, but a size of an event isn't
+ // fixed, it is sizeof(struct inotify_event) + {NAME_LEN} + 1.
+ if n != 0 && err != nil && (fs.IsAnonymous(inFileAttr) || fs.IsAnonymous(outFileAttr)) {
+ err = nil
+ }
+
+ // See above; inFile is chosen arbitrarily here.
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
+}
+
+// Tee imlements tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ outFD := args[1].Int()
+ count := int64(args[2].SizeT())
+ flags := args[3].Int()
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get files.
+ outFile := t.GetFile(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ inFile := t.GetFile(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+
+ // All files must be pipes.
+ if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // We may not refer to the same pipe; see above.
+ if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // Splice data.
+ n, err := doSplice(t, outFile, inFile, fs.SpliceOpts{
+ Length: count,
+ Dup: true,
+ }, nonBlock)
+
+ // Tee doesn't change a state of inFD, so it can't lose any data.
+ if n != 0 {
+ err = nil
+ }
+
+ // See above; inFile is chosen arbitrarily here.
+ return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
new file mode 100644
index 000000000..46ebf27a2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -0,0 +1,290 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// Stat implements linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ statAddr := args[1].Pointer()
+
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return stat(t, d, dirPath, statAddr)
+ })
+}
+
+// Fstatat implements linux syscall newfstatat, i.e. fstatat(2).
+func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ statAddr := args[2].Pointer()
+ flags := args[3].Int()
+
+ path, dirPath, err := copyInPath(t, addr, flags&linux.AT_EMPTY_PATH != 0)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if path == "" {
+ // Annoying. What's wrong with fstat?
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, fstat(t, file, statAddr)
+ }
+
+ // If the path ends in a slash (i.e. dirPath is true) or if AT_SYMLINK_NOFOLLOW is unset,
+ // then we must resolve the final component.
+ resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0
+
+ return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return stat(t, d, dirPath, statAddr)
+ })
+}
+
+// Lstat implements linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ statAddr := args[1].Pointer()
+
+ path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // If the path ends in a slash (i.e. dirPath is true), then we *do*
+ // want to resolve the final component.
+ resolve := dirPath
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return stat(t, d, dirPath, statAddr)
+ })
+}
+
+// Fstat implements linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ statAddr := args[1].Pointer()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, fstat(t, file, statAddr)
+}
+
+// stat implements stat from the given *fs.Dirent.
+func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+ uattr, err := d.Inode.UnstableAttr(t)
+ if err != nil {
+ return err
+ }
+ s := statFromAttrs(t, d.Inode.StableAttr, uattr)
+ _, err = s.CopyOut(t, statAddr)
+ return err
+}
+
+// fstat implements fstat for the given *fs.File.
+func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
+ uattr, err := f.UnstableAttr(t)
+ if err != nil {
+ return err
+ }
+ s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
+ _, err = s.CopyOut(t, statAddr)
+ return err
+}
+
+// Statx implements linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ flags := args[2].Int()
+ mask := args[3].Uint()
+ statxAddr := args[4].Pointer()
+
+ if mask&linux.STATX__RESERVED != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, dirPath, err := copyInPath(t, pathAddr, flags&linux.AT_EMPTY_PATH != 0)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if path == "" {
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+ uattr, err := file.UnstableAttr(t)
+ if err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, statx(t, file.Dirent.Inode.StableAttr, uattr, statxAddr)
+ }
+
+ resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0
+
+ return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+ uattr, err := d.Inode.UnstableAttr(t)
+ if err != nil {
+ return err
+ }
+ return statx(t, d.Inode.StableAttr, uattr, statxAddr)
+ })
+}
+
+func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr usermem.Addr) error {
+ // "[T]he kernel may return fields that weren't requested and may fail to
+ // return fields that were requested, depending on what the backing
+ // filesystem supports.
+ // [...]
+ // A filesystem may also fill in fields that the caller didn't ask for
+ // if it has values for them available and the information is available
+ // at no extra cost. If this happens, the corresponding bits will be
+ // set in stx_mask." -- statx(2)
+ //
+ // We fill in all the values we have (which currently does not include
+ // btime, see b/135608823), regardless of what the user asked for. The
+ // STATX_BASIC_STATS mask indicates that all fields are present except
+ // for btime.
+
+ devMajor, devMinor := linux.DecodeDeviceID(uint32(sattr.DeviceID))
+ s := linux.Statx{
+ // TODO(b/135608823): Support btime, and then change this to
+ // STATX_ALL to indicate presence of btime.
+ Mask: linux.STATX_BASIC_STATS,
+
+ // No attributes, and none supported.
+ Attributes: 0,
+ AttributesMask: 0,
+
+ Blksize: uint32(sattr.BlockSize),
+ Nlink: uint32(uattr.Links),
+ UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+ GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+ Mode: uint16(sattr.Type.LinuxType()) | uint16(uattr.Perms.LinuxMode()),
+ Ino: sattr.InodeID,
+ Size: uint64(uattr.Size),
+ Blocks: uint64(uattr.Usage) / 512,
+ Atime: uattr.AccessTime.StatxTimestamp(),
+ Ctime: uattr.StatusChangeTime.StatxTimestamp(),
+ Mtime: uattr.ModificationTime.StatxTimestamp(),
+ RdevMajor: uint32(sattr.DeviceFileMajor),
+ RdevMinor: sattr.DeviceFileMinor,
+ DevMajor: uint32(devMajor),
+ DevMinor: devMinor,
+ }
+ _, err := t.CopyOut(statxAddr, &s)
+ return err
+}
+
+// Statfs implements linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ statfsAddr := args[1].Pointer()
+
+ path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return statfsImpl(t, d, statfsAddr)
+ })
+}
+
+// Fstatfs implements linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ statfsAddr := args[1].Pointer()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, statfsImpl(t, file.Dirent, statfsAddr)
+}
+
+// statfsImpl implements the linux syscall statfs and fstatfs based on a Dirent,
+// copying the statfs structure out to addr on success, otherwise an error is
+// returned.
+func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
+ info, err := d.Inode.StatFS(t)
+ if err != nil {
+ return err
+ }
+ // Construct the statfs structure and copy it out.
+ statfs := linux.Statfs{
+ Type: info.Type,
+ // Treat block size and fragment size as the same, as
+ // most consumers of this structure will expect one
+ // or the other to be filled in.
+ BlockSize: d.Inode.StableAttr.BlockSize,
+ Blocks: info.TotalBlocks,
+ // We don't have the concept of reserved blocks, so
+ // report blocks free the same as available blocks.
+ // This is a normal thing for filesystems, to do, see
+ // udf, hugetlbfs, tmpfs, among others.
+ BlocksFree: info.FreeBlocks,
+ BlocksAvailable: info.FreeBlocks,
+ Files: info.TotalFiles,
+ FilesFree: info.FreeFiles,
+ // Same as Linux for simple_statfs, see fs/libfs.c.
+ NameLength: linux.NAME_MAX,
+ FragmentSize: d.Inode.StableAttr.BlockSize,
+ // Leave other fields 0 like simple_statfs does.
+ }
+ _, err = t.CopyOut(addr, &statfs)
+ return err
+}
+
+// LINT.ThenChange(vfs2/stat.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..0a04a6113
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+ return linux.Stat{
+ Dev: sattr.DeviceID,
+ Ino: sattr.InodeID,
+ Nlink: uattr.Links,
+ Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+ UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+ GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+ Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+ Size: uattr.Size,
+ Blksize: sattr.BlockSize,
+ Blocks: uattr.Usage / 512,
+ ATime: uattr.AccessTime.Timespec(),
+ MTime: uattr.ModificationTime.Timespec(),
+ CTime: uattr.StatusChangeTime.Timespec(),
+ }
+}
+
+// LINT.ThenChange(vfs2/stat_amd64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..5a3b1bfad
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+ return linux.Stat{
+ Dev: sattr.DeviceID,
+ Ino: sattr.InodeID,
+ Nlink: uint32(uattr.Links),
+ Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+ UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+ GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+ Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+ Size: uattr.Size,
+ Blksize: int32(sattr.BlockSize),
+ Blocks: uattr.Usage / 512,
+ ATime: uattr.AccessTime.Timespec(),
+ MTime: uattr.ModificationTime.Timespec(),
+ CTime: uattr.StatusChangeTime.Timespec(),
+ }
+}
+
+// LINT.ThenChange(vfs2/stat_arm64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
new file mode 100644
index 000000000..5ad465ae3
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -0,0 +1,141 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// LINT.IfChange
+
+// Sync implements linux system call sync(2).
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ t.MountNamespace().SyncAll(t)
+ // Sync is always successful.
+ return 0, nil, nil
+}
+
+// Syncfs implements linux system call syncfs(2).
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Use "sync-the-world" for now, it's guaranteed that fd is at least
+ // on the root filesystem.
+ return Sync(t, args)
+}
+
+// Fsync implements linux syscall fsync(2).
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll)
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// Fdatasync implements linux syscall fdatasync(2).
+//
+// At the moment, it just calls Fsync, which is a big hammer, but correct.
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData)
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// SyncFileRange implements linux syscall sync_file_rage(2)
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ var err error
+
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ nbytes := args[2].Int64()
+ uflags := args[3].Uint()
+
+ if offset < 0 || offset+nbytes < offset {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if uflags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|
+ linux.SYNC_FILE_RANGE_WRITE|
+ linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if nbytes == 0 {
+ nbytes = fs.FileMaxOffset
+ }
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the
+ // specified range that have already been submitted to the device
+ // driver for write-out before performing any write.
+ if uflags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 &&
+ uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ // SYNC_FILE_RANGE_WRITE initiates write-out of all dirty pages in the
+ // specified range which are not presently submitted write-out.
+ //
+ // It looks impossible to implement this functionality without a
+ // massive rework of the vfs subsystem. file.Fsync() take a file lock
+ // for the entire operation, so even if it is running in a go routing,
+ // it blocks other file operations instead of flushing data in the
+ // background.
+ //
+ // It should be safe to skipped this flag while nobody uses
+ // SYNC_FILE_RANGE_WAIT_BEFORE.
+
+ // SYNC_FILE_RANGE_WAIT_AFTER waits upon write-out of all pages in the
+ // range after performing any write.
+ //
+ // In Linux, sync_file_range() doesn't writes out the file's
+ // meta-data, but fdatasync() does if a file size is changed.
+ if uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
+ err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData)
+ }
+
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
new file mode 100644
index 000000000..297de052a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -0,0 +1,48 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+)
+
+// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo.
+func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ mf := t.Kernel().MemoryFile()
+ mf.UpdateUsage()
+ _, totalUsage := usage.MemoryAccounting.Copy()
+ totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
+ memFree := totalSize - totalUsage
+ if memFree > totalSize {
+ // Underflow.
+ memFree = 0
+ }
+
+ // Only a subset of the fields in sysinfo_t make sense to return.
+ si := linux.Sysinfo{
+ Procs: uint16(len(t.PIDNamespace().Tasks())),
+ Uptime: t.Kernel().MonotonicClock().Now().Seconds(),
+ TotalRAM: totalSize,
+ FreeRAM: memFree,
+ Unit: 1,
+ }
+ _, err := t.CopyOut(addr, si)
+ return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go
new file mode 100644
index 000000000..40c8bb061
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_syslog.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ _SYSLOG_ACTION_READ_ALL = 3
+ _SYSLOG_ACTION_SIZE_BUFFER = 10
+)
+
+// logBufLen is the default syslog buffer size on Linux.
+const logBufLen = 1 << 17
+
+// Syslog implements part of Linux syscall syslog.
+//
+// Only the unpriviledged commands are implemented, allowing applications to
+// read a fun dmesg.
+func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ command := args[0].Int()
+ buf := args[1].Pointer()
+ size := int(args[2].Int())
+
+ switch command {
+ case _SYSLOG_ACTION_READ_ALL:
+ if size < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if size > logBufLen {
+ size = logBufLen
+ }
+
+ log := t.Kernel().Syslog().Log()
+ if len(log) > size {
+ log = log[:size]
+ }
+
+ n, err := t.CopyOutBytes(buf, log)
+ return uintptr(n), nil, err
+ case _SYSLOG_ACTION_SIZE_BUFFER:
+ return logBufLen, nil, nil
+ default:
+ return 0, nil, syserror.ENOSYS
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
new file mode 100644
index 000000000..00915fdde
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -0,0 +1,769 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "path"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+ "gvisor.dev/gvisor/pkg/sentry/loader"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ // ExecMaxTotalSize is the maximum length of all argv and envv entries.
+ //
+ // N.B. The behavior here is different than Linux. Linux provides a limit on
+ // individual arguments of 32 pages, and an aggregate limit of at least 32 pages
+ // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
+ // any behavior based on the stack size, and instead provide a fixed hard-limit of
+ // 2 MB (which should work well given that 8 MB stack limits are common).
+ ExecMaxTotalSize = 2 * 1024 * 1024
+
+ // ExecMaxElemSize is the maximum length of a single argv or envv entry.
+ ExecMaxElemSize = 32 * usermem.PageSize
+
+ // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux.
+ exitSignalMask = 0xff
+)
+
+// Getppid implements linux syscall getppid(2).
+func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ parent := t.Parent()
+ if parent == nil {
+ return 0, nil, nil
+ }
+ return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil
+}
+
+// Getpid implements linux syscall getpid(2).
+func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return uintptr(t.ThreadGroup().ID()), nil, nil
+}
+
+// Gettid implements linux syscall gettid(2).
+func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return uintptr(t.ThreadID()), nil, nil
+}
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ filenameAddr := args[0].Pointer()
+ argvAddr := args[1].Pointer()
+ envvAddr := args[2].Pointer()
+
+ return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirFD := args[0].Int()
+ pathnameAddr := args[1].Pointer()
+ argvAddr := args[2].Pointer()
+ envvAddr := args[3].Pointer()
+ flags := args[4].Int()
+
+ return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+ pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var argv, envv []string
+ if argvAddr != 0 {
+ var err error
+ argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+ if envvAddr != 0 {
+ var err error
+ envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
+ if !atEmptyPath && len(pathname) == 0 {
+ return 0, nil, syserror.ENOENT
+ }
+ resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
+
+ root := t.FSContext().RootDirectory()
+ defer root.DecRef()
+
+ var wd *fs.Dirent
+ var executable fsbridge.File
+ var closeOnExec bool
+ if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
+ // Even if the pathname is absolute, we may still need the wd
+ // for interpreter scripts if the path of the interpreter is
+ // relative.
+ wd = t.FSContext().WorkingDirectory()
+ } else {
+ // Need to extract the given FD.
+ f, fdFlags := t.FDTable().Get(dirFD)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+ closeOnExec = fdFlags.CloseOnExec
+
+ if atEmptyPath && len(pathname) == 0 {
+ // TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+ // not read. However, our backing filesystems may prevent us from reading
+ // the file without read permission. Additionally, a task with a
+ // non-readable executable has additional constraints on access via
+ // ptrace and procfs.
+ if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
+ return 0, nil, err
+ }
+ executable = fsbridge.NewFSFile(f)
+ } else {
+ wd = f.Dirent
+ wd.IncRef()
+ if !fs.IsDir(wd.Inode.StableAttr) {
+ return 0, nil, syserror.ENOTDIR
+ }
+ }
+ }
+ if wd != nil {
+ defer wd.DecRef()
+ }
+
+ // Load the new TaskContext.
+ remainingTraversals := uint(linux.MaxSymlinkTraversals)
+ loadArgs := loader.LoadArgs{
+ Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
+ RemainingTraversals: &remainingTraversals,
+ ResolveFinal: resolveFinal,
+ Filename: pathname,
+ File: executable,
+ CloseOnExec: closeOnExec,
+ Argv: argv,
+ Envv: envv,
+ Features: t.Arch().FeatureSet(),
+ }
+
+ tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+ if se != nil {
+ return 0, nil, se.ToError()
+ }
+
+ ctrl, err := t.Execve(tc)
+ return 0, ctrl, err
+}
+
+// Exit implements linux syscall exit(2).
+func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ status := int(args[0].Int())
+ t.PrepareExit(kernel.ExitStatus{Code: status})
+ return 0, kernel.CtrlDoExit, nil
+}
+
+// ExitGroup implements linux syscall exit_group(2).
+func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ status := int(args[0].Int())
+ t.PrepareGroupExit(kernel.ExitStatus{Code: status})
+ return 0, kernel.CtrlDoExit, nil
+}
+
+// clone is used by Clone, Fork, and VFork.
+func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) {
+ opts := kernel.CloneOptions{
+ SharingOptions: kernel.SharingOptions{
+ NewAddressSpace: flags&linux.CLONE_VM == 0,
+ NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0,
+ NewThreadGroup: flags&linux.CLONE_THREAD == 0,
+ TerminationSignal: linux.Signal(flags & exitSignalMask),
+ NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
+ NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
+ NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
+ NewFiles: flags&linux.CLONE_FILES == 0,
+ NewFSContext: flags&linux.CLONE_FS == 0,
+ NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
+ NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
+ },
+ Stack: stack,
+ SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS,
+ TLS: tls,
+ ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID,
+ ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID,
+ ChildTID: childTID,
+ ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID,
+ ParentTID: parentTID,
+ Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK,
+ Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED,
+ InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE,
+ }
+ ntid, ctrl, err := t.Clone(&opts)
+ return uintptr(ntid), ctrl, err
+}
+
+// Fork implements Linux syscall fork(2).
+func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // "A call to fork() is equivalent to a call to clone(2) specifying flags
+ // as just SIGCHLD." - fork(2)
+ return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0)
+}
+
+// Vfork implements Linux syscall vfork(2).
+func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // """
+ // A call to vfork() is equivalent to calling clone(2) with flags specified as:
+ //
+ // CLONE_VM | CLONE_VFORK | SIGCHLD
+ // """ - vfork(2)
+ return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0)
+}
+
+// parseCommonWaitOptions applies the options common to wait4 and waitid to
+// wopts.
+func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
+ switch options & (linux.WCLONE | linux.WALL) {
+ case 0:
+ wopts.NonCloneTasks = true
+ case linux.WCLONE:
+ wopts.CloneTasks = true
+ case linux.WALL:
+ wopts.NonCloneTasks = true
+ wopts.CloneTasks = true
+ default:
+ return syserror.EINVAL
+ }
+ if options&linux.WCONTINUED != 0 {
+ wopts.Events |= kernel.EventGroupContinue
+ }
+ if options&linux.WNOHANG == 0 {
+ wopts.BlockInterruptErr = kernel.ERESTARTSYS
+ }
+ if options&linux.WNOTHREAD == 0 {
+ wopts.SiblingChildren = true
+ }
+ return nil
+}
+
+// wait4 waits for the given child process to exit.
+func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) {
+ if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
+ return 0, syserror.EINVAL
+ }
+ wopts := kernel.WaitOptions{
+ Events: kernel.EventExit | kernel.EventTraceeStop,
+ ConsumeEvent: true,
+ }
+ // There are four cases to consider:
+ //
+ // pid < -1 any child process whose process group ID is equal to the absolute value of pid
+ // pid == -1 any child process
+ // pid == 0 any child process whose process group ID is equal to that of the calling process
+ // pid > 0 the child whose process ID is equal to the value of pid
+ switch {
+ case pid < -1:
+ wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
+ case pid == -1:
+ // Any process is the default.
+ case pid == 0:
+ wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
+ default:
+ wopts.SpecificTID = kernel.ThreadID(pid)
+ }
+
+ if err := parseCommonWaitOptions(&wopts, options); err != nil {
+ return 0, err
+ }
+ if options&linux.WUNTRACED != 0 {
+ wopts.Events |= kernel.EventChildGroupStop
+ }
+
+ wr, err := t.Wait(&wopts)
+ if err != nil {
+ if err == kernel.ErrNoWaitableEvent {
+ return 0, nil
+ }
+ return 0, err
+ }
+ if statusAddr != 0 {
+ if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
+ return 0, err
+ }
+ }
+ if rusageAddr != 0 {
+ ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
+ if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+ return 0, err
+ }
+ }
+ return uintptr(wr.TID), nil
+}
+
+// Wait4 implements linux syscall wait4(2).
+func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := int(args[0].Int())
+ statusAddr := args[1].Pointer()
+ options := int(args[2].Uint())
+ rusageAddr := args[3].Pointer()
+
+ n, err := wait4(t, pid, statusAddr, options, rusageAddr)
+ return n, nil, err
+}
+
+// WaitPid implements linux syscall waitpid(2).
+func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pid := int(args[0].Int())
+ statusAddr := args[1].Pointer()
+ options := int(args[2].Uint())
+
+ n, err := wait4(t, pid, statusAddr, options, 0)
+ return n, nil, err
+}
+
+// Waitid implements linux syscall waitid(2).
+func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ idtype := args[0].Int()
+ id := args[1].Int()
+ infop := args[2].Pointer()
+ options := int(args[3].Uint())
+ rusageAddr := args[4].Pointer()
+
+ if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ wopts := kernel.WaitOptions{
+ Events: kernel.EventTraceeStop,
+ ConsumeEvent: options&linux.WNOWAIT == 0,
+ }
+ switch idtype {
+ case linux.P_ALL:
+ case linux.P_PID:
+ wopts.SpecificTID = kernel.ThreadID(id)
+ case linux.P_PGID:
+ wopts.SpecificPGID = kernel.ProcessGroupID(id)
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ if err := parseCommonWaitOptions(&wopts, options); err != nil {
+ return 0, nil, err
+ }
+ if options&linux.WEXITED != 0 {
+ wopts.Events |= kernel.EventExit
+ }
+ if options&linux.WSTOPPED != 0 {
+ wopts.Events |= kernel.EventChildGroupStop
+ }
+
+ wr, err := t.Wait(&wopts)
+ if err != nil {
+ if err == kernel.ErrNoWaitableEvent {
+ err = nil
+ // "If WNOHANG was specified in options and there were no children
+ // in a waitable state, then waitid() returns 0 immediately and the
+ // state of the siginfo_t structure pointed to by infop is
+ // unspecified." - waitid(2). But Linux's waitid actually zeroes
+ // out the fields it would set for a successful waitid in this case
+ // as well.
+ if infop != 0 {
+ var si arch.SignalInfo
+ _, err = t.CopyOut(infop, &si)
+ }
+ }
+ return 0, nil, err
+ }
+ if rusageAddr != 0 {
+ ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
+ if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+ return 0, nil, err
+ }
+ }
+ if infop == 0 {
+ return 0, nil, nil
+ }
+ si := arch.SignalInfo{
+ Signo: int32(linux.SIGCHLD),
+ }
+ si.SetPid(int32(wr.TID))
+ si.SetUid(int32(wr.UID))
+ // TODO(b/73541790): convert kernel.ExitStatus to functions and make
+ // WaitResult.Status a linux.WaitStatus.
+ s := syscall.WaitStatus(wr.Status)
+ switch {
+ case s.Exited():
+ si.Code = arch.CLD_EXITED
+ si.SetStatus(int32(s.ExitStatus()))
+ case s.Signaled():
+ si.Code = arch.CLD_KILLED
+ si.SetStatus(int32(s.Signal()))
+ case s.CoreDump():
+ si.Code = arch.CLD_DUMPED
+ si.SetStatus(int32(s.Signal()))
+ case s.Stopped():
+ if wr.Event == kernel.EventTraceeStop {
+ si.Code = arch.CLD_TRAPPED
+ si.SetStatus(int32(s.TrapCause()))
+ } else {
+ si.Code = arch.CLD_STOPPED
+ si.SetStatus(int32(s.StopSignal()))
+ }
+ case s.Continued():
+ si.Code = arch.CLD_CONTINUED
+ si.SetStatus(int32(linux.SIGCONT))
+ default:
+ t.Warningf("waitid got incomprehensible wait status %d", s)
+ }
+ _, err = t.CopyOut(infop, &si)
+ return 0, nil, err
+}
+
+// SetTidAddress implements linux syscall set_tid_address(2).
+func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ // Always succeed, return caller's tid.
+ t.SetClearTID(addr)
+ return uintptr(t.ThreadID()), nil, nil
+}
+
+// Unshare implements linux syscall unshare(2).
+func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ opts := kernel.SharingOptions{
+ NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM,
+ NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND,
+ NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD,
+ NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
+ NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
+ NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
+ NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES,
+ NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS,
+ NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
+ NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
+ }
+ // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
+ if opts.NewPIDNamespace {
+ opts.NewThreadGroup = true
+ }
+ // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
+ // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
+ if opts.NewUserNamespace {
+ opts.NewThreadGroup = true
+ opts.NewFSContext = true
+ }
+ return 0, nil, t.Unshare(&opts)
+}
+
+// SchedYield implements linux syscall sched_yield(2).
+func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ t.Yield()
+ return 0, nil, nil
+}
+
+// SchedSetaffinity implements linux syscall sched_setaffinity(2).
+func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tid := args[0].Int()
+ size := args[1].SizeT()
+ maskAddr := args[2].Pointer()
+
+ var task *kernel.Task
+ if tid == 0 {
+ task = t
+ } else {
+ task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
+ if task == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ }
+
+ mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
+ if size > mask.Size() {
+ size = mask.Size()
+ }
+ if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, task.SetCPUMask(mask)
+}
+
+// SchedGetaffinity implements linux syscall sched_getaffinity(2).
+func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tid := args[0].Int()
+ size := args[1].SizeT()
+ maskAddr := args[2].Pointer()
+
+ // This limitation is because linux stores the cpumask
+ // in an array of "unsigned long" so the buffer needs to
+ // be a multiple of the word size.
+ if size&(t.Arch().Width()-1) > 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var task *kernel.Task
+ if tid == 0 {
+ task = t
+ } else {
+ task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
+ if task == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ }
+
+ mask := task.CPUMask()
+ // The buffer needs to be big enough to hold a cpumask with
+ // all possible cpus.
+ if size < mask.Size() {
+ return 0, nil, syserror.EINVAL
+ }
+ _, err := t.CopyOutBytes(maskAddr, mask)
+
+ // NOTE: The syscall interface is slightly different than the glibc
+ // interface. The raw sched_getaffinity syscall returns the number of
+ // bytes used to represent a cpu mask.
+ return uintptr(mask.Size()), nil, err
+}
+
+// Getcpu implements linux syscall getcpu(2).
+func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ cpu := args[0].Pointer()
+ node := args[1].Pointer()
+ // third argument to this system call is nowadays unused.
+
+ if cpu != 0 {
+ buf := t.CopyScratchBuffer(4)
+ usermem.ByteOrder.PutUint32(buf, uint32(t.CPU()))
+ if _, err := t.CopyOutBytes(cpu, buf); err != nil {
+ return 0, nil, err
+ }
+ }
+ // We always return node 0.
+ if node != 0 {
+ if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// Setpgid implements the linux syscall setpgid(2).
+func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // Note that throughout this function, pgid is interpreted with respect
+ // to t's namespace, not with respect to the selected ThreadGroup's
+ // namespace (which may be different).
+ pid := kernel.ThreadID(args[0].Int())
+ pgid := kernel.ProcessGroupID(args[1].Int())
+
+ // "If pid is zero, then the process ID of the calling process is used."
+ tg := t.ThreadGroup()
+ if pid != 0 {
+ ot := t.PIDNamespace().TaskWithID(pid)
+ if ot == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ tg = ot.ThreadGroup()
+ if tg.Leader() != ot {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Setpgid only operates on child threadgroups.
+ if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
+ return 0, nil, syserror.ESRCH
+ }
+ }
+
+ // "If pgid is zero, then the PGID of the process specified by pid is made
+ // the same as its process ID."
+ defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
+ if pgid == 0 {
+ pgid = defaultPGID
+ } else if pgid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // If the pgid is the same as the group, then create a new one. Otherwise,
+ // we attempt to join an existing process group.
+ if pgid == defaultPGID {
+ // For convenience, errors line up with Linux syscall API.
+ if err := tg.CreateProcessGroup(); err != nil {
+ // Is the process group already as expected? If so,
+ // just return success. This is the same behavior as
+ // Linux.
+ if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
+ return 0, nil, nil
+ }
+ return 0, nil, err
+ }
+ } else {
+ // Same as CreateProcessGroup, above.
+ if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
+ // See above.
+ if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
+ return 0, nil, nil
+ }
+ return 0, nil, err
+ }
+ }
+
+ // Success.
+ return 0, nil, nil
+}
+
+// Getpgrp implements the linux syscall getpgrp(2).
+func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
+}
+
+// Getpgid implements the linux syscall getpgid(2).
+func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tid := kernel.ThreadID(args[0].Int())
+ if tid == 0 {
+ return Getpgrp(t, args)
+ }
+
+ target := t.PIDNamespace().TaskWithID(tid)
+ if target == nil {
+ return 0, nil, syserror.ESRCH
+ }
+
+ return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
+}
+
+// Setsid implements the linux syscall setsid(2).
+func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, t.ThreadGroup().CreateSession()
+}
+
+// Getsid implements the linux syscall getsid(2).
+func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tid := kernel.ThreadID(args[0].Int())
+ if tid == 0 {
+ return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil
+ }
+
+ target := t.PIDNamespace().TaskWithID(tid)
+ if target == nil {
+ return 0, nil, syserror.ESRCH
+ }
+
+ return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil
+}
+
+// Getpriority pretends to implement the linux syscall getpriority(2).
+//
+// This is a stub; real priorities require a full scheduler.
+func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ which := args[0].Int()
+ who := kernel.ThreadID(args[1].Int())
+
+ switch which {
+ case linux.PRIO_PROCESS:
+ // Look for who, return ESRCH if not found.
+ var task *kernel.Task
+ if who == 0 {
+ task = t
+ } else {
+ task = t.PIDNamespace().TaskWithID(who)
+ }
+
+ if task == nil {
+ return 0, nil, syserror.ESRCH
+ }
+
+ // From kernel/sys.c:getpriority:
+ // "To avoid negative return values, 'getpriority()'
+ // will not return the normal nice-value, but a negated
+ // value that has been offset by 20"
+ return uintptr(20 - task.Niceness()), nil, nil
+ case linux.PRIO_USER:
+ fallthrough
+ case linux.PRIO_PGRP:
+ // PRIO_USER and PRIO_PGRP have no further implementation yet.
+ return 0, nil, nil
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+// Setpriority pretends to implement the linux syscall setpriority(2).
+//
+// This is a stub; real priorities require a full scheduler.
+func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ which := args[0].Int()
+ who := kernel.ThreadID(args[1].Int())
+ niceval := int(args[2].Int())
+
+ // In the kernel's implementation, values outside the range
+ // of [-20, 19] are truncated to these minimum and maximum
+ // values.
+ if niceval < -20 /* min niceval */ {
+ niceval = -20
+ } else if niceval > 19 /* max niceval */ {
+ niceval = 19
+ }
+
+ switch which {
+ case linux.PRIO_PROCESS:
+ // Look for who, return ESRCH if not found.
+ var task *kernel.Task
+ if who == 0 {
+ task = t
+ } else {
+ task = t.PIDNamespace().TaskWithID(who)
+ }
+
+ if task == nil {
+ return 0, nil, syserror.ESRCH
+ }
+
+ task.SetNiceness(niceval)
+ case linux.PRIO_USER:
+ fallthrough
+ case linux.PRIO_PGRP:
+ // PRIO_USER and PRIO_PGRP have no further implementation yet.
+ return 0, nil, nil
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
+
+// Ptrace implements linux system call ptrace(2).
+func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ req := args[0].Int64()
+ pid := kernel.ThreadID(args[1].Int())
+ addr := args[2].Pointer()
+ data := args[3].Pointer()
+
+ return 0, nil, t.Ptrace(req, pid, addr, data)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
new file mode 100644
index 000000000..2d2aa0819
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -0,0 +1,342 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "fmt"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// The most significant 29 bits hold either a pid or a file descriptor.
+func pidOfClockID(c int32) kernel.ThreadID {
+ return kernel.ThreadID(^(c >> 3))
+}
+
+// whichCPUClock returns one of CPUCLOCK_PERF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or
+// CLOCK_FD.
+func whichCPUClock(c int32) int32 {
+ return c & linux.CPUCLOCK_CLOCK_MASK
+}
+
+// isCPUClockPerThread returns true if the CPUCLOCK_PERTHREAD bit is set in the
+// clock id.
+func isCPUClockPerThread(c int32) bool {
+ return c&linux.CPUCLOCK_PERTHREAD_MASK != 0
+}
+
+// isValidCPUClock returns checks that the cpu clock id is valid.
+func isValidCPUClock(c int32) bool {
+ // Bits 0, 1, and 2 cannot all be set.
+ if c&7 == 7 {
+ return false
+ }
+ if whichCPUClock(c) >= linux.CPUCLOCK_MAX {
+ return false
+ }
+ return true
+}
+
+// targetTask returns the kernel.Task for the given clock id.
+func targetTask(t *kernel.Task, c int32) *kernel.Task {
+ pid := pidOfClockID(c)
+ if pid == 0 {
+ return t
+ }
+ return t.PIDNamespace().TaskWithID(pid)
+}
+
+// ClockGetres implements linux syscall clock_getres(2).
+func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := int32(args[0].Int())
+ addr := args[1].Pointer()
+ r := linux.Timespec{
+ Sec: 0,
+ Nsec: 1,
+ }
+
+ if _, err := getClock(t, clockID); err != nil {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if addr == 0 {
+ // Don't need to copy out.
+ return 0, nil, nil
+ }
+
+ return 0, nil, copyTimespecOut(t, addr, &r)
+}
+
+type cpuClocker interface {
+ UserCPUClock() ktime.Clock
+ CPUClock() ktime.Clock
+}
+
+func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) {
+ if clockID < 0 {
+ if !isValidCPUClock(clockID) {
+ return nil, syserror.EINVAL
+ }
+
+ targetTask := targetTask(t, clockID)
+ if targetTask == nil {
+ return nil, syserror.EINVAL
+ }
+
+ var target cpuClocker
+ if isCPUClockPerThread(clockID) {
+ target = targetTask
+ } else {
+ target = targetTask.ThreadGroup()
+ }
+
+ switch whichCPUClock(clockID) {
+ case linux.CPUCLOCK_VIRT:
+ return target.UserCPUClock(), nil
+ case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED:
+ // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF.
+ return target.CPUClock(), nil
+ default:
+ return nil, syserror.EINVAL
+ }
+ }
+
+ switch clockID {
+ case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE:
+ return t.Kernel().RealtimeClock(), nil
+ case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE,
+ linux.CLOCK_MONOTONIC_RAW, linux.CLOCK_BOOTTIME:
+ // CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW.
+ // CLOCK_BOOTTIME is internally mapped to CLOCK_MONOTONIC, as:
+ // - CLOCK_BOOTTIME should behave as CLOCK_MONOTONIC while also
+ // including suspend time.
+ // - gVisor has no concept of suspend/resume.
+ // - CLOCK_MONOTONIC already includes save/restore time, which is
+ // the closest to suspend time.
+ return t.Kernel().MonotonicClock(), nil
+ case linux.CLOCK_PROCESS_CPUTIME_ID:
+ return t.ThreadGroup().CPUClock(), nil
+ case linux.CLOCK_THREAD_CPUTIME_ID:
+ return t.CPUClock(), nil
+ default:
+ return nil, syserror.EINVAL
+ }
+}
+
+// ClockGettime implements linux syscall clock_gettime(2).
+func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := int32(args[0].Int())
+ addr := args[1].Pointer()
+
+ c, err := getClock(t, clockID)
+ if err != nil {
+ return 0, nil, err
+ }
+ ts := c.Now().Timespec()
+ return 0, nil, copyTimespecOut(t, addr, &ts)
+}
+
+// ClockSettime implements linux syscall clock_settime(2).
+func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, syserror.EPERM
+}
+
+// Time implements linux syscall time(2).
+func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ r := t.Kernel().RealtimeClock().Now().TimeT()
+ if addr == usermem.Addr(0) {
+ return uintptr(r), nil, nil
+ }
+
+ if _, err := t.CopyOut(addr, r); err != nil {
+ return 0, nil, err
+ }
+ return uintptr(r), nil, nil
+}
+
+// clockNanosleepRestartBlock encapsulates the state required to restart
+// clock_nanosleep(2) via restart_syscall(2).
+//
+// +stateify savable
+type clockNanosleepRestartBlock struct {
+ c ktime.Clock
+ duration time.Duration
+ rem usermem.Addr
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+ return 0, clockNanosleepFor(t, n.c, n.duration, n.rem)
+}
+
+// clockNanosleepUntil blocks until a specified time.
+//
+// If blocking is interrupted, the syscall is restarted with the original
+// arguments.
+func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error {
+ notifier, tchan := ktime.NewChannelNotifier()
+ timer := ktime.NewTimer(c, notifier)
+
+ // Turn on the timer.
+ timer.Swap(ktime.Setting{
+ Period: 0,
+ Enabled: true,
+ Next: ktime.FromTimespec(ts),
+ })
+
+ err := t.BlockWithTimer(nil, tchan)
+
+ timer.Destroy()
+
+ // Did we just block until the timeout happened?
+ if err == syserror.ETIMEDOUT {
+ return nil
+ }
+
+ return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND)
+}
+
+// clockNanosleepFor blocks for a specified duration.
+//
+// If blocking is interrupted, the syscall is restarted with the remaining
+// duration timeout.
+func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem usermem.Addr) error {
+ timer, start, tchan := ktime.After(c, dur)
+
+ err := t.BlockWithTimer(nil, tchan)
+
+ after := c.Now()
+
+ timer.Destroy()
+
+ switch err {
+ case syserror.ETIMEDOUT:
+ // Slept for entire timeout.
+ return nil
+ case syserror.ErrInterrupted:
+ // Interrupted.
+ remaining := dur - after.Sub(start)
+ if remaining < 0 {
+ remaining = time.Duration(0)
+ }
+
+ // Copy out remaining time.
+ if rem != 0 {
+ timeleft := linux.NsecToTimespec(remaining.Nanoseconds())
+ if err := copyTimespecOut(t, rem, &timeleft); err != nil {
+ return err
+ }
+ }
+
+ // Arrange for a restart with the remaining duration.
+ t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{
+ c: c,
+ duration: remaining,
+ rem: rem,
+ })
+ return kernel.ERESTART_RESTARTBLOCK
+ default:
+ panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err))
+ }
+}
+
+// Nanosleep implements linux syscall Nanosleep(2).
+func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ rem := args[1].Pointer()
+
+ ts, err := copyTimespecIn(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if !ts.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Just like linux, we cap the timeout with the max number that int64 can
+ // represent which is roughly 292 years.
+ dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond
+ return 0, nil, clockNanosleepFor(t, t.Kernel().MonotonicClock(), dur, rem)
+}
+
+// ClockNanosleep implements linux syscall clock_nanosleep(2).
+func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := int32(args[0].Int())
+ flags := args[1].Int()
+ addr := args[2].Pointer()
+ rem := args[3].Pointer()
+
+ req, err := copyTimespecIn(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if !req.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Only allow clock constants also allowed by Linux.
+ if clockID > 0 {
+ if clockID != linux.CLOCK_REALTIME &&
+ clockID != linux.CLOCK_MONOTONIC &&
+ clockID != linux.CLOCK_PROCESS_CPUTIME_ID {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+
+ c, err := getClock(t, clockID)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if flags&linux.TIMER_ABSTIME != 0 {
+ return 0, nil, clockNanosleepUntil(t, c, req)
+ }
+
+ dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond
+ return 0, nil, clockNanosleepFor(t, c, dur, rem)
+}
+
+// Gettimeofday implements linux syscall gettimeofday(2).
+func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ tv := args[0].Pointer()
+ tz := args[1].Pointer()
+
+ if tv != usermem.Addr(0) {
+ nowTv := t.Kernel().RealtimeClock().Now().Timeval()
+ if err := copyTimevalOut(t, tv, &nowTv); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if tz != usermem.Addr(0) {
+ // Ask the time package for the timezone.
+ _, offset := time.Now().Zone()
+ // This int32 array mimics linux's struct timezone.
+ timezone := [2]int32{-int32(offset) / 60, 0}
+ _, err := t.CopyOut(tz, timezone)
+ return 0, nil, err
+ }
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
new file mode 100644
index 000000000..a4c400f87
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -0,0 +1,203 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const nsecPerSec = int64(time.Second)
+
+// copyItimerValIn copies an ItimerVal from the untrusted app range to the
+// kernel. The ItimerVal may be either 32 or 64 bits.
+// A NULL address is allowed because because Linux allows
+// setitimer(which, NULL, &old_value) which disables the timer.
+// There is a KERN_WARN message saying this misfeature will be removed.
+// However, that hasn't happened as of 3.19, so we continue to support it.
+func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) {
+ if addr == usermem.Addr(0) {
+ return linux.ItimerVal{}, nil
+ }
+
+ switch t.Arch().Width() {
+ case 8:
+ // Native size, just copy directly.
+ var itv linux.ItimerVal
+ if _, err := t.CopyIn(addr, &itv); err != nil {
+ return linux.ItimerVal{}, err
+ }
+
+ return itv, nil
+ default:
+ return linux.ItimerVal{}, syserror.ENOSYS
+ }
+}
+
+// copyItimerValOut copies an ItimerVal to the untrusted app range.
+// The ItimerVal may be either 32 or 64 bits.
+// A NULL address is allowed, in which case no copy takes place
+func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error {
+ if addr == usermem.Addr(0) {
+ return nil
+ }
+
+ switch t.Arch().Width() {
+ case 8:
+ // Native size, just copy directly.
+ _, err := t.CopyOut(addr, itv)
+ return err
+ default:
+ return syserror.ENOSYS
+ }
+}
+
+// Getitimer implements linux syscall getitimer(2).
+func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ timerID := args[0].Int()
+ val := args[1].Pointer()
+
+ olditv, err := t.Getitimer(timerID)
+ if err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, copyItimerValOut(t, val, &olditv)
+}
+
+// Setitimer implements linux syscall setitimer(2).
+func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ timerID := args[0].Int()
+ newVal := args[1].Pointer()
+ oldVal := args[2].Pointer()
+
+ newitv, err := copyItimerValIn(t, newVal)
+ if err != nil {
+ return 0, nil, err
+ }
+ olditv, err := t.Setitimer(timerID, newitv)
+ if err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, copyItimerValOut(t, oldVal, &olditv)
+}
+
+// Alarm implements linux syscall alarm(2).
+func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ duration := time.Duration(args[0].Uint()) * time.Second
+
+ olditv, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{
+ Value: linux.DurationToTimeval(duration),
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ olddur := olditv.Value.ToDuration()
+ secs := olddur.Round(time.Second).Nanoseconds() / nsecPerSec
+ if secs == 0 && olddur != 0 {
+ // We can't return 0 if an alarm was previously scheduled.
+ secs = 1
+ }
+ return uintptr(secs), nil, nil
+}
+
+// TimerCreate implements linux syscall timer_create(2).
+func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := args[0].Int()
+ sevp := args[1].Pointer()
+ timerIDp := args[2].Pointer()
+
+ c, err := getClock(t, clockID)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var sev *linux.Sigevent
+ if sevp != 0 {
+ sev = &linux.Sigevent{}
+ if _, err = t.CopyIn(sevp, sev); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ id, err := t.IntervalTimerCreate(c, sev)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if _, err := t.CopyOut(timerIDp, &id); err != nil {
+ t.IntervalTimerDelete(id)
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// TimerSettime implements linux syscall timer_settime(2).
+func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ timerID := linux.TimerID(args[0].Value)
+ flags := args[1].Int()
+ newValAddr := args[2].Pointer()
+ oldValAddr := args[3].Pointer()
+
+ var newVal linux.Itimerspec
+ if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ return 0, nil, err
+ }
+ oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0)
+ if err != nil {
+ return 0, nil, err
+ }
+ if oldValAddr != 0 {
+ if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// TimerGettime implements linux syscall timer_gettime(2).
+func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ timerID := linux.TimerID(args[0].Value)
+ curValAddr := args[1].Pointer()
+
+ curVal, err := t.IntervalTimerGettime(timerID)
+ if err != nil {
+ return 0, nil, err
+ }
+ _, err = t.CopyOut(curValAddr, &curVal)
+ return 0, nil, err
+}
+
+// TimerGetoverrun implements linux syscall timer_getoverrun(2).
+func TimerGetoverrun(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ timerID := linux.TimerID(args[0].Value)
+
+ o, err := t.IntervalTimerGetoverrun(timerID)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(o), nil, nil
+}
+
+// TimerDelete implements linux syscall timer_delete(2).
+func TimerDelete(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ timerID := linux.TimerID(args[0].Value)
+ return 0, nil, t.IntervalTimerDelete(timerID)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
new file mode 100644
index 000000000..cf49b43db
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -0,0 +1,121 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := args[0].Int()
+ flags := args[1].Int()
+
+ if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var c ktime.Clock
+ switch clockID {
+ case linux.CLOCK_REALTIME:
+ c = t.Kernel().RealtimeClock()
+ case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+ c = t.Kernel().MonotonicClock()
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+ f := timerfd.NewFile(t, c)
+ defer f.DecRef()
+ f.SetFlags(fs.SettableFileFlags{
+ NonBlocking: flags&linux.TFD_NONBLOCK != 0,
+ })
+
+ fd, err := t.NewFDFrom(0, f, kernel.FDFlags{
+ CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ flags := args[1].Int()
+ newValAddr := args[2].Pointer()
+ oldValAddr := args[3].Pointer()
+
+ if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ tf, ok := f.FileOperations.(*timerfd.TimerOperations)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var newVal linux.Itimerspec
+ if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ return 0, nil, err
+ }
+ newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock())
+ if err != nil {
+ return 0, nil, err
+ }
+ tm, oldS := tf.SetTime(newS)
+ if oldValAddr != 0 {
+ oldVal := ktime.ItimerspecFromSetting(tm, oldS)
+ if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ curValAddr := args[1].Pointer()
+
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ tf, ok := f.FileOperations.(*timerfd.TimerOperations)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ tm, s := tf.GetTime()
+ curVal := ktime.ItimerspecFromSetting(tm, s)
+ _, err := t.CopyOut(curValAddr, &curVal)
+ return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
new file mode 100644
index 000000000..b3eb96a1c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
@@ -0,0 +1,52 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build amd64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ArchPrctl implements linux syscall arch_prctl(2).
+// It sets architecture-specific process or thread state for t.
+func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ switch args[0].Int() {
+ case linux.ARCH_GET_FS:
+ addr := args[1].Pointer()
+ fsbase := t.Arch().TLS()
+ _, err := t.CopyOut(addr, uint64(fsbase))
+ if err != nil {
+ return 0, nil, err
+ }
+
+ case linux.ARCH_SET_FS:
+ fsbase := args[1].Uint64()
+ if !t.Arch().SetTLS(uintptr(fsbase)) {
+ return 0, nil, syserror.EPERM
+ }
+
+ case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
+ t.Kernel().EmitUnimplementedEvent(t)
+ fallthrough
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go
new file mode 100644
index 000000000..fb08a356e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build arm64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ArchPrctl is not defined for ARM64.
+func ArchPrctl(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, syserror.ENOSYS
+}
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
new file mode 100644
index 000000000..e9d702e8e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -0,0 +1,95 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Uname implements linux syscall uname.
+func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ version := t.SyscallTable().Version
+
+ uts := t.UTSNamespace()
+
+ // Fill in structure fields.
+ var u linux.UtsName
+ copy(u.Sysname[:], version.Sysname)
+ copy(u.Nodename[:], uts.HostName())
+ copy(u.Release[:], version.Release)
+ copy(u.Version[:], version.Version)
+ // build tag above.
+ switch t.SyscallTable().Arch {
+ case arch.AMD64:
+ copy(u.Machine[:], "x86_64")
+ case arch.ARM64:
+ copy(u.Machine[:], "aarch64")
+ default:
+ copy(u.Machine[:], "unknown")
+ }
+ copy(u.Domainname[:], uts.DomainName())
+
+ // Copy out the result.
+ va := args[0].Pointer()
+ _, err := t.CopyOut(va, u)
+ return 0, nil, err
+}
+
+// Setdomainname implements Linux syscall setdomainname.
+func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nameAddr := args[0].Pointer()
+ size := args[1].Int()
+
+ utsns := t.UTSNamespace()
+ if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) {
+ return 0, nil, syserror.EPERM
+ }
+ if size < 0 || size > linux.UTSLen {
+ return 0, nil, syserror.EINVAL
+ }
+
+ name, err := t.CopyInString(nameAddr, int(size))
+ if err != nil {
+ return 0, nil, err
+ }
+
+ utsns.SetDomainName(name)
+ return 0, nil, nil
+}
+
+// Sethostname implements Linux syscall sethostname.
+func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nameAddr := args[0].Pointer()
+ size := args[1].Int()
+
+ utsns := t.UTSNamespace()
+ if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) {
+ return 0, nil, syserror.EPERM
+ }
+ if size < 0 || size > linux.UTSLen {
+ return 0, nil, syserror.EINVAL
+ }
+
+ name := make([]byte, size)
+ if _, err := t.CopyInBytes(nameAddr, name); err != nil {
+ return 0, nil, err
+ }
+
+ utsns.SetHostName(string(name))
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
new file mode 100644
index 000000000..6ec0de96e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -0,0 +1,364 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+const (
+ // EventMaskWrite contains events that can be triggered on writes.
+ //
+ // Note that EventHUp is not going to happen for pipes but may for
+ // implementations of poll on some sockets, see net/core/datagram.c.
+ EventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Write implements linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is writable.
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the source of the write.
+ src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := writev(t, file, src)
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+}
+
+// Pwrite64 implements linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+ offset := args[3].Int64()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is writing at an offset supported?
+ if !file.Flags().Pwrite {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Check that the file is writable.
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the source of the write.
+ src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := pwritev(t, file, src, offset)
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+}
+
+// Writev implements linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is writable.
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Read the iovecs that specify the source of the write.
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := writev(t, file, src)
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+}
+
+// Pwritev implements linux syscall pwritev(2).
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is writing at an offset supported?
+ if !file.Flags().Pwrite {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Check that the file is writable.
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Read the iovecs that specify the source of the write.
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := pwritev(t, file, src, offset)
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // While the syscall is
+ // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+ // the linux internal call
+ // (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1354)
+ // splits the offset argument into a high/low value for compatibility with
+ // 32-bit architectures. The flags argument is the 5th argument.
+
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+ flags := int(args[5].Int())
+
+ if int(args[4].Int())&0x4 == 1 {
+ return 0, nil, syserror.EACCES
+ }
+
+ file := t.GetFile(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < -1 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is writing at an offset supported?
+ if offset > -1 && !file.Flags().Pwrite {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+ // accepted as a valid flag argument for pwritev2.
+ if flags&^linux.RWF_VALID != 0 {
+ return uintptr(flags), nil, syserror.EOPNOTSUPP
+ }
+
+ // Check that the file is writeable.
+ if !file.Flags().Write {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Read the iovecs that specify the source of the write.
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // If pwritev2 is called with an offset of -1, writev is called.
+ if offset == -1 {
+ n, err := writev(t, file, src)
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+ }
+
+ n, err := pwritev(t, file, src, offset)
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) {
+ n, err := f.Writev(t, src)
+ if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+ if n > 0 {
+ // Queue notification if we wrote anything.
+ f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+ }
+ return n, err
+ }
+
+ // Sockets support write timeouts.
+ var haveDeadline bool
+ var deadline ktime.Time
+ if s, ok := f.FileOperations.(socket.Socket); ok {
+ dl := s.SendTimeout()
+ if dl < 0 && err == syserror.ErrWouldBlock {
+ return n, err
+ }
+ if dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ }
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ f.EventRegister(&w, EventMaskWrite)
+
+ total := n
+ for {
+ // Shorten src to reflect bytes previously written.
+ src = src.DropFirst64(n)
+
+ // Issue the request and break out if it completes with
+ // anything other than "would block".
+ n, err = f.Writev(t, src)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = syserror.ErrWouldBlock
+ }
+ break
+ }
+ }
+
+ f.EventUnregister(&w)
+
+ if total > 0 {
+ // Queue notification if we wrote anything.
+ f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+ }
+
+ return total, err
+}
+
+func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ n, err := f.Pwritev(t, src, offset)
+ if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+ if n > 0 {
+ // Queue notification if we wrote anything.
+ f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+ }
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ f.EventRegister(&w, EventMaskWrite)
+
+ total := n
+ for {
+ // Shorten src to reflect bytes previously written.
+ src = src.DropFirst64(n)
+
+ // Issue the request and break out if it completes with
+ // anything other than "would block".
+ n, err = f.Pwritev(t, src, offset+total)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.Block(ch); err != nil {
+ break
+ }
+ }
+
+ f.EventUnregister(&w)
+
+ if total > 0 {
+ // Queue notification if we wrote anything.
+ f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+ }
+
+ return total, err
+}
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
new file mode 100644
index 000000000..c24946160
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -0,0 +1,432 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// GetXattr implements linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getXattrFromPath(t, args, true)
+}
+
+// LGetXattr implements linux syscall lgetxattr(2).
+func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getXattrFromPath(t, args, false)
+}
+
+// FGetXattr implements linux syscall fgetxattr(2).
+func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := uint64(args[3].SizeT())
+
+ // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(n), nil, nil
+}
+
+func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := uint64(args[3].SizeT())
+
+ path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n := 0
+ err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ n, err = getXattr(t, d, nameAddr, valueAddr, size)
+ return err
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(n), nil, nil
+}
+
+// getXattr implements getxattr(2) from the given *fs.Dirent.
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64) (int, error) {
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return 0, err
+ }
+
+ if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+ return 0, err
+ }
+
+ // TODO(b/148380782): Support xattrs in namespaces other than "user".
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ // If getxattr(2) is called with size 0, the size of the value will be
+ // returned successfully even if it is nonzero. In that case, we need to
+ // retrieve the entire attribute value so we can return the correct size.
+ requestedSize := size
+ if size == 0 || size > linux.XATTR_SIZE_MAX {
+ requestedSize = linux.XATTR_SIZE_MAX
+ }
+
+ value, err := d.Inode.GetXattr(t, name, requestedSize)
+ if err != nil {
+ return 0, err
+ }
+ n := len(value)
+ if uint64(n) > requestedSize {
+ return 0, syserror.ERANGE
+ }
+
+ // Don't copy out the attribute value if size is 0.
+ if size == 0 {
+ return n, nil
+ }
+
+ if _, err = t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
+// SetXattr implements linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return setXattrFromPath(t, args, true)
+}
+
+// LSetXattr implements linux syscall lsetxattr(2).
+func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return setXattrFromPath(t, args, false)
+}
+
+// FSetXattr implements linux syscall fsetxattr(2).
+func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := uint64(args[3].SizeT())
+ flags := args[4].Uint()
+
+ // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
+}
+
+func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := uint64(args[3].SizeT())
+ flags := args[4].Uint()
+
+ path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags)
+ })
+}
+
+// setXattr implements setxattr(2) from the given *fs.Dirent.
+func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
+ if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+ return syserror.EINVAL
+ }
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return err
+ }
+
+ if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+ return err
+ }
+
+ if size > linux.XATTR_SIZE_MAX {
+ return syserror.E2BIG
+ }
+ buf := make([]byte, size)
+ if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+ return err
+ }
+ value := string(buf)
+
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+
+ if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil {
+ return err
+ }
+ d.InotifyEvent(linux.IN_ATTRIB, 0)
+ return nil
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+ name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+ if err != nil {
+ if err == syserror.ENAMETOOLONG {
+ return "", syserror.ERANGE
+ }
+ return "", err
+ }
+ if len(name) == 0 {
+ return "", syserror.ERANGE
+ }
+ return name, nil
+}
+
+// Restrict xattrs to regular files and directories.
+//
+// TODO(b/148380782): In Linux, this restriction technically only applies to
+// xattrs in the "user.*" namespace. Make file type checks specific to the
+// namespace once we allow other xattr prefixes.
+func xattrFileTypeOk(i *fs.Inode) bool {
+ return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr)
+}
+
+func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
+ // Restrict xattrs to regular files and directories.
+ if !xattrFileTypeOk(i) {
+ if perms.Write {
+ return syserror.EPERM
+ }
+ return syserror.ENODATA
+ }
+
+ return i.CheckPermission(t, perms)
+}
+
+// ListXattr implements linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return listXattrFromPath(t, args, true)
+}
+
+// LListXattr implements linux syscall llistxattr(2).
+func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return listXattrFromPath(t, args, false)
+}
+
+// FListXattr implements linux syscall flistxattr(2).
+func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ listAddr := args[1].Pointer()
+ size := uint64(args[2].SizeT())
+
+ // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ n, err := listXattr(t, f.Dirent, listAddr, size)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(n), nil, nil
+}
+
+func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ listAddr := args[1].Pointer()
+ size := uint64(args[2].SizeT())
+
+ path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n := 0
+ err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ n, err = listXattr(t, d, listAddr, size)
+ return err
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(n), nil, nil
+}
+
+func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) {
+ if !xattrFileTypeOk(d.Inode) {
+ return 0, nil
+ }
+
+ // If listxattr(2) is called with size 0, the buffer size needed to contain
+ // the xattr list will be returned successfully even if it is nonzero. In
+ // that case, we need to retrieve the entire list so we can compute and
+ // return the correct size.
+ requestedSize := size
+ if size == 0 || size > linux.XATTR_SIZE_MAX {
+ requestedSize = linux.XATTR_SIZE_MAX
+ }
+ xattrs, err := d.Inode.ListXattr(t, requestedSize)
+ if err != nil {
+ return 0, err
+ }
+
+ // TODO(b/148380782): support namespaces other than "user".
+ for x := range xattrs {
+ if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+ delete(xattrs, x)
+ }
+ }
+
+ listSize := xattrListSize(xattrs)
+ if listSize > linux.XATTR_SIZE_MAX {
+ return 0, syserror.E2BIG
+ }
+ if uint64(listSize) > requestedSize {
+ return 0, syserror.ERANGE
+ }
+
+ // Don't copy out the attributes if size is 0.
+ if size == 0 {
+ return listSize, nil
+ }
+
+ buf := make([]byte, 0, listSize)
+ for x := range xattrs {
+ buf = append(buf, []byte(x)...)
+ buf = append(buf, 0)
+ }
+ if _, err := t.CopyOutBytes(addr, buf); err != nil {
+ return 0, err
+ }
+
+ return len(buf), nil
+}
+
+func xattrListSize(xattrs map[string]struct{}) int {
+ size := 0
+ for x := range xattrs {
+ size += len(x) + 1
+ }
+ return size
+}
+
+// RemoveXattr implements linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return removeXattrFromPath(t, args, true)
+}
+
+// LRemoveXattr implements linux syscall lremovexattr(2).
+func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return removeXattrFromPath(t, args, false)
+}
+
+// FRemoveXattr implements linux syscall fremovexattr(2).
+func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+
+ // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+ f := t.GetFile(fd)
+ if f == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer f.DecRef()
+
+ return 0, nil, removeXattr(t, f.Dirent, nameAddr)
+}
+
+func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+
+ path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ return removeXattr(t, d, nameAddr)
+ })
+}
+
+// removeXattr implements removexattr(2) from the given *fs.Dirent.
+func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return err
+ }
+
+ if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+ return err
+ }
+
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+
+ if err := d.Inode.RemoveXattr(t, d, name); err != nil {
+ return err
+ }
+ d.InotifyEvent(linux.IN_ATTRIB, 0)
+ return nil
+}
+
+// LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go
new file mode 100644
index 000000000..ddc3ee26e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/timespec.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// copyTimespecIn copies a Timespec from the untrusted app range to the kernel.
+func copyTimespecIn(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) {
+ switch t.Arch().Width() {
+ case 8:
+ ts := linux.Timespec{}
+ in := t.CopyScratchBuffer(16)
+ _, err := t.CopyInBytes(addr, in)
+ if err != nil {
+ return ts, err
+ }
+ ts.Sec = int64(usermem.ByteOrder.Uint64(in[0:]))
+ ts.Nsec = int64(usermem.ByteOrder.Uint64(in[8:]))
+ return ts, nil
+ default:
+ return linux.Timespec{}, syserror.ENOSYS
+ }
+}
+
+// copyTimespecOut copies a Timespec to the untrusted app range.
+func copyTimespecOut(t *kernel.Task, addr usermem.Addr, ts *linux.Timespec) error {
+ switch t.Arch().Width() {
+ case 8:
+ out := t.CopyScratchBuffer(16)
+ usermem.ByteOrder.PutUint64(out[0:], uint64(ts.Sec))
+ usermem.ByteOrder.PutUint64(out[8:], uint64(ts.Nsec))
+ _, err := t.CopyOutBytes(addr, out)
+ return err
+ default:
+ return syserror.ENOSYS
+ }
+}
+
+// copyTimevalIn copies a Timeval from the untrusted app range to the kernel.
+func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) {
+ switch t.Arch().Width() {
+ case 8:
+ tv := linux.Timeval{}
+ in := t.CopyScratchBuffer(16)
+ _, err := t.CopyInBytes(addr, in)
+ if err != nil {
+ return tv, err
+ }
+ tv.Sec = int64(usermem.ByteOrder.Uint64(in[0:]))
+ tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:]))
+ return tv, nil
+ default:
+ return linux.Timeval{}, syserror.ENOSYS
+ }
+}
+
+// copyTimevalOut copies a Timeval to the untrusted app range.
+func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error {
+ switch t.Arch().Width() {
+ case 8:
+ out := t.CopyScratchBuffer(16)
+ usermem.ByteOrder.PutUint64(out[0:], uint64(tv.Sec))
+ usermem.ByteOrder.PutUint64(out[8:], uint64(tv.Usec))
+ _, err := t.CopyOutBytes(addr, out)
+ return err
+ default:
+ return syserror.ENOSYS
+ }
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// If the Timespec is larger than what can be represented in a Duration, the
+// returned value is the maximum that Duration will allow.
+//
+// If timespecAddr is NULL, the returned value is negative.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+ // Use a negative Duration to indicate "no timeout".
+ timeout := time.Duration(-1)
+ if timespecAddr != 0 {
+ timespec, err := copyTimespecIn(t, timespecAddr)
+ if err != nil {
+ return 0, err
+ }
+ if !timespec.Valid() {
+ return 0, syserror.EINVAL
+ }
+ timeout = time.Duration(timespec.ToNsecCapped())
+ }
+ return timeout, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
new file mode 100644
index 000000000..0c740335b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -0,0 +1,76 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "vfs2",
+ srcs = [
+ "aio.go",
+ "epoll.go",
+ "eventfd.go",
+ "execve.go",
+ "fd.go",
+ "filesystem.go",
+ "fscontext.go",
+ "getdents.go",
+ "inotify.go",
+ "ioctl.go",
+ "lock.go",
+ "memfd.go",
+ "mmap.go",
+ "mount.go",
+ "path.go",
+ "pipe.go",
+ "poll.go",
+ "read_write.go",
+ "setstat.go",
+ "signal.go",
+ "socket.go",
+ "splice.go",
+ "stat.go",
+ "stat_amd64.go",
+ "stat_arm64.go",
+ "sync.go",
+ "timerfd.go",
+ "vfs2.go",
+ "xattr.go",
+ ],
+ marshal = True,
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/bits",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/gohacks",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/fsbridge",
+ "//pkg/sentry/fsimpl/eventfd",
+ "//pkg/sentry/fsimpl/pipefs",
+ "//pkg/sentry/fsimpl/signalfd",
+ "//pkg/sentry/fsimpl/timerfd",
+ "//pkg/sentry/fsimpl/tmpfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/fasync",
+ "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/loader",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
+ "//pkg/sentry/socket",
+ "//pkg/sentry/socket/control",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/syscalls",
+ "//pkg/sentry/syscalls/linux",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserr",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
new file mode 100644
index 000000000..e5cdefc50
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -0,0 +1,216 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSubmit implements linux syscall io_submit(2).
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Uint64()
+ nrEvents := args[1].Int()
+ addr := args[2].Pointer()
+
+ if nrEvents < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ for i := int32(0); i < nrEvents; i++ {
+ // Copy in the address.
+ cbAddrNative := t.Arch().Native(0)
+ if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
+ if i > 0 {
+ // Some successful.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Copy in this callback.
+ var cb linux.IOCallback
+ cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
+ if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+ if i > 0 {
+ // Some have been successful.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Process this callback.
+ if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+ if i > 0 {
+ // Partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Advance to the next one.
+ addr += usermem.Addr(t.Arch().Width())
+ }
+
+ return uintptr(nrEvents), nil, nil
+}
+
+// submitCallback processes a single callback.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+ if cb.Reserved2 != 0 {
+ return syserror.EINVAL
+ }
+
+ fd := t.GetFileVFS2(cb.FD)
+ if fd == nil {
+ return syserror.EBADF
+ }
+ defer fd.DecRef()
+
+ // Was there an eventFD? Extract it.
+ var eventFD *vfs.FileDescription
+ if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+ eventFD = t.GetFileVFS2(cb.ResFD)
+ if eventFD == nil {
+ return syserror.EBADF
+ }
+ defer eventFD.DecRef()
+
+ // Check that it is an eventfd.
+ if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok {
+ return syserror.EINVAL
+ }
+ }
+
+ ioseq, err := memoryFor(t, cb)
+ if err != nil {
+ return err
+ }
+
+ // Check offset for reads/writes.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ if cb.Offset < 0 {
+ return syserror.EINVAL
+ }
+ }
+
+ // Prepare the request.
+ aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id)
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ready := aioCtx.Prepare(); !ready {
+ // Context is busy.
+ return syserror.EAGAIN
+ }
+
+ if eventFD != nil {
+ // The request is set. Make sure there's a ref on the file.
+ //
+ // This is necessary when the callback executes on completion,
+ // which is also what will release this reference.
+ eventFD.IncRef()
+ }
+
+ // Perform the request asynchronously.
+ fd.IncRef()
+ t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx))
+ return nil
+}
+
+func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
+ return func(ctx context.Context) {
+ if aioCtx.Dead() {
+ aioCtx.CancelPendingRequest()
+ return
+ }
+ ev := &linux.IOEvent{
+ Data: cb.Data,
+ Obj: uint64(cbAddr),
+ }
+
+ var err error
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+ ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
+ case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
+ err = fd.Sync(ctx)
+ }
+
+ // Update the result.
+ if err != nil {
+ err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
+ ev.Result = -int64(kernel.ExtractErrno(err, 0))
+ }
+
+ fd.DecRef()
+
+ // Queue the result for delivery.
+ aioCtx.FinishRequest(ev)
+
+ // Notify the event file if one was specified. This needs to happen
+ // *after* queueing the result to avoid racing with the thread we may
+ // wake up.
+ if eventFD != nil {
+ eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
+ eventFD.DecRef()
+ }
+ }
+}
+
+// memoryFor returns appropriate memory for the given callback.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+ bytes := int(cb.Bytes)
+ if bytes < 0 {
+ // Linux also requires that this field fit in ssize_t.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+
+ // Since this I/O will be asynchronous with respect to t's task goroutine,
+ // we have no guarantee that t's AddressSpace will be active during the
+ // I/O.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+ return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+ return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+ return usermem.IOSequence{}, nil
+
+ default:
+ // Not a supported command.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
new file mode 100644
index 000000000..34c90ae3e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -0,0 +1,228 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "math"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes()
+
+// EpollCreate1 implements Linux syscall epoll_create1(2).
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ if flags&^linux.EPOLL_CLOEXEC != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file, err := t.Kernel().VFS().NewEpollInstanceFD()
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements Linux syscall epoll_create(2).
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ size := args[0].Int()
+
+ // "Since Linux 2.6.8, the size argument is ignored, but must be greater
+ // than zero" - epoll_create(2)
+ if size <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file, err := t.Kernel().VFS().NewEpollInstanceFD()
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements Linux syscall epoll_ctl(2).
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ op := args[1].Int()
+ fd := args[2].Int()
+ eventAddr := args[3].Pointer()
+
+ epfile := t.GetFileVFS2(epfd)
+ if epfile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer epfile.DecRef()
+ ep, ok := epfile.Impl().(*vfs.EpollInstance)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+ if epfile == file {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var event linux.EpollEvent
+ switch op {
+ case linux.EPOLL_CTL_ADD:
+ if _, err := event.CopyIn(t, eventAddr); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, ep.AddInterest(file, fd, event)
+ case linux.EPOLL_CTL_DEL:
+ return 0, nil, ep.DeleteInterest(file, fd)
+ case linux.EPOLL_CTL_MOD:
+ if _, err := event.CopyIn(t, eventAddr); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, ep.ModifyInterest(file, fd, event)
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ eventsAddr := args[1].Pointer()
+ maxEvents := int(args[2].Int())
+ timeout := int(args[3].Int())
+
+ var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+ if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
+ return 0, nil, syserror.EINVAL
+ }
+
+ epfile := t.GetFileVFS2(epfd)
+ if epfile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer epfile.DecRef()
+ ep, ok := epfile.Impl().(*vfs.EpollInstance)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
+ // maxEvents), so that the buffer can be allocated on the stack.
+ var (
+ events [16]linux.EpollEvent
+ total int
+ ch chan struct{}
+ haveDeadline bool
+ deadline ktime.Time
+ )
+ for {
+ batchEvents := len(events)
+ if batchEvents > maxEvents {
+ batchEvents = maxEvents
+ }
+ n := ep.ReadEvents(events[:batchEvents])
+ maxEvents -= n
+ if n != 0 {
+ // Copy what we read out.
+ copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+ copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
+ eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
+ total += copiedEvents
+ if err != nil {
+ if total != 0 {
+ return uintptr(total), nil, nil
+ }
+ return 0, nil, err
+ }
+ // If we've filled the application's event buffer, we're done.
+ if maxEvents == 0 {
+ return uintptr(total), nil, nil
+ }
+ // Loop if we read a full batch, under the expectation that there
+ // may be more events to read.
+ if n == batchEvents {
+ continue
+ }
+ }
+ // We get here if n != batchEvents. If we read any number of events
+ // (just now, or in a previous iteration of this loop), or if timeout
+ // is 0 (such that epoll_wait should be non-blocking), return the
+ // events we've read so far to the application.
+ if total != 0 || timeout == 0 {
+ return uintptr(total), nil, nil
+ }
+ // In the first iteration of this loop, register with the epoll
+ // instance for readability events, but then immediately continue the
+ // loop since we need to retry ReadEvents() before blocking. In all
+ // subsequent iterations, block until events are available, the timeout
+ // expires, or an interrupt arrives.
+ if ch == nil {
+ var w waiter.Entry
+ w, ch = waiter.NewChannelEntry(nil)
+ epfile.EventRegister(&w, waiter.EventIn)
+ defer epfile.EventUnregister(&w)
+ } else {
+ // Set up the timer if a timeout was specified.
+ if timeout > 0 && !haveDeadline {
+ timeoutDur := time.Duration(timeout) * time.Millisecond
+ deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+ haveDeadline = true
+ }
+ if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = nil
+ }
+ // total must be 0 since otherwise we would have returned
+ // above.
+ return 0, nil, err
+ }
+ }
+ }
+}
+
+// EpollPwait implements Linux syscall epoll_pwait(2).
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ maskAddr := args[4].Pointer()
+ maskSize := uint(args[5].Uint())
+
+ if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+ return 0, nil, err
+ }
+
+ return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
new file mode 100644
index 000000000..aff1a2070
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Eventfd2 implements linux syscall eventfd2(2).
+func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ initVal := uint64(args[0].Uint())
+ flags := uint(args[1].Uint())
+ allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)
+
+ if flags & ^allOps != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ vfsObj := t.Kernel().VFS()
+ fileFlags := uint32(linux.O_RDWR)
+ if flags&linux.EFD_NONBLOCK != 0 {
+ fileFlags |= linux.O_NONBLOCK
+ }
+ semMode := flags&linux.EFD_SEMAPHORE != 0
+ eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer eventfd.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{
+ CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// Eventfd implements linux syscall eventfd(2).
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[1].Value = 0
+ return Eventfd2(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
new file mode 100644
index 000000000..aef0078a8
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/loader"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathnameAddr := args[0].Pointer()
+ argvAddr := args[1].Pointer()
+ envvAddr := args[2].Pointer()
+ return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathnameAddr := args[1].Pointer()
+ argvAddr := args[2].Pointer()
+ envvAddr := args[3].Pointer()
+ flags := args[4].Int()
+ return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+ if err != nil {
+ return 0, nil, err
+ }
+ var argv, envv []string
+ if argvAddr != 0 {
+ var err error
+ argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+ if envvAddr != 0 {
+ var err error
+ envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ root := t.FSContext().RootDirectoryVFS2()
+ defer root.DecRef()
+ var executable fsbridge.File
+ closeOnExec := false
+ if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
+ // We must open the executable ourselves since dirfd is used as the
+ // starting point while resolving path, but the task working directory
+ // is used as the starting point while resolving interpreters (Linux:
+ // fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
+ // do_open_execat(fd=AT_FDCWD)), and the loader package is currently
+ // incapable of handling this correctly.
+ if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+ return 0, nil, syserror.ENOENT
+ }
+ dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd)
+ if dirfile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ start := dirfile.VirtualDentry()
+ start.IncRef()
+ dirfile.DecRef()
+ closeOnExec = dirfileFlags.CloseOnExec
+ file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: path,
+ FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ FileExec: true,
+ })
+ start.DecRef()
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+ executable = fsbridge.NewVFSFile(file)
+ }
+
+ // Load the new TaskContext.
+ mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
+ defer mntns.DecRef()
+ wd := t.FSContext().WorkingDirectoryVFS2()
+ defer wd.DecRef()
+ remainingTraversals := uint(linux.MaxSymlinkTraversals)
+ loadArgs := loader.LoadArgs{
+ Opener: fsbridge.NewVFSLookup(mntns, root, wd),
+ RemainingTraversals: &remainingTraversals,
+ ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+ Filename: pathname,
+ File: executable,
+ CloseOnExec: closeOnExec,
+ Argv: argv,
+ Envv: envv,
+ Features: t.Arch().FeatureSet(),
+ }
+
+ tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+ if se != nil {
+ return 0, nil, se.ToError()
+ }
+
+ ctrl, err := t.Execve(tc)
+ return 0, ctrl, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
new file mode 100644
index 000000000..517394ba9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -0,0 +1,355 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Close implements Linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ // Note that Remove provides a reference on the file that we may use to
+ // flush. It is still active until we drop the final reference below
+ // (and other reference-holding operations complete).
+ _, file := t.FDTable().Remove(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ err := file.OnClose(t)
+ return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
+}
+
+// Dup implements Linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+ if err != nil {
+ return 0, nil, syserror.EMFILE
+ }
+ return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements Linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
+
+ if oldfd == newfd {
+ // As long as oldfd is valid, dup2() does nothing and returns newfd.
+ file := t.GetFileVFS2(oldfd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ file.DecRef()
+ return uintptr(newfd), nil, nil
+ }
+
+ return dup3(t, oldfd, newfd, 0)
+}
+
+// Dup3 implements Linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
+ flags := args[2].Uint()
+
+ if oldfd == newfd {
+ return 0, nil, syserror.EINVAL
+ }
+
+ return dup3(t, oldfd, newfd, flags)
+}
+
+func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
+ if flags&^linux.O_CLOEXEC != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(oldfd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ cmd := args[1].Int()
+
+ file, flags := t.FDTable().GetVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ switch cmd {
+ case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+ minfd := args[2].Int()
+ fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{
+ CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+ case linux.F_GETFD:
+ return uintptr(flags.ToLinuxFDFlags()), nil, nil
+ case linux.F_SETFD:
+ flags := args[2].Uint()
+ err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+ })
+ return 0, nil, err
+ case linux.F_GETFL:
+ return uintptr(file.StatusFlags()), nil, nil
+ case linux.F_SETFL:
+ return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+ case linux.F_SETPIPE_SZ:
+ pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+ if !ok {
+ return 0, nil, syserror.EBADF
+ }
+ n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+ case linux.F_GETOWN:
+ owner, hasOwner := getAsyncOwner(t, file)
+ if !hasOwner {
+ return 0, nil, nil
+ }
+ if owner.Type == linux.F_OWNER_PGRP {
+ return uintptr(-owner.PID), nil, nil
+ }
+ return uintptr(owner.PID), nil, nil
+ case linux.F_SETOWN:
+ who := args[2].Int()
+ ownerType := int32(linux.F_OWNER_PID)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return 0, nil, syserror.EINVAL
+ }
+ ownerType = linux.F_OWNER_PGRP
+ who = -who
+ }
+ return 0, nil, setAsyncOwner(t, file, ownerType, who)
+ case linux.F_GETOWN_EX:
+ owner, hasOwner := getAsyncOwner(t, file)
+ if !hasOwner {
+ return 0, nil, nil
+ }
+ _, err := t.CopyOut(args[2].Pointer(), &owner)
+ return 0, nil, err
+ case linux.F_SETOWN_EX:
+ var owner linux.FOwnerEx
+ n, err := t.CopyIn(args[2].Pointer(), &owner)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, setAsyncOwner(t, file, owner.Type, owner.PID)
+ case linux.F_GETPIPE_SZ:
+ pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+ if !ok {
+ return 0, nil, syserror.EBADF
+ }
+ return uintptr(pipefile.PipeSize()), nil, nil
+ case linux.F_GET_SEALS:
+ val, err := tmpfs.GetSeals(file)
+ return uintptr(val), nil, err
+ case linux.F_ADD_SEALS:
+ if !file.IsWritable() {
+ return 0, nil, syserror.EPERM
+ }
+ err := tmpfs.AddSeals(file, args[2].Uint())
+ return 0, nil, err
+ case linux.F_SETLK, linux.F_SETLKW:
+ return 0, nil, posixLock(t, args, file, cmd)
+ default:
+ // TODO(gvisor.dev/issue/2920): Everything else is not yet supported.
+ return 0, nil, syserror.EINVAL
+ }
+}
+
+func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
+ a := fd.AsyncHandler()
+ if a == nil {
+ return linux.FOwnerEx{}, false
+ }
+
+ ot, otg, opg := a.(*fasync.FileAsync).Owner()
+ switch {
+ case ot != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_TID,
+ PID: int32(t.PIDNamespace().IDOfTask(ot)),
+ }, true
+ case otg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PID,
+ PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+ }, true
+ case opg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PGRP,
+ PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+ }, true
+ default:
+ return linux.FOwnerEx{}, true
+ }
+}
+
+func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error {
+ switch ownerType {
+ case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP:
+ // Acceptable type.
+ default:
+ return syserror.EINVAL
+ }
+
+ a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync)
+ if pid == 0 {
+ a.ClearOwner()
+ return nil
+ }
+
+ switch ownerType {
+ case linux.F_OWNER_TID:
+ task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid))
+ if task == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerTask(t, task)
+ return nil
+ case linux.F_OWNER_PID:
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid))
+ if tg == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerThreadGroup(t, tg)
+ return nil
+ case linux.F_OWNER_PGRP:
+ pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid))
+ if pg == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerProcessGroup(t, pg)
+ return nil
+ default:
+ return syserror.EINVAL
+ }
+}
+
+func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error {
+ // Copy in the lock request.
+ flockAddr := args[2].Pointer()
+ var flock linux.Flock
+ if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+ return err
+ }
+
+ var blocker lock.Blocker
+ if cmd == linux.F_SETLKW {
+ blocker = t
+ }
+
+ switch flock.Type {
+ case linux.F_RDLCK:
+ if !file.IsReadable() {
+ return syserror.EBADF
+ }
+ return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker)
+
+ case linux.F_WRLCK:
+ if !file.IsWritable() {
+ return syserror.EBADF
+ }
+ return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker)
+
+ case linux.F_UNLCK:
+ return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence)
+
+ default:
+ return syserror.EINVAL
+ }
+}
+
+// Fadvise64 implements fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ length := args[2].Int64()
+ advice := args[3].Int()
+
+ // Note: offset is allowed to be negative.
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // If the FD refers to a pipe or FIFO, return error.
+ if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ switch advice {
+ case linux.POSIX_FADV_NORMAL:
+ case linux.POSIX_FADV_RANDOM:
+ case linux.POSIX_FADV_SEQUENTIAL:
+ case linux.POSIX_FADV_WILLNEED:
+ case linux.POSIX_FADV_DONTNEED:
+ case linux.POSIX_FADV_NOREUSE:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Sure, whatever.
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
new file mode 100644
index 000000000..6b14c2bef
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -0,0 +1,384 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Link implements Linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldpathAddr := args[0].Pointer()
+ newpathAddr := args[1].Pointer()
+ return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Linkat implements Linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ olddirfd := args[0].Int()
+ oldpathAddr := args[1].Pointer()
+ newdirfd := args[2].Int()
+ newpathAddr := args[3].Pointer()
+ flags := args[4].Int()
+ return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
+ return syserror.EINVAL
+ }
+ if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+ return syserror.ENOENT
+ }
+
+ oldpath, err := copyInPath(t, oldpathAddr)
+ if err != nil {
+ return err
+ }
+ oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
+ if err != nil {
+ return err
+ }
+ defer oldtpop.Release()
+
+ newpath, err := copyInPath(t, newpathAddr)
+ if err != nil {
+ return err
+ }
+ newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer newtpop.Release()
+
+ return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
+}
+
+// Mkdir implements Linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := args[1].ModeT()
+ return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements Linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ addr := args[1].Pointer()
+ mode := args[2].ModeT()
+ return 0, nil, mkdirat(t, dirfd, addr, mode)
+}
+
+func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+ return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
+ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
+ })
+}
+
+// Mknod implements Linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := args[1].ModeT()
+ dev := args[2].Uint()
+ return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
+}
+
+// Mknodat implements Linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ addr := args[1].Pointer()
+ mode := args[2].ModeT()
+ dev := args[3].Uint()
+ return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
+}
+
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error {
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+
+ // "Zero file type is equivalent to type S_IFREG." - mknod(2)
+ if mode.FileType() == 0 {
+ mode |= linux.ModeRegular
+ }
+ major, minor := linux.DecodeDeviceID(dev)
+ return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
+ Mode: mode &^ linux.FileMode(t.FSContext().Umask()),
+ DevMajor: uint32(major),
+ DevMinor: minor,
+ })
+}
+
+// Open implements Linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Uint()
+ mode := args[2].ModeT()
+ return openat(t, linux.AT_FDCWD, addr, flags, mode)
+}
+
+// Openat implements Linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ addr := args[1].Pointer()
+ flags := args[2].Uint()
+ mode := args[3].ModeT()
+ return openat(t, dirfd, addr, flags, mode)
+}
+
+// Creat implements Linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := args[1].ModeT()
+ return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
+}
+
+func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
+ Flags: flags | linux.O_LARGEFILE,
+ Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
+ return uintptr(fd), nil, err
+}
+
+// Rename implements Linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ oldpathAddr := args[0].Pointer()
+ newpathAddr := args[1].Pointer()
+ return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Renameat implements Linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ olddirfd := args[0].Int()
+ oldpathAddr := args[1].Pointer()
+ newdirfd := args[2].Int()
+ newpathAddr := args[3].Pointer()
+ return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
+}
+
+// Renameat2 implements Linux syscall renameat2(2).
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ olddirfd := args[0].Int()
+ oldpathAddr := args[1].Pointer()
+ newdirfd := args[2].Int()
+ newpathAddr := args[3].Pointer()
+ flags := args[4].Uint()
+ return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
+ oldpath, err := copyInPath(t, oldpathAddr)
+ if err != nil {
+ return err
+ }
+ // "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
+ oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer oldtpop.Release()
+
+ newpath, err := copyInPath(t, newpathAddr)
+ if err != nil {
+ return err
+ }
+ newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer newtpop.Release()
+
+ return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
+ Flags: flags,
+ })
+}
+
+// Fallocate implements linux system call fallocate(2).
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ mode := args[1].Uint64()
+ offset := args[2].Int64()
+ length := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ if !file.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ if mode != 0 {
+ return 0, nil, syserror.ENOTSUP
+ }
+
+ if offset < 0 || length <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ size := offset + length
+
+ if size < 0 {
+ return 0, nil, syserror.EFBIG
+ }
+
+ limit := limits.FromContext(t).Get(limits.FileSize).Cur
+
+ if uint64(size) >= limit {
+ t.SendSignal(&arch.SignalInfo{
+ Signo: int32(linux.SIGXFSZ),
+ Code: arch.SignalInfoUser,
+ })
+ return 0, nil, syserror.EFBIG
+ }
+
+ return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length))
+
+ // File length modified, generate notification.
+ // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported.
+ // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+}
+
+// Rmdir implements Linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+ return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlink implements Linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+ return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlinkat implements Linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if flags&^linux.AT_REMOVEDIR != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if flags&linux.AT_REMOVEDIR != 0 {
+ return 0, nil, rmdirat(t, dirfd, pathAddr)
+ }
+ return 0, nil, unlinkat(t, dirfd, pathAddr)
+}
+
+// Symlink implements Linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ targetAddr := args[0].Pointer()
+ linkpathAddr := args[1].Pointer()
+ return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
+}
+
+// Symlinkat implements Linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ targetAddr := args[0].Pointer()
+ newdirfd := args[1].Int()
+ linkpathAddr := args[2].Pointer()
+ return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
+}
+
+func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
+ target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
+ if err != nil {
+ return err
+ }
+ if len(target) == 0 {
+ return syserror.ENOENT
+ }
+ linkpath, err := copyInPath(t, linkpathAddr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+ return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
new file mode 100644
index 000000000..317409a18
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getcwd implements Linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ size := args[1].SizeT()
+
+ root := t.FSContext().RootDirectoryVFS2()
+ wd := t.FSContext().WorkingDirectoryVFS2()
+ s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
+ root.DecRef()
+ wd.DecRef()
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Note this is >= because we need a terminator.
+ if uint(len(s)) >= size {
+ return 0, nil, syserror.ERANGE
+ }
+
+ // Construct a byte slice containing a NUL terminator.
+ buf := t.CopyScratchBuffer(len(s) + 1)
+ copy(buf, s)
+ buf[len(buf)-1] = 0
+
+ // Write the pathname slice.
+ n, err := t.CopyOutBytes(addr, buf)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Chdir implements Linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ t.FSContext().SetWorkingDirectoryVFS2(vd)
+ vd.DecRef()
+ return 0, nil, nil
+}
+
+// Fchdir implements Linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ t.FSContext().SetWorkingDirectoryVFS2(vd)
+ vd.DecRef()
+ return 0, nil, nil
+}
+
+// Chroot implements Linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+
+ if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+ return 0, nil, syserror.EPERM
+ }
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ t.FSContext().SetRootDirectoryVFS2(vd)
+ vd.DecRef()
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
new file mode 100644
index 000000000..c7c7bf7ce
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -0,0 +1,161 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Getdents implements Linux syscall getdents(2).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getdents(t, args, false /* isGetdents64 */)
+}
+
+// Getdents64 implements Linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getdents(t, args, true /* isGetdents64 */)
+}
+
+func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := int(args[2].Uint())
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ cb := getGetdentsCallback(t, addr, size, isGetdents64)
+ err := file.IterDirents(t, cb)
+ n := size - cb.remaining
+ putGetdentsCallback(cb)
+ if n == 0 {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+type getdentsCallback struct {
+ t *kernel.Task
+ addr usermem.Addr
+ remaining int
+ isGetdents64 bool
+}
+
+var getdentsCallbackPool = sync.Pool{
+ New: func() interface{} {
+ return &getdentsCallback{}
+ },
+}
+
+func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
+ cb := getdentsCallbackPool.Get().(*getdentsCallback)
+ *cb = getdentsCallback{
+ t: t,
+ addr: addr,
+ remaining: size,
+ isGetdents64: isGetdents64,
+ }
+ return cb
+}
+
+func putGetdentsCallback(cb *getdentsCallback) {
+ cb.t = nil
+ getdentsCallbackPool.Put(cb)
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
+ var buf []byte
+ if cb.isGetdents64 {
+ // struct linux_dirent64 {
+ // ino64_t d_ino; /* 64-bit inode number */
+ // off64_t d_off; /* 64-bit offset to next structure */
+ // unsigned short d_reclen; /* Size of this dirent */
+ // unsigned char d_type; /* File type */
+ // char d_name[]; /* Filename (null-terminated) */
+ // };
+ size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+ size = (size + 7) &^ 7 // round up to multiple of 8
+ if size > cb.remaining {
+ return syserror.EINVAL
+ }
+ buf = cb.t.CopyScratchBuffer(size)
+ usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+ usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+ usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+ buf[18] = dirent.Type
+ copy(buf[19:], dirent.Name)
+ // Zero out all remaining bytes in buf, including the NUL terminator
+ // after dirent.Name.
+ bufTail := buf[19+len(dirent.Name):]
+ for i := range bufTail {
+ bufTail[i] = 0
+ }
+ } else {
+ // struct linux_dirent {
+ // unsigned long d_ino; /* Inode number */
+ // unsigned long d_off; /* Offset to next linux_dirent */
+ // unsigned short d_reclen; /* Length of this linux_dirent */
+ // char d_name[]; /* Filename (null-terminated) */
+ // /* length is actually (d_reclen - 2 -
+ // offsetof(struct linux_dirent, d_name)) */
+ // /*
+ // char pad; // Zero padding byte
+ // char d_type; // File type (only since Linux
+ // // 2.6.4); offset is (d_reclen - 1)
+ // */
+ // };
+ if cb.t.Arch().Width() != 8 {
+ panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
+ }
+ size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+ size = (size + 7) &^ 7 // round up to multiple of sizeof(long)
+ if size > cb.remaining {
+ return syserror.EINVAL
+ }
+ buf = cb.t.CopyScratchBuffer(size)
+ usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+ usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+ usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+ copy(buf[18:], dirent.Name)
+ // Zero out all remaining bytes in buf, including the NUL terminator
+ // after dirent.Name and the zero padding byte between the name and
+ // dirent type.
+ bufTail := buf[18+len(dirent.Name) : size-1]
+ for i := range bufTail {
+ bufTail[i] = 0
+ }
+ buf[size-1] = dirent.Type
+ }
+ n, err := cb.t.CopyOutBytes(cb.addr, buf)
+ if err != nil {
+ // Don't report partially-written dirents by advancing cb.addr or
+ // cb.remaining.
+ return err
+ }
+ cb.addr += usermem.Addr(n)
+ cb.remaining -= n
+ return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..5d98134a5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ if flags&^allFlags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+ if err != nil {
+ return 0, nil, err
+ }
+ defer ino.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
+ CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+ })
+
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[0].Value = 0
+ return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+ f := t.GetFileVFS2(fd)
+ if f == nil {
+ // Invalid fd.
+ return nil, nil, syserror.EBADF
+ }
+
+ ino, ok := f.Impl().(*vfs.Inotify)
+ if !ok {
+ // Not an inotify fd.
+ f.DecRef()
+ return nil, nil, syserror.EINVAL
+ }
+
+ return ino, f, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ mask := args[2].Uint()
+
+ // "EINVAL: The given event mask contains no valid events."
+ // -- inotify_add_watch(2)
+ if mask&linux.ALL_INOTIFY_BITS == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+ // -- inotify(7)
+ follow := followFinalSymlink
+ if mask&linux.IN_DONT_FOLLOW == 0 {
+ follow = nofollowFinalSymlink
+ }
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef()
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if mask&linux.IN_ONLYDIR != 0 {
+ path.Dir = true
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+ d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ return 0, nil, err
+ }
+ defer d.DecRef()
+
+ fd, err = ino.AddWatch(d.Dentry(), mask)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ wd := args[1].Int()
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef()
+ return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
new file mode 100644
index 000000000..fd6ab94b2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -0,0 +1,107 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Ioctl implements Linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Handle ioctls that apply to all FDs.
+ switch args[1].Int() {
+ case linux.FIONCLEX:
+ t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ CloseOnExec: false,
+ })
+ return 0, nil, nil
+
+ case linux.FIOCLEX:
+ t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ CloseOnExec: true,
+ })
+ return 0, nil, nil
+
+ case linux.FIONBIO:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.StatusFlags()
+ if set != 0 {
+ flags |= linux.O_NONBLOCK
+ } else {
+ flags &^= linux.O_NONBLOCK
+ }
+ return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
+
+ case linux.FIOASYNC:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.StatusFlags()
+ if set != 0 {
+ flags |= linux.O_ASYNC
+ } else {
+ flags &^= linux.O_ASYNC
+ }
+ file.SetStatusFlags(t, t.Credentials(), flags)
+ return 0, nil, nil
+
+ case linux.FIOGETOWN, linux.SIOCGPGRP:
+ var who int32
+ owner, hasOwner := getAsyncOwner(t, file)
+ if hasOwner {
+ if owner.Type == linux.F_OWNER_PGRP {
+ who = -owner.PID
+ } else {
+ who = owner.PID
+ }
+ }
+ _, err := t.CopyOut(args[2].Pointer(), &who)
+ return 0, nil, err
+
+ case linux.FIOSETOWN, linux.SIOCSPGRP:
+ var who int32
+ if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil {
+ return 0, nil, err
+ }
+ ownerType := int32(linux.F_OWNER_PID)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return 0, nil, syserror.EINVAL
+ }
+ ownerType = linux.F_OWNER_PGRP
+ who = -who
+ }
+ return 0, nil, setAsyncOwner(t, file, ownerType, who)
+ }
+
+ ret, err := file.Ioctl(t, t.MemoryManager(), args)
+ return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go
new file mode 100644
index 000000000..bf19028c4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/lock.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Flock implements linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ operation := args[1].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ // flock(2): EBADF fd is not an open file descriptor.
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ nonblocking := operation&linux.LOCK_NB != 0
+ operation &^= linux.LOCK_NB
+
+ var blocker lock.Blocker
+ if !nonblocking {
+ blocker = t
+ }
+
+ switch operation {
+ case linux.LOCK_EX:
+ if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil {
+ return 0, nil, err
+ }
+ case linux.LOCK_SH:
+ if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil {
+ return 0, nil, err
+ }
+ case linux.LOCK_UN:
+ if err := file.UnlockBSD(t); err != nil {
+ return 0, nil, err
+ }
+ default:
+ // flock(2): EINVAL operation is invalid.
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
new file mode 100644
index 000000000..bbe248d17
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ memfdPrefix = "memfd:"
+ memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
+ memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Uint()
+
+ if flags&^memfdAllFlags != 0 {
+ // Unknown bits in flags.
+ return 0, nil, syserror.EINVAL
+ }
+
+ allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+ cloExec := flags&linux.MFD_CLOEXEC != 0
+
+ name, err := t.CopyInString(addr, memfdMaxNameLen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ shmMount := t.Kernel().ShmMount()
+ file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: cloExec,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
new file mode 100644
index 000000000..60a43f0a0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mmap implements Linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ prot := args[2].Int()
+ flags := args[3].Int()
+ fd := args[4].Int()
+ fixed := flags&linux.MAP_FIXED != 0
+ private := flags&linux.MAP_PRIVATE != 0
+ shared := flags&linux.MAP_SHARED != 0
+ anon := flags&linux.MAP_ANONYMOUS != 0
+ map32bit := flags&linux.MAP_32BIT != 0
+
+ // Require exactly one of MAP_PRIVATE and MAP_SHARED.
+ if private == shared {
+ return 0, nil, syserror.EINVAL
+ }
+
+ opts := memmap.MMapOpts{
+ Length: args[1].Uint64(),
+ Offset: args[5].Uint64(),
+ Addr: args[0].Pointer(),
+ Fixed: fixed,
+ Unmap: fixed,
+ Map32Bit: map32bit,
+ Private: private,
+ Perms: usermem.AccessType{
+ Read: linux.PROT_READ&prot != 0,
+ Write: linux.PROT_WRITE&prot != 0,
+ Execute: linux.PROT_EXEC&prot != 0,
+ },
+ MaxPerms: usermem.AnyAccess,
+ GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+ Precommit: linux.MAP_POPULATE&flags != 0,
+ }
+ if linux.MAP_LOCKED&flags != 0 {
+ opts.MLockMode = memmap.MLockEager
+ }
+ defer func() {
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef()
+ }
+ }()
+
+ if !anon {
+ // Convert the passed FD to a file reference.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // mmap unconditionally requires that the FD is readable.
+ if !file.IsReadable() {
+ return 0, nil, syserror.EACCES
+ }
+ // MAP_SHARED requires that the FD be writable for PROT_WRITE.
+ if shared && !file.IsWritable() {
+ opts.MaxPerms.Write = false
+ }
+
+ if err := file.ConfigureMMap(t, &opts); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ rv, err := t.MemoryManager().MMap(t, opts)
+ return uintptr(rv), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..adeaa39cc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,145 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sourceAddr := args[0].Pointer()
+ targetAddr := args[1].Pointer()
+ typeAddr := args[2].Pointer()
+ flags := args[3].Uint64()
+ dataAddr := args[4].Pointer()
+
+ // For null-terminated strings related to mount(2), Linux copies in at most
+ // a page worth of data. See fs/namespace.c:copy_mount_string().
+ fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ targetPath, err := copyInPath(t, targetAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ data := ""
+ if dataAddr != 0 {
+ // In Linux, a full page is always copied in regardless of null
+ // character placement, and the address is passed to each file system.
+ // Most file systems always treat this data as a string, though, and so
+ // do all of the ones we implement.
+ data, err = t.CopyInString(dataAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ // Ignore magic value that was required before Linux 2.4.
+ if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+ flags = flags &^ linux.MS_MGC_MSK
+ }
+
+ // Must have CAP_SYS_ADMIN in the current mount namespace's associated user
+ // namespace.
+ creds := t.Credentials()
+ if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+ linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+ linux.MS_UNBINDABLE | linux.MS_MOVE
+
+ // Silently allow MS_NOSUID, since we don't implement set-id bits
+ // anyway.
+ const unsupportedFlags = linux.MS_NODEV |
+ linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+ // Linux just allows passing any flags to mount(2) - it won't fail when
+ // unknown or unsupported flags are passed. Since we don't implement
+ // everything, we fail explicitly on flags that are unimplemented.
+ if flags&(unsupportedOps|unsupportedFlags) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var opts vfs.MountOptions
+ if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+ opts.Flags.NoATime = true
+ }
+ if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+ opts.Flags.NoExec = true
+ }
+ if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+ opts.ReadOnly = true
+ }
+ opts.GetFilesystemOptions.Data = data
+
+ target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer target.Release()
+
+ return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Int()
+
+ // Must have CAP_SYS_ADMIN in the mount namespace's associated user
+ // namespace.
+ //
+ // Currently, this is always the init task's user namespace.
+ creds := t.Credentials()
+ if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+ if flags&unsupported != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ opts := vfs.UmountOptions{
+ Flags: uint32(flags),
+ }
+
+ return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
new file mode 100644
index 000000000..97da6c647
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
+ pathname, err := t.CopyInString(addr, linux.PATH_MAX)
+ if err != nil {
+ return fspath.Path{}, err
+ }
+ return fspath.Parse(pathname), nil
+}
+
+type taskPathOperation struct {
+ pop vfs.PathOperation
+ haveStartRef bool
+}
+
+func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
+ root := t.FSContext().RootDirectoryVFS2()
+ start := root
+ haveStartRef := false
+ if !path.Absolute {
+ if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+ root.DecRef()
+ return taskPathOperation{}, syserror.ENOENT
+ }
+ if dirfd == linux.AT_FDCWD {
+ start = t.FSContext().WorkingDirectoryVFS2()
+ haveStartRef = true
+ } else {
+ dirfile := t.GetFileVFS2(dirfd)
+ if dirfile == nil {
+ root.DecRef()
+ return taskPathOperation{}, syserror.EBADF
+ }
+ start = dirfile.VirtualDentry()
+ start.IncRef()
+ haveStartRef = true
+ dirfile.DecRef()
+ }
+ }
+ return taskPathOperation{
+ pop: vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: path,
+ FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+ },
+ haveStartRef: haveStartRef,
+ }, nil
+}
+
+func (tpop *taskPathOperation) Release() {
+ tpop.pop.Root.DecRef()
+ if tpop.haveStartRef {
+ tpop.pop.Start.DecRef()
+ tpop.haveStartRef = false
+ }
+}
+
+type shouldAllowEmptyPath bool
+
+const (
+ disallowEmptyPath shouldAllowEmptyPath = false
+ allowEmptyPath shouldAllowEmptyPath = true
+)
+
+type shouldFollowFinalSymlink bool
+
+const (
+ nofollowFinalSymlink shouldFollowFinalSymlink = false
+ followFinalSymlink shouldFollowFinalSymlink = true
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
new file mode 100644
index 000000000..4a01e4209
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Pipe implements Linux syscall pipe(2).
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ return 0, nil, pipe2(t, addr, 0)
+}
+
+// Pipe2 implements Linux syscall pipe2(2).
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Int()
+ return 0, nil, pipe2(t, addr, flags)
+}
+
+func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
+ if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+ return syserror.EINVAL
+ }
+ r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
+ defer r.DecRef()
+ defer w.DecRef()
+
+ fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
+ if err != nil {
+ return err
+ }
+ if _, err := t.CopyOut(addr, fds); err != nil {
+ for _, fd := range fds {
+ if _, file := t.FDTable().Remove(fd); file != nil {
+ file.DecRef()
+ }
+ }
+ return err
+ }
+ return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
new file mode 100644
index 000000000..ff1b25d7b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -0,0 +1,586 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "fmt"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. This has no
+// equivalent in Linux; it exists in gVisor since allocation failure in Go is
+// unrecoverable.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+const (
+ // selectReadEvents is analogous to the Linux kernel's
+ // fs/select.c:POLLIN_SET.
+ selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+ // selectWriteEvents is analogous to the Linux kernel's
+ // fs/select.c:POLLOUT_SET.
+ selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+ // selectExceptEvents is analogous to the Linux kernel's
+ // fs/select.c:POLLEX_SET.
+ selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file description and waiter of a PollFD.
+type pollState struct {
+ file *vfs.FileDescription
+ waiter waiter.Entry
+}
+
+// initReadiness gets the current ready mask for the file represented by the FD
+// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
+// used to register with the file for event notifications, and a reference to
+// the file is stored in "state".
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+ if pfd.FD < 0 {
+ pfd.REvents = 0
+ return
+ }
+
+ file := t.GetFileVFS2(pfd.FD)
+ if file == nil {
+ pfd.REvents = linux.POLLNVAL
+ return
+ }
+
+ if ch == nil {
+ defer file.DecRef()
+ } else {
+ state.file = file
+ state.waiter, _ = waiter.NewChannelEntry(ch)
+ file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
+ }
+
+ r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
+ pfd.REvents = int16(r.ToLinux()) & pfd.Events
+}
+
+// releaseState releases all the pollState in "state".
+func releaseState(state []pollState) {
+ for i := range state {
+ if state[i].file != nil {
+ state[i].file.EventUnregister(&state[i].waiter)
+ state[i].file.DecRef()
+ }
+ }
+}
+
+// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
+// when "timeout" is greater than zero.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
+// positive if interrupted by a signal.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+ var ch chan struct{}
+ if timeout != 0 {
+ ch = make(chan struct{}, 1)
+ }
+
+ // Register for event notification in the files involved if we may
+ // block (timeout not zero). Once we find a file that has a non-zero
+ // result, we stop registering for events but still go through all files
+ // to get their ready masks.
+ state := make([]pollState, len(pfd))
+ defer releaseState(state)
+ n := uintptr(0)
+ for i := range pfd {
+ initReadiness(t, &pfd[i], &state[i], ch)
+ if pfd[i].REvents != 0 {
+ n++
+ ch = nil
+ }
+ }
+
+ if timeout == 0 {
+ return timeout, n, nil
+ }
+
+ haveTimeout := timeout >= 0
+
+ for n == 0 {
+ var err error
+ // Wait for a notification.
+ timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
+ if err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = nil
+ }
+ return timeout, 0, err
+ }
+
+ // We got notified, count how many files are ready. If none,
+ // then this was a spurious notification, and we just go back
+ // to sleep with the remaining timeout.
+ for i := range state {
+ if state[i].file == nil {
+ continue
+ }
+
+ r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+ rl := int16(r.ToLinux()) & pfd[i].Events
+ if rl != 0 {
+ pfd[i].REvents = rl
+ n++
+ }
+ }
+ }
+
+ return timeout, n, nil
+}
+
+// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
+func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+ if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+ return nil, syserror.EINVAL
+ }
+
+ pfd := make([]linux.PollFD, nfds)
+ if nfds > 0 {
+ if _, err := t.CopyIn(addr, &pfd); err != nil {
+ return nil, err
+ }
+ }
+
+ return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+ pfd, err := copyInPollFDs(t, addr, nfds)
+ if err != nil {
+ return timeout, 0, err
+ }
+
+ // Compatibility warning: Linux adds POLLHUP and POLLERR just before
+ // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+ // polling, changing event masks here is an application-visible difference.
+ // (Linux also doesn't copy out event masks at all, only revents.)
+ for i := range pfd {
+ pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+ }
+ remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+ err = syserror.ConvertIntr(err, syserror.EINTR)
+
+ // The poll entries are copied out regardless of whether
+ // any are set or not. This aligns with the Linux behavior.
+ if nfds > 0 && err == nil {
+ if _, err := t.CopyOut(addr, pfd); err != nil {
+ return remainingTimeout, 0, err
+ }
+ }
+
+ return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+ set := make([]byte, nBytes)
+
+ if addr != 0 {
+ if _, err := t.CopyIn(addr, &set); err != nil {
+ return nil, err
+ }
+ // If we only use part of the last byte, mask out the extraneous bits.
+ //
+ // N.B. This only works on little-endian architectures.
+ if nBitsInLastPartialByte != 0 {
+ set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+ }
+ }
+ return set, nil
+}
+
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+ if nfds < 0 || nfds > fileCap {
+ return 0, syserror.EINVAL
+ }
+
+ // Calculate the size of the fd sets (one bit per fd).
+ nBytes := (nfds + 7) / 8
+ nBitsInLastPartialByte := nfds % 8
+
+ // Capture all the provided input vectors.
+ r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
+ }
+ w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
+ }
+ e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
+ }
+
+ // Count how many FDs are actually being requested so that we can build
+ // a PollFD array.
+ fdCount := 0
+ for i := 0; i < nBytes; i++ {
+ v := r[i] | w[i] | e[i]
+ for v != 0 {
+ v &= (v - 1)
+ fdCount++
+ }
+ }
+
+ // Build the PollFD array.
+ pfd := make([]linux.PollFD, 0, fdCount)
+ var fd int32
+ for i := 0; i < nBytes; i++ {
+ rV, wV, eV := r[i], w[i], e[i]
+ v := rV | wV | eV
+ m := byte(1)
+ for j := 0; j < 8; j++ {
+ if (v & m) != 0 {
+ // Make sure the fd is valid and decrement the reference
+ // immediately to ensure we don't leak. Note, another thread
+ // might be about to close fd. This is racy, but that's
+ // OK. Linux is racy in the same way.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ file.DecRef()
+
+ var mask int16
+ if (rV & m) != 0 {
+ mask |= selectReadEvents
+ }
+
+ if (wV & m) != 0 {
+ mask |= selectWriteEvents
+ }
+
+ if (eV & m) != 0 {
+ mask |= selectExceptEvents
+ }
+
+ pfd = append(pfd, linux.PollFD{
+ FD: fd,
+ Events: mask,
+ })
+ }
+
+ fd++
+ m <<= 1
+ }
+ }
+
+ // Do the syscall, then count the number of bits set.
+ if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+ return 0, syserror.ConvertIntr(err, syserror.EINTR)
+ }
+
+ // r, w, and e are currently event mask bitsets; unset bits corresponding
+ // to events that *didn't* occur.
+ bitSetCount := uintptr(0)
+ for idx := range pfd {
+ events := pfd[idx].REvents
+ i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+ m := byte(1) << j
+ if r[i]&m != 0 {
+ if (events & selectReadEvents) != 0 {
+ bitSetCount++
+ } else {
+ r[i] &^= m
+ }
+ }
+ if w[i]&m != 0 {
+ if (events & selectWriteEvents) != 0 {
+ bitSetCount++
+ } else {
+ w[i] &^= m
+ }
+ }
+ if e[i]&m != 0 {
+ if (events & selectExceptEvents) != 0 {
+ bitSetCount++
+ } else {
+ e[i] &^= m
+ }
+ }
+ }
+
+ // Copy updated vectors back.
+ if readFDs != 0 {
+ if _, err := t.CopyOut(readFDs, r); err != nil {
+ return 0, err
+ }
+ }
+
+ if writeFDs != 0 {
+ if _, err := t.CopyOut(writeFDs, w); err != nil {
+ return 0, err
+ }
+ }
+
+ if exceptFDs != 0 {
+ if _, err := t.CopyOut(exceptFDs, e); err != nil {
+ return 0, err
+ }
+ }
+
+ return bitSetCount, nil
+}
+
+// timeoutRemaining returns the amount of time remaining for the specified
+// timeout or 0 if it has elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+ now := t.Kernel().MonotonicClock().Now()
+ remaining := timeout - now.Sub(startNs)
+ if remaining < 0 {
+ remaining = 0
+ }
+ return remaining
+}
+
+// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+ if timeout <= 0 {
+ return nil
+ }
+ remaining := timeoutRemaining(t, startNs, timeout)
+ tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
+ _, err := tsRemaining.CopyOut(t, timespecAddr)
+ return err
+}
+
+// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+ if timeout <= 0 {
+ return nil
+ }
+ remaining := timeoutRemaining(t, startNs, timeout)
+ tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
+ _, err := tvRemaining.CopyOut(t, timevalAddr)
+ return err
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// +stateify savable
+type pollRestartBlock struct {
+ pfdAddr usermem.Addr
+ nfds uint
+ timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+ return poll(t, p.pfdAddr, p.nfds, p.timeout)
+}
+
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+ remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+ // On an interrupt poll(2) is restarted with the remaining timeout.
+ if err == syserror.EINTR {
+ t.SetSyscallRestartBlock(&pollRestartBlock{
+ pfdAddr: pfdAddr,
+ nfds: nfds,
+ timeout: remainingTimeout,
+ })
+ return 0, kernel.ERESTART_RESTARTBLOCK
+ }
+ return n, err
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pfdAddr := args[0].Pointer()
+ nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+ timeout := time.Duration(args[2].Int()) * time.Millisecond
+ n, err := poll(t, pfdAddr, nfds, timeout)
+ return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pfdAddr := args[0].Pointer()
+ nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+ timespecAddr := args[2].Pointer()
+ maskAddr := args[3].Pointer()
+ maskSize := uint(args[4].Uint())
+
+ timeout, err := copyTimespecInToDuration(t, timespecAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var startNs ktime.Time
+ if timeout > 0 {
+ startNs = t.Kernel().MonotonicClock().Now()
+ }
+
+ if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+ return 0, nil, err
+ }
+
+ _, n, err := doPoll(t, pfdAddr, nfds, timeout)
+ copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+ // doPoll returns EINTR if interrupted, but ppoll is normally restartable
+ // if interrupted by something other than a signal handled by the
+ // application (i.e. returns ERESTARTNOHAND). However, if
+ // copyOutTimespecRemaining failed, then the restarted ppoll would use the
+ // wrong timeout, so the error should be left as EINTR.
+ //
+ // Note that this means that if err is nil but copyErr is not, copyErr is
+ // ignored. This is consistent with Linux.
+ if err == syserror.EINTR && copyErr == nil {
+ err = kernel.ERESTARTNOHAND
+ }
+ return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nfds := int(args[0].Int()) // select(2) uses an int.
+ readFDs := args[1].Pointer()
+ writeFDs := args[2].Pointer()
+ exceptFDs := args[3].Pointer()
+ timevalAddr := args[4].Pointer()
+
+ // Use a negative Duration to indicate "no timeout".
+ timeout := time.Duration(-1)
+ if timevalAddr != 0 {
+ var timeval linux.Timeval
+ if _, err := timeval.CopyIn(t, timevalAddr); err != nil {
+ return 0, nil, err
+ }
+ if timeval.Sec < 0 || timeval.Usec < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ timeout = time.Duration(timeval.ToNsecCapped())
+ }
+ startNs := t.Kernel().MonotonicClock().Now()
+ n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+ copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+ // See comment in Ppoll.
+ if err == syserror.EINTR && copyErr == nil {
+ err = kernel.ERESTARTNOHAND
+ }
+ return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nfds := int(args[0].Int()) // select(2) uses an int.
+ readFDs := args[1].Pointer()
+ writeFDs := args[2].Pointer()
+ exceptFDs := args[3].Pointer()
+ timespecAddr := args[4].Pointer()
+ maskWithSizeAddr := args[5].Pointer()
+
+ timeout, err := copyTimespecInToDuration(t, timespecAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var startNs ktime.Time
+ if timeout > 0 {
+ startNs = t.Kernel().MonotonicClock().Now()
+ }
+
+ if maskWithSizeAddr != 0 {
+ if t.Arch().Width() != 8 {
+ panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
+ }
+ var maskStruct sigSetWithSize
+ if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+ return 0, nil, err
+ }
+ if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+ copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+ // See comment in Ppoll.
+ if err == syserror.EINTR && copyErr == nil {
+ err = kernel.ERESTARTNOHAND
+ }
+ return n, nil, err
+}
+
+// +marshal
+type sigSetWithSize struct {
+ sigsetAddr uint64
+ sizeofSigset uint64
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// If the Timespec is larger than what can be represented in a Duration, the
+// returned value is the maximum that Duration will allow.
+//
+// If timespecAddr is NULL, the returned value is negative.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+ // Use a negative Duration to indicate "no timeout".
+ timeout := time.Duration(-1)
+ if timespecAddr != 0 {
+ var timespec linux.Timespec
+ if _, err := timespec.CopyIn(t, timespecAddr); err != nil {
+ return 0, err
+ }
+ if !timespec.Valid() {
+ return 0, syserror.EINVAL
+ }
+ timeout = time.Duration(timespec.ToNsecCapped())
+ }
+ return timeout, nil
+}
+
+func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
+ if maskAddr == 0 {
+ return nil
+ }
+ if maskSize != linux.SignalSetSize {
+ return syserror.EINVAL
+ }
+ var mask linux.SignalSet
+ if _, err := mask.CopyIn(t, maskAddr); err != nil {
+ return err
+ }
+ mask &^= kernel.UnblockableSignals
+ oldmask := t.SignalMask()
+ t.SetSignalMask(mask)
+ t.SetSavedSignalMask(oldmask)
+ return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
new file mode 100644
index 000000000..cd25597a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -0,0 +1,641 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ eventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+ eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements Linux syscall read(2).
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := read(t, file, dst, vfs.ReadOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readv implements Linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Get the destination of the read.
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := read(t, file, dst, vfs.ReadOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ n, err := file.Read(t, dst, opts)
+ if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+ if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ file.EventRegister(&w, eventMaskRead)
+
+ total := n
+ for {
+ // Shorten dst to reflect bytes previously read.
+ dst = dst.DropFirst(int(n))
+
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = file.Read(t, dst, opts)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = syserror.ErrWouldBlock
+ }
+ break
+ }
+ }
+ file.EventUnregister(&w)
+
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return total, err
+}
+
+// Pread64 implements Linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+ offset := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Preadv implements Linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements Linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // While the glibc signature is
+ // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+ // the actual syscall
+ // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
+ // splits the offset argument into a high/low value for compatibility with
+ // 32-bit architectures. The flags argument is the 6th argument (index 5).
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+ flags := args[5].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < -1 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ opts := vfs.ReadOptions{
+ Flags: uint32(flags),
+ }
+ var n int64
+ if offset == -1 {
+ n, err = read(t, file, dst, opts)
+ } else {
+ n, err = pread(t, file, dst, offset, opts)
+ }
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ n, err := file.PRead(t, dst, offset, opts)
+ if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+ if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ file.EventRegister(&w, eventMaskRead)
+
+ total := n
+ for {
+ // Shorten dst to reflect bytes previously read.
+ dst = dst.DropFirst(int(n))
+
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = file.PRead(t, dst, offset+total, opts)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = syserror.ErrWouldBlock
+ }
+ break
+ }
+ }
+ file.EventUnregister(&w)
+
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return total, err
+}
+
+// Write implements Linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the source of the write.
+ src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := write(t, file, src, vfs.WriteOptions{})
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+}
+
+// Writev implements Linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Get the source of the write.
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := write(t, file, src, vfs.WriteOptions{})
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+}
+
+func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ n, err := file.Write(t, src, opts)
+ if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+ if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ file.EventRegister(&w, eventMaskWrite)
+
+ total := n
+ for {
+ // Shorten src to reflect bytes previously written.
+ src = src.DropFirst(int(n))
+
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = file.Write(t, src, opts)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = syserror.ErrWouldBlock
+ }
+ break
+ }
+ }
+ file.EventUnregister(&w)
+
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
+ return total, err
+}
+
+// Pwrite64 implements Linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+ offset := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the source of the write.
+ src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+}
+
+// Pwritev implements Linux syscall pwritev(2).
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the source of the write.
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements Linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // While the glibc signature is
+ // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+ // the actual syscall
+ // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
+ // splits the offset argument into a high/low value for compatibility with
+ // 32-bit architectures. The flags argument is the 6th argument (index 5).
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ iovcnt := int(args[2].Int())
+ offset := args[3].Int64()
+ flags := args[5].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the offset is legitimate.
+ if offset < -1 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the source of the write.
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ opts := vfs.WriteOptions{
+ Flags: uint32(flags),
+ }
+ var n int64
+ if offset == -1 {
+ n, err = write(t, file, src, opts)
+ } else {
+ n, err = pwrite(t, file, src, offset, opts)
+ }
+ t.IOUsage().AccountWriteSyscall(n)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, err := file.PWrite(t, src, offset, opts)
+ if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+ if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return n, err
+ }
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ file.EventRegister(&w, eventMaskWrite)
+
+ total := n
+ for {
+ // Shorten src to reflect bytes previously written.
+ src = src.DropFirst(int(n))
+
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = file.PWrite(t, src, offset+total, opts)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+ if err == syserror.ETIMEDOUT {
+ err = syserror.ErrWouldBlock
+ }
+ break
+ }
+ }
+ file.EventUnregister(&w)
+
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
+ return total, err
+}
+
+func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) {
+ if file.StatusFlags()&linux.O_NONBLOCK != 0 {
+ return false, ktime.Time{}, false
+ }
+ // Sockets support read/write timeouts.
+ if s, ok := file.Impl().(socket.SocketVFS2); ok {
+ dl := s.RecvTimeout()
+ if dl < 0 {
+ return false, ktime.Time{}, false
+ }
+ if dl > 0 {
+ return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true
+ }
+ }
+ return true, ktime.Time{}, false
+}
+
+// Lseek implements Linux syscall lseek(2).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ whence := args[2].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ newoff, err := file.Seek(t, offset, whence)
+ return uintptr(newoff), nil, err
+}
+
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is readable.
+ if !file.IsReadable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is valid.
+ if int(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Return EINVAL; if the underlying file type does not support readahead,
+ // then Linux will return EINVAL to indicate as much. In the future, we
+ // may extend this function to actually support readahead hints.
+ return 0, nil, syserror.EINVAL
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
new file mode 100644
index 000000000..09ecfed26
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+// Chmod implements Linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ mode := args[1].ModeT()
+ return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
+}
+
+// Fchmodat implements Linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ mode := args[2].ModeT()
+ return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
+}
+
+func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+
+ return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_MODE,
+ Mode: uint16(mode & chmodMask),
+ },
+ })
+}
+
+// Fchmod implements Linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ mode := args[1].ModeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_MODE,
+ Mode: uint16(mode & chmodMask),
+ },
+ })
+}
+
+// Chown implements Linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ owner := args[1].Int()
+ group := args[2].Int()
+ return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
+}
+
+// Lchown implements Linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ owner := args[1].Int()
+ group := args[2].Int()
+ return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ owner := args[2].Int()
+ group := args[3].Int()
+ flags := args[4].Int()
+ return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
+}
+
+func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+ return syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+
+ var opts vfs.SetStatOptions
+ if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+ return err
+ }
+
+ return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
+ userns := t.UserNamespace()
+ if owner != -1 {
+ kuid := userns.MapToKUID(auth.UID(owner))
+ if !kuid.Ok() {
+ return syserror.EINVAL
+ }
+ opts.Stat.Mask |= linux.STATX_UID
+ opts.Stat.UID = uint32(kuid)
+ }
+ if group != -1 {
+ kgid := userns.MapToKGID(auth.GID(group))
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+ opts.Stat.Mask |= linux.STATX_GID
+ opts.Stat.GID = uint32(kgid)
+ }
+ return nil
+}
+
+// Fchown implements Linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ owner := args[1].Int()
+ group := args[2].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ var opts vfs.SetStatOptions
+ if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, file.SetStat(t, opts)
+}
+
+// Truncate implements Linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Int64()
+
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_SIZE,
+ Size: uint64(length),
+ },
+ })
+ return 0, nil, handleSetSizeError(t, err)
+}
+
+// Ftruncate implements Linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ length := args[1].Int64()
+
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ err := file.SetStat(t, vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_SIZE,
+ Size: uint64(length),
+ },
+ })
+ return 0, nil, handleSetSizeError(t, err)
+}
+
+// Utime implements Linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ timesAddr := args[1].Pointer()
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ opts := vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+ },
+ }
+ if timesAddr == 0 {
+ opts.Stat.Atime.Nsec = linux.UTIME_NOW
+ opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+ } else {
+ var times linux.Utime
+ if _, err := times.CopyIn(t, timesAddr); err != nil {
+ return 0, nil, err
+ }
+ opts.Stat.Atime.Sec = times.Actime
+ opts.Stat.Mtime.Sec = times.Modtime
+ }
+
+ return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimes implements Linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ timesAddr := args[1].Pointer()
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ var opts vfs.SetStatOptions
+ if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Futimesat implements Linux syscall futimesat(2).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ timesAddr := args[2].Pointer()
+
+ // "If filename is NULL and dfd refers to an open file, then operate on the
+ // file. Otherwise look up filename, possibly using dfd as a starting
+ // point." - fs/utimes.c
+ var path fspath.Path
+ shouldAllowEmptyPath := allowEmptyPath
+ if dirfd == linux.AT_FDCWD || pathAddr != 0 {
+ var err error
+ path, err = copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ shouldAllowEmptyPath = disallowEmptyPath
+ }
+
+ var opts vfs.SetStatOptions
+ if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts)
+}
+
+func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+ if timesAddr == 0 {
+ opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+ opts.Stat.Atime.Nsec = linux.UTIME_NOW
+ opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+ return nil
+ }
+ var times [2]linux.Timeval
+ if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ return err
+ }
+ if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
+ return syserror.EINVAL
+ }
+ opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+ opts.Stat.Atime = linux.StatxTimestamp{
+ Sec: times[0].Sec,
+ Nsec: uint32(times[0].Usec * 1000),
+ }
+ opts.Stat.Mtime = linux.StatxTimestamp{
+ Sec: times[1].Sec,
+ Nsec: uint32(times[1].Usec * 1000),
+ }
+ return nil
+}
+
+// Utimensat implements Linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ timesAddr := args[2].Pointer()
+ flags := args[3].Int()
+
+ // Linux requires that the UTIME_OMIT check occur before checking path or
+ // flags.
+ var opts vfs.SetStatOptions
+ if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+ return 0, nil, err
+ }
+ if opts.Stat.Mask == 0 {
+ return 0, nil, nil
+ }
+
+ if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If filename is NULL and dfd refers to an open file, then operate on the
+ // file. Otherwise look up filename, possibly using dfd as a starting
+ // point." - fs/utimes.c
+ var path fspath.Path
+ shouldAllowEmptyPath := allowEmptyPath
+ if dirfd == linux.AT_FDCWD || pathAddr != 0 {
+ var err error
+ path, err = copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ shouldAllowEmptyPath = disallowEmptyPath
+ }
+
+ return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+ if timesAddr == 0 {
+ opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+ opts.Stat.Atime.Nsec = linux.UTIME_NOW
+ opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+ return nil
+ }
+ var times [2]linux.Timespec
+ if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ return err
+ }
+ if times[0].Nsec != linux.UTIME_OMIT {
+ if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) {
+ return syserror.EINVAL
+ }
+ opts.Stat.Mask |= linux.STATX_ATIME
+ opts.Stat.Atime = linux.StatxTimestamp{
+ Sec: times[0].Sec,
+ Nsec: uint32(times[0].Nsec),
+ }
+ }
+ if times[1].Nsec != linux.UTIME_OMIT {
+ if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) {
+ return syserror.EINVAL
+ }
+ opts.Stat.Mask |= linux.STATX_MTIME
+ opts.Stat.Mtime = linux.StatxTimestamp{
+ Sec: times[1].Sec,
+ Nsec: uint32(times[1].Nsec),
+ }
+ }
+ return nil
+}
+
+func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
+ root := t.FSContext().RootDirectoryVFS2()
+ defer root.DecRef()
+ start := root
+ if !path.Absolute {
+ if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+ return syserror.ENOENT
+ }
+ if dirfd == linux.AT_FDCWD {
+ start = t.FSContext().WorkingDirectoryVFS2()
+ defer start.DecRef()
+ } else {
+ dirfile := t.GetFileVFS2(dirfd)
+ if dirfile == nil {
+ return syserror.EBADF
+ }
+ if !path.HasComponents() {
+ // Use FileDescription.SetStat() instead of
+ // VirtualFilesystem.SetStatAt(), since the former may be able
+ // to use opened file state to expedite the SetStat.
+ err := dirfile.SetStat(t, *opts)
+ dirfile.DecRef()
+ return err
+ }
+ start = dirfile.VirtualDentry()
+ start.IncRef()
+ defer start.DecRef()
+ dirfile.DecRef()
+ }
+ }
+ return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: path,
+ FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+ }, opts)
+}
+
+func handleSetSizeError(t *kernel.Task, err error) error {
+ if err == syserror.ErrExceedsFileSizeLimit {
+ // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2).
+ t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
+ return syserror.EFBIG
+ }
+ return err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go
new file mode 100644
index 000000000..623992f6f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/signal.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// sharedSignalfd is shared between the two calls.
+func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
+ // Copy in the signal mask.
+ mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Always check for valid flags, even if not creating.
+ if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Is this a change to an existing signalfd?
+ //
+ // The spec indicates that this should adjust the mask.
+ if fd != -1 {
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Is this a signalfd?
+ if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok {
+ sfd.SetMask(mask)
+ return 0, nil, nil
+ }
+
+ // Not a signalfd.
+ return 0, nil, syserror.EINVAL
+ }
+
+ fileFlags := uint32(linux.O_RDWR)
+ if flags&linux.SFD_NONBLOCK != 0 {
+ fileFlags |= linux.O_NONBLOCK
+ }
+
+ // Create a new file.
+ vfsObj := t.Kernel().VFS()
+ file, err := signalfd.New(vfsObj, t, mask, fileFlags)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+
+ // Create a new descriptor.
+ fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Done.
+ return uintptr(fd), nil, nil
+}
+
+// Signalfd implements the linux syscall signalfd(2).
+func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ sigset := args[1].Pointer()
+ sigsetsize := args[2].SizeT()
+ return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
+}
+
+// Signalfd4 implements the linux syscall signalfd4(2).
+func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ sigset := args[1].Pointer()
+ sigsetsize := args[2].SizeT()
+ flags := args[3].Int()
+ return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
new file mode 100644
index 000000000..10b668477
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -0,0 +1,1139 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/control"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers upto INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset form the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset form the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+ // Name is the optional pointer to a network address buffer.
+ Name uint64
+
+ // NameLen is the length of the buffer pointed to by Name.
+ NameLen uint32
+ _ uint32
+
+ // Iov is a pointer to an array of io vectors that describe the memory
+ // locations involved in the io operation.
+ Iov uint64
+
+ // IovLen is the length of the array pointed to by Iov.
+ IovLen uint64
+
+ // Control is the optional pointer to ancillary control data.
+ Control uint64
+
+ // ControlLen is the length of the data pointed to by Control.
+ ControlLen uint64
+
+ // Flags on the sent/received message.
+ Flags int32
+ _ int32
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+ msgHdr MessageHeader64
+ msgLen uint32
+ _ int32
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+ b := t.CopyScratchBuffer(52)
+ if _, err := t.CopyInBytes(addr, b); err != nil {
+ return err
+ }
+
+ msg.Name = usermem.ByteOrder.Uint64(b[0:])
+ msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
+ msg.Iov = usermem.ByteOrder.Uint64(b[16:])
+ msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
+ msg.Control = usermem.ByteOrder.Uint64(b[32:])
+ msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
+ msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
+
+ return nil
+}
+
+// CaptureAddress allocates memory for and copies a socket address structure
+// from the untrusted address space range.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+ if addrlen > maxAddrLen {
+ return nil, syserror.EINVAL
+ }
+
+ addrBuf := make([]byte, addrlen)
+ if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
+ return nil, err
+ }
+
+ return addrBuf, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the unstrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+ // Get the buffer length.
+ var bufLen uint32
+ if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ return err
+ }
+
+ if int32(bufLen) < 0 {
+ return syserror.EINVAL
+ }
+
+ // Write the length unconditionally.
+ if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ return err
+ }
+
+ if addr == nil {
+ return nil
+ }
+
+ if bufLen > addrLen {
+ bufLen = addrLen
+ }
+
+ // Copy as much of the address as will fit in the buffer.
+ encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ if bufLen > uint32(len(encodedAddr)) {
+ bufLen = uint32(len(encodedAddr))
+ }
+ _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
+ return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ domain := int(args[0].Int())
+ stype := args[1].Int()
+ protocol := int(args[2].Int())
+
+ // Check and initialize the flags.
+ if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Create the new socket.
+ s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+ defer s.DecRef()
+
+ if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
+ return 0, nil, err
+ }
+
+ fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ domain := int(args[0].Int())
+ stype := args[1].Int()
+ protocol := int(args[2].Int())
+ addr := args[3].Pointer()
+
+ // Check and initialize the flags.
+ if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Create the socket pair.
+ s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+ // Adding to the FD table will cause an extra reference to be acquired.
+ defer s1.DecRef()
+ defer s2.DecRef()
+
+ nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
+ if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+ return 0, nil, err
+ }
+ if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+ return 0, nil, err
+ }
+
+ // Create the FDs for the sockets.
+ flags := kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ }
+ fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if _, err := t.CopyOut(addr, fds); err != nil {
+ for _, fd := range fds {
+ if _, file := t.FDTable().Remove(fd); file != nil {
+ file.DecRef()
+ }
+ }
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Uint()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Capture address and call syscall implementation.
+ a, err := CaptureAddress(t, addr, addrlen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+ return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+ // Check that no unsupported flags are passed in.
+ if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ // Call the syscall implementation for this socket, then copy the
+ // output address if one is specified.
+ blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+
+ peerRequested := addrLen != 0
+ nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+ if peerRequested {
+ // NOTE(magi): Linux does not give you an error if it can't
+ // write the data back out so neither do we.
+ if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+ return 0, err
+ }
+ }
+ return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+ flags := int(args[3].Int())
+
+ n, err := accept(t, fd, addr, addrlen, flags)
+ return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ n, err := accept(t, fd, addr, addrlen, 0)
+ return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Uint()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Capture address and call syscall implementation.
+ a, err := CaptureAddress(t, addr, addrlen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ backlog := args[1].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Per Linux, the backlog is silently capped to reasonable values.
+ if backlog <= 0 {
+ backlog = minListenBacklog
+ }
+ if backlog > maxListenBacklog {
+ backlog = maxListenBacklog
+ }
+
+ return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ how := args[1].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Validate how, then call syscall implementation.
+ switch how {
+ case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ level := args[1].Int()
+ name := args[2].Int()
+ optValAddr := args[3].Pointer()
+ optLenAddr := args[4].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Read the length. Reject negative values.
+ optLen := int32(0)
+ if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ return 0, nil, err
+ }
+ if optLen < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Call syscall implementation then copy both value and value len out.
+ v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+
+ vLen := int32(binary.Size(v))
+ if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ return 0, nil, err
+ }
+
+ if v != nil {
+ if _, err := t.CopyOut(optValAddr, v); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ return 0, nil, nil
+}
+
+// getSockOpt tries to handle common socket options, or dispatches to a specific
+// socket implementation.
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+ if level == linux.SOL_SOCKET {
+ switch name {
+ case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+ if len < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+ }
+
+ switch name {
+ case linux.SO_TYPE:
+ _, skType, _ := s.Type()
+ return int32(skType), nil
+ case linux.SO_DOMAIN:
+ family, _, _ := s.Type()
+ return int32(family), nil
+ case linux.SO_PROTOCOL:
+ _, _, protocol := s.Type()
+ return int32(protocol), nil
+ }
+ }
+
+ return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ level := args[1].Int()
+ name := args[2].Int()
+ optValAddr := args[3].Pointer()
+ optLen := args[4].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ if optLen < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if optLen > maxOptLen {
+ return 0, nil, syserror.EINVAL
+ }
+ buf := t.CopyScratchBuffer(int(optLen))
+ if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ return 0, nil, err
+ }
+
+ // Call syscall implementation.
+ if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Get the socket name and copy it to the caller.
+ v, vl, err := s.GetSockName(t)
+ if err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Get the socket peer name and copy it to the caller.
+ v, vl, err := s.GetPeerName(t)
+ if err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline)
+ return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ vlen := args[2].Uint()
+ flags := args[3].Int()
+ toPtr := args[4].Pointer()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if toPtr != 0 {
+ var ts linux.Timespec
+ if _, err := ts.CopyIn(t, toPtr); err != nil {
+ return 0, nil, err
+ }
+ if !ts.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+ deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+ haveDeadline = true
+ }
+
+ if !haveDeadline {
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+ }
+
+ var count uint32
+ var err error
+ for i := uint64(0); i < uint64(vlen); i++ {
+ mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ var n uintptr
+ if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
+ break
+ }
+
+ // Copy the received length to the caller.
+ lp, ok := mp.AddLength(messageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ break
+ }
+ count++
+ }
+
+ if count == 0 {
+ return 0, nil, err
+ }
+ return uintptr(count), nil, nil
+}
+
+func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+ // Capture the message header and io vectors.
+ var msg MessageHeader64
+ if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ return 0, err
+ }
+
+ if msg.IovLen > linux.UIO_MAXIOV {
+ return 0, syserror.EMSGSIZE
+ }
+ dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ // FIXME(b/63594852): Pretend we have an empty error queue.
+ if flags&linux.MSG_ERRQUEUE != 0 {
+ return 0, syserror.EAGAIN
+ }
+
+ // Fast path when no control message nor name buffers are provided.
+ if msg.ControlLen == 0 && msg.NameLen == 0 {
+ n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+ if err != nil {
+ return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+ }
+ if !cms.Unix.Empty() {
+ mflags |= linux.MSG_CTRUNC
+ cms.Release()
+ }
+
+ if int(msg.Flags) != mflags {
+ // Copy out the flags to the caller.
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ return 0, err
+ }
+ }
+
+ return uintptr(n), nil
+ }
+
+ if msg.ControlLen > maxControlLen {
+ return 0, syserror.ENOBUFS
+ }
+ n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+ defer cms.Release()
+
+ controlData := make([]byte, 0, msg.ControlLen)
+ controlData = control.PackControlMessages(t, cms, controlData)
+
+ if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+ creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+ controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+ }
+
+ if cms.Unix.Rights != nil {
+ controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+ }
+
+ // Copy the address to the caller.
+ if msg.NameLen != 0 {
+ if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+ return 0, err
+ }
+ }
+
+ // Copy the control data to the caller.
+ if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ return 0, err
+ }
+ if len(controlData) > 0 {
+ if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ return 0, err
+ }
+ }
+
+ // Copy out the flags to the caller.
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ return 0, err
+ }
+
+ return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+ if int(bufLen) < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+ cm.Release()
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+
+ // Copy the address to the caller.
+ if nameLenPtr != 0 {
+ if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+ return 0, err
+ }
+ }
+
+ return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufPtr := args[1].Pointer()
+ bufLen := args[2].Uint64()
+ flags := args[3].Int()
+ namePtr := args[4].Pointer()
+ nameLenPtr := args[5].Pointer()
+
+ n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+ return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+ return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ vlen := args[2].Uint()
+ flags := args[3].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var count uint32
+ var err error
+ for i := uint64(0); i < uint64(vlen); i++ {
+ mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ var n uintptr
+ if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+ break
+ }
+
+ // Copy the received length to the caller.
+ lp, ok := mp.AddLength(messageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ break
+ }
+ count++
+ }
+
+ if count == 0 {
+ return 0, nil, err
+ }
+ return uintptr(count), nil, nil
+}
+
+func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+ // Capture the message header.
+ var msg MessageHeader64
+ if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ return 0, err
+ }
+
+ var controlData []byte
+ if msg.ControlLen > 0 {
+ // Put an upper bound to prevent large allocations.
+ if msg.ControlLen > maxControlLen {
+ return 0, syserror.ENOBUFS
+ }
+ controlData = make([]byte, msg.ControlLen)
+ if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ return 0, err
+ }
+ }
+
+ // Read the destination address if one is specified.
+ var to []byte
+ if msg.NameLen != 0 {
+ var err error
+ to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+ if err != nil {
+ return 0, err
+ }
+ }
+
+ // Read data then call the sendmsg implementation.
+ if msg.IovLen > linux.UIO_MAXIOV {
+ return 0, syserror.EMSGSIZE
+ }
+ src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ controlMessages, err := control.Parse(t, s, controlData)
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.SendTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Call the syscall implementation.
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+ err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+ if err != nil {
+ controlMessages.Release()
+ }
+ return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+ bl := int(bufLen)
+ if bl < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Read the destination address if one is specified.
+ var to []byte
+ var err error
+ if namePtr != 0 {
+ to, err = CaptureAddress(t, namePtr, nameLen)
+ if err != nil {
+ return 0, err
+ }
+ }
+
+ src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.SendTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Call the syscall implementation.
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
+ return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufPtr := args[1].Pointer()
+ bufLen := args[2].Uint64()
+ flags := args[3].Int()
+ namePtr := args[4].Pointer()
+ nameLen := args[5].Uint()
+
+ n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+ return n, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..945a364a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,291 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ inOffsetPtr := args[1].Pointer()
+ outFD := args[2].Int()
+ outOffsetPtr := args[3].Pointer()
+ count := int64(args[4].SizeT())
+ flags := args[5].Int()
+
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get file descriptions.
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ // Check that both files support the required directionality.
+ if !inFile.IsReadable() || !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // At least one file description must represent a pipe.
+ inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ if !inIsPipe && !outIsPipe {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy in offsets.
+ inOffset := int64(-1)
+ if inOffsetPtr != 0 {
+ if inIsPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+ if inFile.Options().DenyPRead {
+ return 0, nil, syserror.EINVAL
+ }
+ if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+ return 0, nil, err
+ }
+ if inOffset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+ outOffset := int64(-1)
+ if outOffsetPtr != 0 {
+ if outIsPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+ if outFile.Options().DenyPWrite {
+ return 0, nil, syserror.EINVAL
+ }
+ if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+ return 0, nil, err
+ }
+ if outOffset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+
+ // Move data.
+ var (
+ n int64
+ err error
+ inCh chan struct{}
+ outCh chan struct{}
+ )
+ for {
+ // If both input and output are pipes, delegate to the pipe
+ // implementation. Otherwise, exactly one end is a pipe, which we
+ // ensure is consistently ordered after the non-pipe FD's locks by
+ // passing the pipe FD as usermem.IO to the non-pipe end.
+ switch {
+ case inIsPipe && outIsPipe:
+ n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+ case inIsPipe:
+ if outOffset != -1 {
+ n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+ outOffset += n
+ } else {
+ n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+ }
+ case outIsPipe:
+ if inOffset != -1 {
+ n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+ inOffset += n
+ } else {
+ n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+ }
+ }
+ if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+ break
+ }
+
+ // Note that the blocking behavior here is a bit different than the
+ // normal pattern. Because we need to have both data to read and data
+ // to write simultaneously, we actually explicitly block on both of
+ // these cases in turn before returning to the splice operation.
+ if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+ if inCh == nil {
+ inCh = make(chan struct{}, 1)
+ inW, _ := waiter.NewChannelEntry(inCh)
+ inFile.EventRegister(&inW, eventMaskRead)
+ defer inFile.EventUnregister(&inW)
+ continue // Need to refresh readiness.
+ }
+ if err = t.Block(inCh); err != nil {
+ break
+ }
+ }
+ if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+ if outCh == nil {
+ outCh = make(chan struct{}, 1)
+ outW, _ := waiter.NewChannelEntry(outCh)
+ outFile.EventRegister(&outW, eventMaskWrite)
+ defer outFile.EventUnregister(&outW)
+ continue // Need to refresh readiness.
+ }
+ if err = t.Block(outCh); err != nil {
+ break
+ }
+ }
+ }
+
+ // Copy updated offsets out.
+ if inOffsetPtr != 0 {
+ if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+ return 0, nil, err
+ }
+ }
+ if outOffsetPtr != 0 {
+ if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if n == 0 {
+ return 0, nil, err
+ }
+
+ // On Linux, inotify behavior is not very consistent with splice(2). We try
+ // our best to emulate Linux for very basic calls to splice, where for some
+ // reason, events are generated for output files, but not input files.
+ outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ return uintptr(n), nil, nil
+}
+
+// Tee implements Linux syscall tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ outFD := args[1].Int()
+ count := int64(args[2].SizeT())
+ flags := args[3].Int()
+
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get file descriptions.
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ // Check that both files support the required directionality.
+ if !inFile.IsReadable() || !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // Both file descriptions must represent pipes.
+ inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ if !inIsPipe || !outIsPipe {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy data.
+ var (
+ inCh chan struct{}
+ outCh chan struct{}
+ )
+ for {
+ n, err := pipe.Tee(t, outPipeFD, inPipeFD, count)
+ if n != 0 {
+ return uintptr(n), nil, nil
+ }
+ if err != syserror.ErrWouldBlock || nonBlock {
+ return 0, nil, err
+ }
+
+ // Note that the blocking behavior here is a bit different than the
+ // normal pattern. Because we need to have both data to read and data
+ // to write simultaneously, we actually explicitly block on both of
+ // these cases in turn before returning to the tee operation.
+ if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+ if inCh == nil {
+ inCh = make(chan struct{}, 1)
+ inW, _ := waiter.NewChannelEntry(inCh)
+ inFile.EventRegister(&inW, eventMaskRead)
+ defer inFile.EventUnregister(&inW)
+ continue // Need to refresh readiness.
+ }
+ if err := t.Block(inCh); err != nil {
+ return 0, nil, err
+ }
+ }
+ if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+ if outCh == nil {
+ outCh = make(chan struct{}, 1)
+ outW, _ := waiter.NewChannelEntry(outCh)
+ outFile.EventRegister(&outW, eventMaskWrite)
+ defer outFile.EventUnregister(&outW)
+ continue // Need to refresh readiness.
+ }
+ if err := t.Block(outCh); err != nil {
+ return 0, nil, err
+ }
+ }
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
new file mode 100644
index 000000000..bb1d5cac4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -0,0 +1,388 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/gohacks"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Stat implements Linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ statAddr := args[1].Pointer()
+ return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */)
+}
+
+// Lstat implements Linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ statAddr := args[1].Pointer()
+ return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ statAddr := args[2].Pointer()
+ flags := args[3].Int()
+ return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags)
+}
+
+func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+ return syserror.EINVAL
+ }
+
+ opts := vfs.StatOptions{
+ Mask: linux.STATX_BASIC_STATS,
+ }
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+
+ root := t.FSContext().RootDirectoryVFS2()
+ defer root.DecRef()
+ start := root
+ if !path.Absolute {
+ if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+ return syserror.ENOENT
+ }
+ if dirfd == linux.AT_FDCWD {
+ start = t.FSContext().WorkingDirectoryVFS2()
+ defer start.DecRef()
+ } else {
+ dirfile := t.GetFileVFS2(dirfd)
+ if dirfile == nil {
+ return syserror.EBADF
+ }
+ if !path.HasComponents() {
+ // Use FileDescription.Stat() instead of
+ // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
+ // former may be able to use opened file state to expedite the
+ // Stat.
+ statx, err := dirfile.Stat(t, opts)
+ dirfile.DecRef()
+ if err != nil {
+ return err
+ }
+ var stat linux.Stat
+ convertStatxToUserStat(t, &statx, &stat)
+ _, err = stat.CopyOut(t, statAddr)
+ return err
+ }
+ start = dirfile.VirtualDentry()
+ start.IncRef()
+ defer start.DecRef()
+ dirfile.DecRef()
+ }
+ }
+
+ statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: path,
+ FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+ }, &opts)
+ if err != nil {
+ return err
+ }
+ var stat linux.Stat
+ convertStatxToUserStat(t, &statx, &stat)
+ _, err = stat.CopyOut(t, statAddr)
+ return err
+}
+
+func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
+ return linux.Timespec{
+ Sec: sxts.Sec,
+ Nsec: int64(sxts.Nsec),
+ }
+}
+
+// Fstat implements Linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ statAddr := args[1].Pointer()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ statx, err := file.Stat(t, vfs.StatOptions{
+ Mask: linux.STATX_BASIC_STATS,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ var stat linux.Stat
+ convertStatxToUserStat(t, &statx, &stat)
+ _, err = stat.CopyOut(t, statAddr)
+ return 0, nil, err
+}
+
+// Statx implements Linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ flags := args[2].Int()
+ mask := args[3].Uint()
+ statxAddr := args[4].Pointer()
+
+ if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ // Make sure that only one sync type option is set.
+ syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE)
+ if syncType != 0 && !bits.IsPowerOfTwo32(syncType) {
+ return 0, nil, syserror.EINVAL
+ }
+ if mask&linux.STATX__RESERVED != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ opts := vfs.StatOptions{
+ Mask: mask,
+ Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
+ }
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ root := t.FSContext().RootDirectoryVFS2()
+ defer root.DecRef()
+ start := root
+ if !path.Absolute {
+ if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+ return 0, nil, syserror.ENOENT
+ }
+ if dirfd == linux.AT_FDCWD {
+ start = t.FSContext().WorkingDirectoryVFS2()
+ defer start.DecRef()
+ } else {
+ dirfile := t.GetFileVFS2(dirfd)
+ if dirfile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ if !path.HasComponents() {
+ // Use FileDescription.Stat() instead of
+ // VirtualFilesystem.StatAt() for statx(fd, ""), since the
+ // former may be able to use opened file state to expedite the
+ // Stat.
+ statx, err := dirfile.Stat(t, opts)
+ dirfile.DecRef()
+ if err != nil {
+ return 0, nil, err
+ }
+ userifyStatx(t, &statx)
+ _, err = statx.CopyOut(t, statxAddr)
+ return 0, nil, err
+ }
+ start = dirfile.VirtualDentry()
+ start.IncRef()
+ defer start.DecRef()
+ dirfile.DecRef()
+ }
+ }
+
+ statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: path,
+ FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+ }, &opts)
+ if err != nil {
+ return 0, nil, err
+ }
+ userifyStatx(t, &statx)
+ _, err = statx.CopyOut(t, statxAddr)
+ return 0, nil, err
+}
+
+func userifyStatx(t *kernel.Task, statx *linux.Statx) {
+ userns := t.UserNamespace()
+ statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow())
+ statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow())
+}
+
+// Readlink implements Linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ bufAddr := args[1].Pointer()
+ size := args[2].SizeT()
+ return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
+}
+
+// Access implements Linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ mode := args[1].ModeT()
+
+ return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Faccessat implements Linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat(). If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ addr := args[1].Pointer()
+ mode := args[2].ModeT()
+
+ return 0, nil, accessAt(t, dirfd, addr, mode)
+}
+
+func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+ const rOK = 4
+ const wOK = 2
+ const xOK = 1
+
+ // Sanity check the mode.
+ if mode&^(rOK|wOK|xOK) != 0 {
+ return syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+
+ // access(2) and faccessat(2) check permissions using real
+ // UID/GID, not effective UID/GID.
+ //
+ // "access() needs to use the real uid/gid, not the effective
+ // uid/gid. We do this by temporarily clearing all FS-related
+ // capabilities and switching the fsuid/fsgid around to the
+ // real ones." -fs/open.c:faccessat
+ creds := t.Credentials().Fork()
+ creds.EffectiveKUID = creds.RealKUID
+ creds.EffectiveKGID = creds.RealKGID
+ if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+ creds.EffectiveCaps = creds.PermittedCaps
+ } else {
+ creds.EffectiveCaps = 0
+ }
+
+ return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop)
+}
+
+// Readlinkat implements Linux syscall mknodat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ dirfd := args[0].Int()
+ pathAddr := args[1].Pointer()
+ bufAddr := args[2].Pointer()
+ size := args[3].SizeT()
+ return readlinkat(t, dirfd, pathAddr, bufAddr, size)
+}
+
+func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
+ if int(size) <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ // "Since Linux 2.6.39, pathname can be an empty string, in which case the
+ // call operates on the symbolic link referred to by dirfd ..." -
+ // readlinkat(2)
+ tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if len(target) > int(size) {
+ target = target[:size]
+ }
+ n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
+ if n == 0 {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Statfs implements Linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ bufAddr := args[1].Pointer()
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+ if err != nil {
+ return 0, nil, err
+ }
+ _, err = statfs.CopyOut(t, bufAddr)
+ return 0, nil, err
+}
+
+// Fstatfs implements Linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufAddr := args[1].Pointer()
+
+ tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+ if err != nil {
+ return 0, nil, err
+ }
+ _, err = statfs.CopyOut(t, bufAddr)
+ return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
new file mode 100644
index 000000000..2da538fc6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+ // Linux just copies fields from struct kstat without regard to struct
+ // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+ userns := t.UserNamespace()
+ *stat = linux.Stat{
+ Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+ Ino: statx.Ino,
+ Nlink: uint64(statx.Nlink),
+ Mode: uint32(statx.Mode),
+ UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+ GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+ Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+ Size: int64(statx.Size),
+ Blksize: int64(statx.Blksize),
+ Blocks: int64(statx.Blocks),
+ ATime: timespecFromStatxTimestamp(statx.Atime),
+ MTime: timespecFromStatxTimestamp(statx.Mtime),
+ CTime: timespecFromStatxTimestamp(statx.Ctime),
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
new file mode 100644
index 000000000..88b9c7627
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+ // Linux just copies fields from struct kstat without regard to struct
+ // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+ userns := t.UserNamespace()
+ *stat = linux.Stat{
+ Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+ Ino: statx.Ino,
+ Nlink: uint32(statx.Nlink),
+ Mode: uint32(statx.Mode),
+ UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+ GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+ Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+ Size: int64(statx.Size),
+ Blksize: int32(statx.Blksize),
+ Blocks: int64(statx.Blocks),
+ ATime: timespecFromStatxTimestamp(statx.Atime),
+ MTime: timespecFromStatxTimestamp(statx.Mtime),
+ CTime: timespecFromStatxTimestamp(statx.Ctime),
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
new file mode 100644
index 000000000..0d0ebf46a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -0,0 +1,115 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements Linux syscall sync(2).
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t)
+}
+
+// Syncfs implements Linux syscall syncfs(2).
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, file.SyncFS(t)
+}
+
+// Fsync implements Linux syscall fsync(2).
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ return 0, nil, file.Sync(t)
+}
+
+// Fdatasync implements Linux syscall fdatasync(2).
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
+ return Fsync(t, args)
+}
+
+// SyncFileRange implements Linux syscall sync_file_range(2).
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ nbytes := args[2].Int64()
+ flags := args[3].Uint()
+
+ // Check for negative values and overflow.
+ if offset < 0 || offset+nbytes < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support
+ // is a full-file sync, i.e. fsync(2). As a result, there are severe
+ // limitations on how much we support sync_file_range:
+ // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even
+ // if the file size is changed. We do.
+ // - We always sync the entire file instead of [offset, offset+nbytes).
+ // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For
+ // correctness, we would have to perform a write-out every time WAIT_BEFORE
+ // was used, but this would be much more expensive than expected if there
+ // were no write-out operations in progress.
+ // - Whenever WAIT_AFTER is used, we sync the file.
+ // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will
+ // be synced anyway. If this flag is used without WAIT_AFTER, then it is
+ // safe (and less expensive) to do nothing, because the syscall will not
+ // wait for the write-out to complete--we only need to make sure that the
+ // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes.
+ // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O
+ // errors or ENOSPC conditions and will return those to the caller, after
+ // clearing the EIO and ENOSPC flags in the address_space." We don't do
+ // this.
+
+ if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 &&
+ flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
+ if err := file.Sync(t); err != nil {
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ }
+ }
+ return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
new file mode 100644
index 000000000..5ac79bc09
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := args[0].Int()
+ flags := args[1].Int()
+
+ if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Timerfds aren't writable per se (their implementation of Write just
+ // returns EINVAL), but they are "opened for writing", which is necessary
+ // to actually reach said implementation of Write.
+ fileFlags := uint32(linux.O_RDWR)
+ if flags&linux.TFD_NONBLOCK != 0 {
+ fileFlags |= linux.O_NONBLOCK
+ }
+
+ var clock ktime.Clock
+ switch clockID {
+ case linux.CLOCK_REALTIME:
+ clock = t.Kernel().RealtimeClock()
+ case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+ clock = t.Kernel().MonotonicClock()
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+ vfsObj := t.Kernel().VFS()
+ file, err := timerfd.New(vfsObj, clock, fileFlags)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ flags := args[1].Int()
+ newValAddr := args[2].Pointer()
+ oldValAddr := args[3].Pointer()
+
+ if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var newVal linux.Itimerspec
+ if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ return 0, nil, err
+ }
+ newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
+ if err != nil {
+ return 0, nil, err
+ }
+ tm, oldS := tfd.SetTime(newS)
+ if oldValAddr != 0 {
+ oldVal := ktime.ItimerspecFromSetting(tm, oldS)
+ if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ curValAddr := args[1].Pointer()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ tm, s := tfd.GetTime()
+ curVal := ktime.ItimerspecFromSetting(tm, s)
+ _, err := t.CopyOut(curValAddr, &curVal)
+ return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
new file mode 100644
index 000000000..8f497ecc7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -0,0 +1,168 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs2 provides syscall implementations that use VFS2.
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+// Override syscall table to add syscalls implementations from this package.
+func Override() {
+ // Override AMD64.
+ s := linux.AMD64
+ s.Table[0] = syscalls.Supported("read", Read)
+ s.Table[1] = syscalls.Supported("write", Write)
+ s.Table[2] = syscalls.Supported("open", Open)
+ s.Table[3] = syscalls.Supported("close", Close)
+ s.Table[4] = syscalls.Supported("stat", Stat)
+ s.Table[5] = syscalls.Supported("fstat", Fstat)
+ s.Table[6] = syscalls.Supported("lstat", Lstat)
+ s.Table[7] = syscalls.Supported("poll", Poll)
+ s.Table[8] = syscalls.Supported("lseek", Lseek)
+ s.Table[9] = syscalls.Supported("mmap", Mmap)
+ s.Table[16] = syscalls.Supported("ioctl", Ioctl)
+ s.Table[17] = syscalls.Supported("pread64", Pread64)
+ s.Table[18] = syscalls.Supported("pwrite64", Pwrite64)
+ s.Table[19] = syscalls.Supported("readv", Readv)
+ s.Table[20] = syscalls.Supported("writev", Writev)
+ s.Table[21] = syscalls.Supported("access", Access)
+ s.Table[22] = syscalls.Supported("pipe", Pipe)
+ s.Table[23] = syscalls.Supported("select", Select)
+ s.Table[32] = syscalls.Supported("dup", Dup)
+ s.Table[33] = syscalls.Supported("dup2", Dup2)
+ delete(s.Table, 40) // sendfile
+ s.Table[41] = syscalls.Supported("socket", Socket)
+ s.Table[42] = syscalls.Supported("connect", Connect)
+ s.Table[43] = syscalls.Supported("accept", Accept)
+ s.Table[44] = syscalls.Supported("sendto", SendTo)
+ s.Table[45] = syscalls.Supported("recvfrom", RecvFrom)
+ s.Table[46] = syscalls.Supported("sendmsg", SendMsg)
+ s.Table[47] = syscalls.Supported("recvmsg", RecvMsg)
+ s.Table[48] = syscalls.Supported("shutdown", Shutdown)
+ s.Table[49] = syscalls.Supported("bind", Bind)
+ s.Table[50] = syscalls.Supported("listen", Listen)
+ s.Table[51] = syscalls.Supported("getsockname", GetSockName)
+ s.Table[52] = syscalls.Supported("getpeername", GetPeerName)
+ s.Table[53] = syscalls.Supported("socketpair", SocketPair)
+ s.Table[54] = syscalls.Supported("setsockopt", SetSockOpt)
+ s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
+ s.Table[59] = syscalls.Supported("execve", Execve)
+ s.Table[72] = syscalls.Supported("fcntl", Fcntl)
+ s.Table[73] = syscalls.Supported("fcntl", Flock)
+ s.Table[74] = syscalls.Supported("fsync", Fsync)
+ s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
+ s.Table[76] = syscalls.Supported("truncate", Truncate)
+ s.Table[77] = syscalls.Supported("ftruncate", Ftruncate)
+ s.Table[78] = syscalls.Supported("getdents", Getdents)
+ s.Table[79] = syscalls.Supported("getcwd", Getcwd)
+ s.Table[80] = syscalls.Supported("chdir", Chdir)
+ s.Table[81] = syscalls.Supported("fchdir", Fchdir)
+ s.Table[82] = syscalls.Supported("rename", Rename)
+ s.Table[83] = syscalls.Supported("mkdir", Mkdir)
+ s.Table[84] = syscalls.Supported("rmdir", Rmdir)
+ s.Table[85] = syscalls.Supported("creat", Creat)
+ s.Table[86] = syscalls.Supported("link", Link)
+ s.Table[87] = syscalls.Supported("unlink", Unlink)
+ s.Table[88] = syscalls.Supported("symlink", Symlink)
+ s.Table[89] = syscalls.Supported("readlink", Readlink)
+ s.Table[90] = syscalls.Supported("chmod", Chmod)
+ s.Table[91] = syscalls.Supported("fchmod", Fchmod)
+ s.Table[92] = syscalls.Supported("chown", Chown)
+ s.Table[93] = syscalls.Supported("fchown", Fchown)
+ s.Table[94] = syscalls.Supported("lchown", Lchown)
+ s.Table[132] = syscalls.Supported("utime", Utime)
+ s.Table[133] = syscalls.Supported("mknod", Mknod)
+ s.Table[137] = syscalls.Supported("statfs", Statfs)
+ s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
+ s.Table[161] = syscalls.Supported("chroot", Chroot)
+ s.Table[162] = syscalls.Supported("sync", Sync)
+ s.Table[165] = syscalls.Supported("mount", Mount)
+ s.Table[166] = syscalls.Supported("umount2", Umount2)
+ s.Table[187] = syscalls.Supported("readahead", Readahead)
+ s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+ s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
+ s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
+ s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+ s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
+ s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
+ s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+ s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
+ s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
+ s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+ s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
+ s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
+ s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
+ s.Table[213] = syscalls.Supported("epoll_create", EpollCreate)
+ s.Table[217] = syscalls.Supported("getdents64", Getdents64)
+ s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil)
+ s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
+ s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
+ s.Table[235] = syscalls.Supported("utimes", Utimes)
+ s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+ s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[257] = syscalls.Supported("openat", Openat)
+ s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
+ s.Table[259] = syscalls.Supported("mknodat", Mknodat)
+ s.Table[260] = syscalls.Supported("fchownat", Fchownat)
+ s.Table[261] = syscalls.Supported("futimesat", Futimesat)
+ s.Table[262] = syscalls.Supported("newfstatat", Newfstatat)
+ s.Table[263] = syscalls.Supported("unlinkat", Unlinkat)
+ s.Table[264] = syscalls.Supported("renameat", Renameat)
+ s.Table[265] = syscalls.Supported("linkat", Linkat)
+ s.Table[266] = syscalls.Supported("symlinkat", Symlinkat)
+ s.Table[267] = syscalls.Supported("readlinkat", Readlinkat)
+ s.Table[268] = syscalls.Supported("fchmodat", Fchmodat)
+ s.Table[269] = syscalls.Supported("faccessat", Faccessat)
+ s.Table[270] = syscalls.Supported("pselect", Pselect)
+ s.Table[271] = syscalls.Supported("ppoll", Ppoll)
+ s.Table[275] = syscalls.Supported("splice", Splice)
+ s.Table[276] = syscalls.Supported("tee", Tee)
+ s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
+ s.Table[280] = syscalls.Supported("utimensat", Utimensat)
+ s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
+ s.Table[282] = syscalls.Supported("signalfd", Signalfd)
+ s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
+ s.Table[284] = syscalls.Supported("eventfd", Eventfd)
+ s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
+ s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+ s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+ s.Table[288] = syscalls.Supported("accept4", Accept4)
+ s.Table[289] = syscalls.Supported("signalfd4", Signalfd4)
+ s.Table[290] = syscalls.Supported("eventfd2", Eventfd2)
+ s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
+ s.Table[292] = syscalls.Supported("dup3", Dup3)
+ s.Table[293] = syscalls.Supported("pipe2", Pipe2)
+ s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
+ s.Table[295] = syscalls.Supported("preadv", Preadv)
+ s.Table[296] = syscalls.Supported("pwritev", Pwritev)
+ s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
+ s.Table[306] = syscalls.Supported("syncfs", Syncfs)
+ s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg)
+ s.Table[316] = syscalls.Supported("renameat2", Renameat2)
+ s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate)
+ s.Table[322] = syscalls.Supported("execveat", Execveat)
+ s.Table[327] = syscalls.Supported("preadv2", Preadv2)
+ s.Table[328] = syscalls.Supported("pwritev2", Pwritev2)
+ s.Table[332] = syscalls.Supported("statx", Statx)
+ s.Init()
+
+ // Override ARM64.
+ s = linux.ARM64
+ s.Table[63] = syscalls.Supported("read", Read)
+ s.Init()
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
new file mode 100644
index 000000000..af455d5c1
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -0,0 +1,356 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "bytes"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/gohacks"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Listxattr implements Linux syscall listxattr(2).
+func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return listxattr(t, args, followFinalSymlink)
+}
+
+// Llistxattr implements Linux syscall llistxattr(2).
+func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return listxattr(t, args, nofollowFinalSymlink)
+}
+
+func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ listAddr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+ if err != nil {
+ return 0, nil, err
+ }
+ n, err := copyOutXattrNameList(t, listAddr, size, names)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Flistxattr implements Linux syscall flistxattr(2).
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ listAddr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ names, err := file.Listxattr(t, uint64(size))
+ if err != nil {
+ return 0, nil, err
+ }
+ n, err := copyOutXattrNameList(t, listAddr, size, names)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Getxattr implements Linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getxattr(t, args, followFinalSymlink)
+}
+
+// Lgetxattr implements Linux syscall lgetxattr(2).
+func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return getxattr(t, args, nofollowFinalSymlink)
+}
+
+func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := args[3].SizeT()
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+ Name: name,
+ Size: uint64(size),
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ n, err := copyOutXattrValue(t, valueAddr, size, value)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Fgetxattr implements Linux syscall fgetxattr(2).
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := args[3].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
+ if err != nil {
+ return 0, nil, err
+ }
+ n, err := copyOutXattrValue(t, valueAddr, size, value)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Setxattr implements Linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, setxattr(t, args, followFinalSymlink)
+}
+
+// Lsetxattr implements Linux syscall lsetxattr(2).
+func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, setxattr(t, args, nofollowFinalSymlink)
+}
+
+func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := args[3].SizeT()
+ flags := args[4].Int()
+
+ if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+ return syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return err
+ }
+ value, err := copyInXattrValue(t, valueAddr, size)
+ if err != nil {
+ return err
+ }
+
+ return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+ Name: name,
+ Value: value,
+ Flags: uint32(flags),
+ })
+}
+
+// Fsetxattr implements Linux syscall fsetxattr(2).
+func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := args[3].SizeT()
+ flags := args[4].Int()
+
+ if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+ value, err := copyInXattrValue(t, valueAddr, size)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
+ Name: name,
+ Value: value,
+ Flags: uint32(flags),
+ })
+}
+
+// Removexattr implements Linux syscall removexattr(2).
+func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, removexattr(t, args, followFinalSymlink)
+}
+
+// Lremovexattr implements Linux syscall lremovexattr(2).
+func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, removexattr(t, args, nofollowFinalSymlink)
+}
+
+func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+
+ path, err := copyInPath(t, pathAddr)
+ if err != nil {
+ return err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+ if err != nil {
+ return err
+ }
+ defer tpop.Release()
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return err
+ }
+
+ return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+}
+
+// Fremovexattr implements Linux syscall fremovexattr(2).
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ nameAddr := args[1].Pointer()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, file.Removexattr(t, name)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+ name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+ if err != nil {
+ if err == syserror.ENAMETOOLONG {
+ return "", syserror.ERANGE
+ }
+ return "", err
+ }
+ if len(name) == 0 {
+ return "", syserror.ERANGE
+ }
+ return name, nil
+}
+
+func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
+ if size > linux.XATTR_LIST_MAX {
+ size = linux.XATTR_LIST_MAX
+ }
+ var buf bytes.Buffer
+ for _, name := range names {
+ buf.WriteString(name)
+ buf.WriteByte(0)
+ }
+ if size == 0 {
+ // Return the size that would be required to accomodate the list.
+ return buf.Len(), nil
+ }
+ if buf.Len() > int(size) {
+ if size >= linux.XATTR_LIST_MAX {
+ return 0, syserror.E2BIG
+ }
+ return 0, syserror.ERANGE
+ }
+ return t.CopyOutBytes(listAddr, buf.Bytes())
+}
+
+func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
+ if size > linux.XATTR_SIZE_MAX {
+ return "", syserror.E2BIG
+ }
+ buf := make([]byte, size)
+ if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+ return "", err
+ }
+ return gohacks.StringFromImmutableBytes(buf), nil
+}
+
+func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
+ if size > linux.XATTR_SIZE_MAX {
+ size = linux.XATTR_SIZE_MAX
+ }
+ if size == 0 {
+ // Return the size that would be required to accomodate the value.
+ return len(value), nil
+ }
+ if len(value) > int(size) {
+ if size >= linux.XATTR_SIZE_MAX {
+ return 0, syserror.E2BIG
+ }
+ return 0, syserror.ERANGE
+ }
+ return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
+}
diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go
new file mode 100644
index 000000000..f88055676
--- /dev/null
+++ b/pkg/sentry/syscalls/syscalls.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package syscalls is the interface from the application to the kernel.
+// Traditionally, syscalls is the interface that is used by applications to
+// request services from the kernel of a operating system. We provide a
+// user-mode kernel that needs to handle those requests coming from unmodified
+// applications. Therefore, we still use the term "syscalls" to denote this
+// interface.
+//
+// Note that the stubs in this package may merely provide the interface, not
+// the actual implementation. It just makes writing syscall stubs
+// straightforward.
+package syscalls
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Supported returns a syscall that is fully supported.
+func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportFull,
+ Note: "Fully Supported.",
+ }
+}
+
+// PartiallySupported returns a syscall that has a partial implementation.
+func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportPartial,
+ Note: note,
+ URLs: urls,
+ }
+}
+
+// Error returns a syscall handler that will always give the passed error.
+func Error(name string, err error, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, err
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q.", note, err.Error()),
+ URLs: urls,
+ }
+}
+
+// ErrorWithEvent gives a syscall function that sends an unimplemented
+// syscall event via the event channel and returns the passed error.
+func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, err
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q.", note, err.Error()),
+ URLs: urls,
+ }
+}
+
+// CapError gives a syscall function that checks for capability c. If the task
+// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged
+// tasks, it will seem like there is an implementation.
+func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ if !t.HasCapability(c) {
+ return 0, nil, syserror.EPERM
+ }
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS),
+ URLs: urls,
+ }
+}