84 files changed, 20630 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD
new file mode 100644
index 000000000..b8d1bd415
--- /dev/null
+++ b/pkg/sentry/syscalls/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "syscalls",
+    srcs = [
+        "epoll.go",
+        "syscalls.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/epoll",
+        "//pkg/sentry/kernel/time",
+        "//pkg/syserror",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
new file mode 100644
index 000000000..d9fb808c0
--- /dev/null
+++ b/pkg/sentry/syscalls/epoll.go
@@ -0,0 +1,173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syscalls
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// CreateEpoll implements epoll_create(2): it creates a new epoll file and
+// installs it as a new file descriptor for t, applying closeOnExec.
+func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) {
+	epollFile := epoll.NewEventPoll(t)
+	// Release this function's reference on return, on success and failure.
+	defer epollFile.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: closeOnExec}
+	newFD, err := t.NewFDFrom(0, epollFile, fdFlags)
+	if err != nil {
+		return 0, err
+	}
+	return newFD, nil
+}
+
+// AddEpoll implements epoll_ctl(2) for EPOLL_CTL_ADD, registering fd with the
+// epoll instance epfd. EBADF is returned when either descriptor is unusable.
+func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+	// Resolve the descriptor that should refer to the epoll instance.
+	epFile := t.GetFile(epfd)
+	if epFile == nil {
+		return syserror.EBADF
+	}
+	defer epFile.DecRef()
+
+	// Resolve the descriptor being registered.
+	target := t.GetFile(fd)
+	if target == nil {
+		return syserror.EBADF
+	}
+	defer target.DecRef()
+
+	// Only files backed by *epoll.EventPoll accept entries.
+	ep, isEpoll := epFile.FileOperations.(*epoll.EventPoll)
+	if !isEpoll {
+		return syserror.EBADF
+	}
+	id := epoll.FileIdentifier{target, fd}
+	return ep.AddEntry(id, flags, mask, userData)
+}
+
+// UpdateEpoll implements epoll_ctl(2) for EPOLL_CTL_MOD, updating fd's entry in
+// the epoll instance epfd. EBADF is returned when either descriptor is unusable.
+func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+	// Resolve the descriptor that should refer to the epoll instance.
+	epFile := t.GetFile(epfd)
+	if epFile == nil {
+		return syserror.EBADF
+	}
+	defer epFile.DecRef()
+
+	// Resolve the descriptor whose entry is being modified.
+	target := t.GetFile(fd)
+	if target == nil {
+		return syserror.EBADF
+	}
+	defer target.DecRef()
+
+	// Only files backed by *epoll.EventPoll hold entries.
+	ep, isEpoll := epFile.FileOperations.(*epoll.EventPoll)
+	if !isEpoll {
+		return syserror.EBADF
+	}
+	id := epoll.FileIdentifier{target, fd}
+	return ep.UpdateEntry(id, flags, mask, userData)
+}
+
+// RemoveEpoll implements epoll_ctl(2) for EPOLL_CTL_DEL, removing fd's entry from
+// the epoll instance epfd. EBADF is returned when either descriptor is unusable.
+func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
+	// Resolve the descriptor that should refer to the epoll instance.
+	epFile := t.GetFile(epfd)
+	if epFile == nil {
+		return syserror.EBADF
+	}
+	defer epFile.DecRef()
+
+	// Resolve the descriptor whose entry is being removed.
+	target := t.GetFile(fd)
+	if target == nil {
+		return syserror.EBADF
+	}
+	defer target.DecRef()
+
+	// Only files backed by *epoll.EventPoll hold entries.
+	ep, isEpoll := epFile.FileOperations.(*epoll.EventPoll)
+	if !isEpoll {
+		return syserror.EBADF
+	}
+	id := epoll.FileIdentifier{target, fd}
+	return ep.RemoveEntry(id)
+}
+
+// WaitEpoll implements epoll_wait(2) on the epoll instance fd, returning up to
+// max events. A zero timeout never blocks, a positive timeout bounds the wait
+// in milliseconds, and a negative one sets no deadline. Timing out yields nil, nil.
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) {
+	epFile := t.GetFile(fd)
+	if epFile == nil {
+		return nil, syserror.EBADF
+	}
+	defer epFile.DecRef()
+
+	// fd must be backed by an epoll instance.
+	ep, isEpoll := epFile.FileOperations.(*epoll.EventPoll)
+	if !isEpoll {
+		return nil, syserror.EBADF
+	}
+
+	// Fast path: hand back whatever is ready now. A zero timeout means the
+	// caller asked for a non-blocking poll, so return even with no events.
+	events := ep.ReadEvents(max)
+	if timeout == 0 || len(events) != 0 {
+		return events, nil
+	}
+
+	// Blocking path: a positive timeout becomes an absolute deadline on the
+	// kernel's monotonic clock; otherwise hasDeadline stays false.
+	var hasDeadline bool
+	var deadline ktime.Time
+	if timeout > 0 {
+		wait := time.Duration(timeout) * time.Millisecond
+		deadline = t.Kernel().MonotonicClock().Now().Add(wait)
+		hasDeadline = true
+	}
+
+	// Register for readability notifications before re-reading below.
+	entry, notifyCh := waiter.NewChannelEntry(nil)
+	ep.EventRegister(&entry, waiter.EventIn)
+	defer ep.EventUnregister(&entry)
+
+	// Retry until events arrive, the deadline passes, or the wait fails.
+	for {
+		if events = ep.ReadEvents(max); len(events) != 0 {
+			return events, nil
+		}
+
+		switch err := t.BlockWithDeadline(notifyCh, hasDeadline, deadline); err {
+		case nil:
+		case syserror.ETIMEDOUT:
+			return nil, nil
+		default:
+			return nil, err
+		}
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
new file mode 100644
index 000000000..217fcfef2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -0,0 +1,103 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "linux",
+    srcs = [
+        "error.go",
+        "flags.go",
+        "linux64.go",
+        "sigset.go",
+        "sys_aio.go",
+        "sys_capability.go",
+        "sys_clone_amd64.go",
+        "sys_clone_arm64.go",
+        "sys_epoll.go",
+        "sys_eventfd.go",
+        "sys_file.go",
+        "sys_futex.go",
+        "sys_getdents.go",
+        "sys_identity.go",
+        "sys_inotify.go",
+        "sys_lseek.go",
+        "sys_mempolicy.go",
+        "sys_mmap.go",
+        "sys_mount.go",
+        "sys_pipe.go",
+        "sys_poll.go",
+        "sys_prctl.go",
+        "sys_random.go",
+        "sys_read.go",
+        "sys_rlimit.go",
+        "sys_rseq.go",
+        "sys_rusage.go",
+        "sys_sched.go",
+        "sys_seccomp.go",
+        "sys_sem.go",
+        "sys_shm.go",
+        "sys_signal.go",
+        "sys_socket.go",
+        "sys_splice.go",
+        "sys_stat.go",
+        "sys_stat_amd64.go",
+        "sys_stat_arm64.go",
+        "sys_sync.go",
+        "sys_sysinfo.go",
+        "sys_syslog.go",
+        "sys_thread.go",
+        "sys_time.go",
+        "sys_timer.go",
+        "sys_timerfd.go",
+        "sys_tls_amd64.go",
+        "sys_tls_arm64.go",
+        "sys_utsname.go",
+        "sys_write.go",
+        "sys_xattr.go",
+        "timespec.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi",
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/bpf",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/metric",
+        "//pkg/rand",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/anon",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fs/timerfd",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/epoll",
+        "//pkg/sentry/kernel/eventfd",
+        "//pkg/sentry/kernel/fasync",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/sched",
+        "//pkg/sentry/kernel/shm",
+        "//pkg/sentry/kernel/signalfd",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/syscalls",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserr",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
new file mode 100644
index 000000000..64de56ac5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -0,0 +1,157 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/metric"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var (
+	partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.")
+	partialResultOnce   sync.Once // Runs partialResultMetric.Increment via Do in the error handlers below.
+)
+
+// HandleIOErrorVFS2 translates the error from a VFS2 I/O operation into the
+// error the syscall should return. When partialResult is set, some errors are
+// consumed so that only the partial read/write is reported.
+// op and f are used only to describe the operation when logging unknown errors.
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
+	known, sysErr := handleIOErrorImpl(t, partialResult, err, intr, op)
+	if sysErr != nil {
+		return sysErr
+	}
+	if !known {
+		// Unknown error with a partial result: log the original err; sysErr is nil here.
+		vfsObj := f.Mount().Filesystem().VirtualFilesystem()
+		root := vfs.RootFromContext(t)
+		name, _ := vfsObj.PathnameWithDeleted(t, root, f.VirtualDentry())
+		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+		partialResultOnce.Do(partialResultMetric.Increment)
+	}
+	return nil
+}
+
+// handleIOError translates the error from an I/O operation into the error the
+// syscall should return. When partialResult is set, some errors are consumed
+// so that only the partial read/write is reported.
+// op and f are used only to describe the operation when logging unknown errors.
+func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
+	known, sysErr := handleIOErrorImpl(t, partialResult, err, intr, op)
+	if sysErr != nil {
+		return sysErr
+	}
+	if !known {
+		// Unknown error with a partial result: log the original err; sysErr is nil here.
+		name, _ := f.Dirent.FullName(nil /* ignore chroot */)
+		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+		partialResultOnce.Do(partialResultMetric.Increment)
+	}
+	return nil
+}
+
+// handleIOErrorImpl handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// The bool is false if err is unknown; the error is what the syscall returns.
+func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) {
+	switch err {
+	case nil:
+		// Typical successful syscall.
+		return true, nil
+	case io.EOF:
+		// EOF is always consumed. If this is a partial read/write
+		// (result != 0), the application will see that, otherwise
+		// they will see 0.
+		return true, nil
+	case syserror.ErrExceedsFileSizeLimit:
+		// Ignore partialResult because this error only applies to
+		// normal files, and for those files we cannot accumulate
+		// write results.
+		//
+		// Do not consume the error and return it as EFBIG.
+		// Simultaneously send a SIGXFSZ per setrlimit(2).
+		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
+		return true, syserror.EFBIG
+	case syserror.ErrInterrupted:
+		// The syscall was interrupted. Return nil if it completed
+		// partially, otherwise return the error code that the syscall
+		// needs (to indicate to the kernel what it should do).
+		if partialResult {
+			return true, nil
+		}
+		return true, intr
+	}
+
+	if !partialResult {
+		// Typical syscall error.
+		return true, err
+	}
+
+	switch err {
+	case syserror.EINTR:
+		// Syscall interrupted, but completed a partial
+		// read/write. Like ErrWouldBlock, since we have a
+		// partial read/write, we consume the error and return
+		// the partial result.
+		return true, nil
+	case syserror.EFAULT:
+		// EFAULT is only shown to the user if nothing was
+		// read/written. If we read something (this case), they see
+		// a partial read/write. They will then presumably try again
+		// with an incremented buffer, which will EFAULT with
+		// result == 0.
+		return true, nil
+	case syserror.EPIPE:
+		// Writes to a pipe or socket will return EPIPE if the other
+		// side is gone. The partial write is returned. EPIPE will be
+		// returned on the next call.
+		//
+		// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
+		// also be sent to the application.
+		return true, nil
+	case syserror.ENOSPC:
+		// Similar to EPIPE. Return what we wrote this time, and let
+		// ENOSPC be returned on the next call.
+		return true, nil
+	case syserror.ECONNRESET:
+		// For TCP sendfile connections, we may have a reset. But we
+		// should just return n as the result.
+		return true, nil
+	case syserror.ErrWouldBlock:
+		// Syscall would block, but completed a partial read/write.
+		// This case should only be returned by IssueIO for nonblocking
+		// files. Since we have a partial read/write, we consume
+		// ErrWouldBlock, returning the partial result.
+		return true, nil
+	}
+
+	switch err.(type) {
+	case kernel.SyscallRestartErrno:
+		// Identical to the EINTR case.
+		return true, nil
+	}
+
+	// Error is unknown and cannot be properly handled.
+	return false, nil
+}
diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go
new file mode 100644
index 000000000..07961dad9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/flags.go
@@ -0,0 +1,55 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+// flagsToPermissions derives the permissions needed to open a file with the
+// given Linux open flags. O_TRUNC requires write permission regardless of
+// the access mode.
+func flagsToPermissions(mask uint) (p fs.PermMask) {
+	accessMode := mask & linux.O_ACCMODE
+
+	// Read is needed for O_RDONLY and O_RDWR.
+	if accessMode == linux.O_RDONLY || accessMode == linux.O_RDWR {
+		p.Read = true
+	}
+
+	// Write is needed for O_WRONLY, O_RDWR, and any truncating open.
+	if accessMode == linux.O_WRONLY || accessMode == linux.O_RDWR || mask&linux.O_TRUNC != 0 {
+		p.Write = true
+	}
+	return p
+}
+
+// linuxToFlags converts Linux file flags to a FileFlags object.
+func linuxToFlags(mask uint) fs.FileFlags {
+	return fs.FileFlags{
+		Direct:      mask&linux.O_DIRECT != 0,
+		DSync:       mask&(linux.O_DSYNC|linux.O_SYNC) != 0,
+		Sync:        mask&linux.O_SYNC != 0,
+		NonBlocking: mask&linux.O_NONBLOCK != 0,
+		Read:        (mask & linux.O_ACCMODE) != linux.O_WRONLY,
+		Write:       (mask & linux.O_ACCMODE) != linux.O_RDONLY,
+		Append:      mask&linux.O_APPEND != 0,
+		Directory:   mask&linux.O_DIRECTORY != 0,
+		Async:       mask&linux.O_ASYNC != 0,
+		LargeFile:   mask&linux.O_LARGEFILE != 0,
+		Truncate:    mask&linux.O_TRUNC != 0,
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
new file mode 100644
index 000000000..ea4f9b1a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -0,0 +1,736 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package linux provides syscall tables for amd64 Linux.
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	// LinuxSysname is the OS name advertised by gVisor (Version.Sysname of AMD64 below).
+	LinuxSysname = "Linux"
+
+	// LinuxRelease is the Linux release number advertised by gVisor (Version.Release of AMD64 below).
+	LinuxRelease = "4.4.0"
+
+	// LinuxVersion is the version info advertised by gVisor (Version.Version of AMD64 below).
+	LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016"
+)
+
+// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
+// numbers from Linux 4.4.
+var AMD64 = &kernel.SyscallTable{
+	OS:   abi.Linux,
+	Arch: arch.AMD64,
+	Version: kernel.Version{
+		// Version 4.4 is chosen as a stable, longterm version of Linux, which
+		// guides the interface provided by this syscall table. The build
+		// version is that for a clean build with default kernel config, at 5
+		// minutes after v4.4 was tagged.
+		Sysname: LinuxSysname,
+		Release: LinuxRelease,
+		Version: LinuxVersion,
+	},
+	AuditNumber: linux.AUDIT_ARCH_X86_64,
+	Table: map[uintptr]kernel.Syscall{
+		0:   syscalls.Supported("read", Read),
+		1:   syscalls.Supported("write", Write),
+		2:   syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil),
+		3:   syscalls.Supported("close", Close),
+		4:   syscalls.Supported("stat", Stat),
+		5:   syscalls.Supported("fstat", Fstat),
+		6:   syscalls.Supported("lstat", Lstat),
+		7:   syscalls.Supported("poll", Poll),
+		8:   syscalls.Supported("lseek", Lseek),
+		9:   syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+		10:  syscalls.Supported("mprotect", Mprotect),
+		11:  syscalls.Supported("munmap", Munmap),
+		12:  syscalls.Supported("brk", Brk),
+		13:  syscalls.Supported("rt_sigaction", RtSigaction),
+		14:  syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+		15:  syscalls.Supported("rt_sigreturn", RtSigreturn),
+		16:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+		17:  syscalls.Supported("pread64", Pread64),
+		18:  syscalls.Supported("pwrite64", Pwrite64),
+		19:  syscalls.Supported("readv", Readv),
+		20:  syscalls.Supported("writev", Writev),
+		21:  syscalls.Supported("access", Access),
+		22:  syscalls.Supported("pipe", Pipe),
+		23:  syscalls.Supported("select", Select),
+		24:  syscalls.Supported("sched_yield", SchedYield),
+		25:  syscalls.Supported("mremap", Mremap),
+		26:  syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+		27:  syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+		28:  syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+		29:  syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+		30:  syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+		31:  syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+		32:  syscalls.Supported("dup", Dup),
+		33:  syscalls.Supported("dup2", Dup2),
+		34:  syscalls.Supported("pause", Pause),
+		35:  syscalls.Supported("nanosleep", Nanosleep),
+		36:  syscalls.Supported("getitimer", Getitimer),
+		37:  syscalls.Supported("alarm", Alarm),
+		38:  syscalls.Supported("setitimer", Setitimer),
+		39:  syscalls.Supported("getpid", Getpid),
+		40:  syscalls.Supported("sendfile", Sendfile),
+		41:  syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+		42:  syscalls.Supported("connect", Connect),
+		43:  syscalls.Supported("accept", Accept),
+		44:  syscalls.Supported("sendto", SendTo),
+		45:  syscalls.Supported("recvfrom", RecvFrom),
+		46:  syscalls.Supported("sendmsg", SendMsg),
+		47:  syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+		48:  syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+		49:  syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+		50:  syscalls.Supported("listen", Listen),
+		51:  syscalls.Supported("getsockname", GetSockName),
+		52:  syscalls.Supported("getpeername", GetPeerName),
+		53:  syscalls.Supported("socketpair", SocketPair),
+		54:  syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+		55:  syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+		56:  syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+		57:  syscalls.Supported("fork", Fork),
+		58:  syscalls.Supported("vfork", Vfork),
+		59:  syscalls.Supported("execve", Execve),
+		60:  syscalls.Supported("exit", Exit),
+		61:  syscalls.Supported("wait4", Wait4),
+		62:  syscalls.Supported("kill", Kill),
+		63:  syscalls.Supported("uname", Uname),
+		64:  syscalls.Supported("semget", Semget),
+		65:  syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+		66:  syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+		67:  syscalls.Supported("shmdt", Shmdt),
+		68:  syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		69:  syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		70:  syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		71:  syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		72:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+		73:  syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+		74:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+		75:  syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+		76:  syscalls.Supported("truncate", Truncate),
+		77:  syscalls.Supported("ftruncate", Ftruncate),
+		78:  syscalls.Supported("getdents", Getdents),
+		79:  syscalls.Supported("getcwd", Getcwd),
+		80:  syscalls.Supported("chdir", Chdir),
+		81:  syscalls.Supported("fchdir", Fchdir),
+		82:  syscalls.Supported("rename", Rename),
+		83:  syscalls.Supported("mkdir", Mkdir),
+		84:  syscalls.Supported("rmdir", Rmdir),
+		85:  syscalls.Supported("creat", Creat),
+		86:  syscalls.Supported("link", Link),
+		87:  syscalls.Supported("unlink", Unlink),
+		88:  syscalls.Supported("symlink", Symlink),
+		89:  syscalls.Supported("readlink", Readlink),
+		90:  syscalls.Supported("chmod", Chmod),
+		91:  syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+		92:  syscalls.Supported("chown", Chown),
+		93:  syscalls.Supported("fchown", Fchown),
+		94:  syscalls.Supported("lchown", Lchown),
+		95:  syscalls.Supported("umask", Umask),
+		96:  syscalls.Supported("gettimeofday", Gettimeofday),
+		97:  syscalls.Supported("getrlimit", Getrlimit),
+		98:  syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+		99:  syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+		100: syscalls.Supported("times", Times),
+		101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+		102: syscalls.Supported("getuid", Getuid),
+		103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+		104: syscalls.Supported("getgid", Getgid),
+		105: syscalls.Supported("setuid", Setuid),
+		106: syscalls.Supported("setgid", Setgid),
+		107: syscalls.Supported("geteuid", Geteuid),
+		108: syscalls.Supported("getegid", Getegid),
+		109: syscalls.Supported("setpgid", Setpgid),
+		110: syscalls.Supported("getppid", Getppid),
+		111: syscalls.Supported("getpgrp", Getpgrp),
+		112: syscalls.Supported("setsid", Setsid),
+		113: syscalls.Supported("setreuid", Setreuid),
+		114: syscalls.Supported("setregid", Setregid),
+		115: syscalls.Supported("getgroups", Getgroups),
+		116: syscalls.Supported("setgroups", Setgroups),
+		117: syscalls.Supported("setresuid", Setresuid),
+		118: syscalls.Supported("getresuid", Getresuid),
+		119: syscalls.Supported("setresgid", Setresgid),
+		120: syscalls.Supported("getresgid", Getresgid),
+		121: syscalls.Supported("getpgid", Getpgid),
+		122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		124: syscalls.Supported("getsid", Getsid),
+		125: syscalls.Supported("capget", Capget),
+		126: syscalls.Supported("capset", Capset),
+		127: syscalls.Supported("rt_sigpending", RtSigpending),
+		128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+		129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+		130: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+		131: syscalls.Supported("sigaltstack", Sigaltstack),
+		132: syscalls.Supported("utime", Utime),
+		133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil),
+		134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil),
+		135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+		136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil),
+		137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+		138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+		139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
+		140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+		141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
+		142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+		143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+		144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+		145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+		146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+		147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+		148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+		149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+		154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil),
+		155: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+		156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil),
+		157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+		158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil),
+		159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+		160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+		161: syscalls.Supported("chroot", Chroot),
+		162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
+		163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+		164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+		165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+		166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
+		167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+		168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+		169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+		170: syscalls.Supported("sethostname", Sethostname),
+		171: syscalls.Supported("setdomainname", Setdomainname),
+		172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
+		173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
+		174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
+		175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+		176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+		177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+		178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+		179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+		180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+		181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		186: syscalls.Supported("gettid", Gettid),
+		187: syscalls.Supported("readahead", Readahead),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
+		200: syscalls.Supported("tkill", Tkill),
+		201: syscalls.Supported("time", Time),
+		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+		203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+		204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+		205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+		206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+		212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+		213: syscalls.Supported("epoll_create", EpollCreate),
+		214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil),
+		215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil),
+		216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+		217: syscalls.Supported("getdents64", Getdents64),
+		218: syscalls.Supported("set_tid_address", SetTidAddress),
+		219: syscalls.Supported("restart_syscall", RestartSyscall),
+		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
+		221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+		222: syscalls.Supported("timer_create", TimerCreate),
+		223: syscalls.Supported("timer_settime", TimerSettime),
+		224: syscalls.Supported("timer_gettime", TimerGettime),
+		225: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+		226: syscalls.Supported("timer_delete", TimerDelete),
+		227: syscalls.Supported("clock_settime", ClockSettime),
+		228: syscalls.Supported("clock_gettime", ClockGettime),
+		229: syscalls.Supported("clock_getres", ClockGetres),
+		230: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+		231: syscalls.Supported("exit_group", ExitGroup),
+		232: syscalls.Supported("epoll_wait", EpollWait),
+		233: syscalls.Supported("epoll_ctl", EpollCtl),
+		234: syscalls.Supported("tgkill", Tgkill),
+		235: syscalls.Supported("utimes", Utimes),
+		236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil),
+		237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+		238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+		239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+		240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),         // TODO(b/29354921)
+		241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),    // TODO(b/29354921)
+		243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+		244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),   // TODO(b/29354921)
+		246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+		247: syscalls.Supported("waitid", Waitid),
+		248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+		249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+		250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
+		251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
+		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+		257: syscalls.Supported("openat", Openat),
+		258: syscalls.Supported("mkdirat", Mkdirat),
+		259: syscalls.Supported("mknodat", Mknodat),
+		260: syscalls.Supported("fchownat", Fchownat),
+		261: syscalls.Supported("futimesat", Futimesat),
+		262: syscalls.Supported("fstatat", Fstatat),
+		263: syscalls.Supported("unlinkat", Unlinkat),
+		264: syscalls.Supported("renameat", Renameat),
+		265: syscalls.Supported("linkat", Linkat),
+		266: syscalls.Supported("symlinkat", Symlinkat),
+		267: syscalls.Supported("readlinkat", Readlinkat),
+		268: syscalls.Supported("fchmodat", Fchmodat),
+		269: syscalls.Supported("faccessat", Faccessat),
+		270: syscalls.Supported("pselect", Pselect),
+		271: syscalls.Supported("ppoll", Ppoll),
+		272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+		273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		275: syscalls.Supported("splice", Splice),
+		276: syscalls.Supported("tee", Tee),
+		277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+		278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+		279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil),                               // requires cap_sys_nice (mostly)
+		280: syscalls.Supported("utimensat", Utimensat),
+		281: syscalls.Supported("epoll_pwait", EpollPwait),
+		282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+		283: syscalls.Supported("timerfd_create", TimerfdCreate),
+		284: syscalls.Supported("eventfd", Eventfd),
+		285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+		286: syscalls.Supported("timerfd_settime", TimerfdSettime),
+		287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
+		288: syscalls.Supported("accept4", Accept4),
+		289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+		290: syscalls.Supported("eventfd2", Eventfd2),
+		291: syscalls.Supported("epoll_create1", EpollCreate1),
+		292: syscalls.Supported("dup3", Dup3),
+		293: syscalls.Supported("pipe2", Pipe2),
+		294: syscalls.Supported("inotify_init1", InotifyInit1),
+		295: syscalls.Supported("preadv", Preadv),
+		296: syscalls.Supported("pwritev", Pwritev),
+		297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+		298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+		299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+		300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		302: syscalls.Supported("prlimit64", Prlimit64),
+		303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+		306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+		307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+		308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+		309: syscalls.Supported("getcpu", Getcpu),
+		310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+		313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+		314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}),                                           // TODO(b/118902772)
+		317: syscalls.Supported("seccomp", Seccomp),
+		318: syscalls.Supported("getrandom", GetRandom),
+		319: syscalls.Supported("memfd_create", MemfdCreate),
+		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
+		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+		322: syscalls.Supported("execveat", Execveat),
+		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
+		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+		// Syscalls implemented after 325 are "backports" from versions
+		// of Linux after 4.4.
+		326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+		327: syscalls.Supported("preadv2", Preadv2),
+		328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
+		332: syscalls.Supported("statx", Statx),
+		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+	},
+	Emulate: map[usermem.Addr]uintptr{
+		0xffffffffff600000: 96,  // vsyscall gettimeofday(2)
+		0xffffffffff600400: 201, // vsyscall time(2)
+		0xffffffffff600800: 309, // vsyscall getcpu(2)
+	},
+	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, syserror.ENOSYS
+	},
+}
+
+// ARM64 is a table of the Linux arm64 syscall API, keyed by the corresponding
+// syscall numbers from Linux 4.4.
+var ARM64 = &kernel.SyscallTable{
+	OS:   abi.Linux,
+	Arch: arch.ARM64,
+	Version: kernel.Version{
+		Sysname: LinuxSysname,
+		Release: LinuxRelease,
+		Version: LinuxVersion,
+	},
+	AuditNumber: linux.AUDIT_ARCH_AARCH64,
+	Table: map[uintptr]kernel.Syscall{
+		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
+		17:  syscalls.Supported("getcwd", Getcwd),
+		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+		19:  syscalls.Supported("eventfd2", Eventfd2),
+		20:  syscalls.Supported("epoll_create1", EpollCreate1),
+		21:  syscalls.Supported("epoll_ctl", EpollCtl),
+		22:  syscalls.Supported("epoll_pwait", EpollPwait),
+		23:  syscalls.Supported("dup", Dup),
+		24:  syscalls.Supported("dup3", Dup3),
+		25:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+		26:  syscalls.Supported("inotify_init1", InotifyInit1),
+		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		29:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+		30:  syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		31:  syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		32:  syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+		33:  syscalls.Supported("mknodat", Mknodat),
+		34:  syscalls.Supported("mkdirat", Mkdirat),
+		35:  syscalls.Supported("unlinkat", Unlinkat),
+		36:  syscalls.Supported("symlinkat", Symlinkat),
+		37:  syscalls.Supported("linkat", Linkat),
+		38:  syscalls.Supported("renameat", Renameat),
+		39:  syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
+		40:  syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+		41:  syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+		42:  syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+		43:  syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+		44:  syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+		45:  syscalls.Supported("truncate", Truncate),
+		46:  syscalls.Supported("ftruncate", Ftruncate),
+		47:  syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+		48:  syscalls.Supported("faccessat", Faccessat),
+		49:  syscalls.Supported("chdir", Chdir),
+		50:  syscalls.Supported("fchdir", Fchdir),
+		51:  syscalls.Supported("chroot", Chroot),
+		52:  syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+		53:  syscalls.Supported("fchmodat", Fchmodat),
+		54:  syscalls.Supported("fchownat", Fchownat),
+		55:  syscalls.Supported("fchown", Fchown),
+		56:  syscalls.Supported("openat", Openat),
+		57:  syscalls.Supported("close", Close),
+		58:  syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+		59:  syscalls.Supported("pipe2", Pipe2),
+		60:  syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+		61:  syscalls.Supported("getdents64", Getdents64),
+		62:  syscalls.Supported("lseek", Lseek),
+		63:  syscalls.Supported("read", Read),
+		64:  syscalls.Supported("write", Write),
+		65:  syscalls.Supported("readv", Readv),
+		66:  syscalls.Supported("writev", Writev),
+		67:  syscalls.Supported("pread64", Pread64),
+		68:  syscalls.Supported("pwrite64", Pwrite64),
+		69:  syscalls.Supported("preadv", Preadv),
+		70:  syscalls.Supported("pwritev", Pwritev),
+		71:  syscalls.Supported("sendfile", Sendfile),
+		72:  syscalls.Supported("pselect", Pselect),
+		73:  syscalls.Supported("ppoll", Ppoll),
+		74:  syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+		75:  syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),              // TODO(b/29354098)
+		76:  syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+		77:  syscalls.Supported("tee", Tee),
+		78:  syscalls.Supported("readlinkat", Readlinkat),
+		79:  syscalls.Supported("fstatat", Fstatat),
+		80:  syscalls.Supported("fstat", Fstat),
+		81:  syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
+		82:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+		83:  syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+		84:  syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+		85:  syscalls.Supported("timerfd_create", TimerfdCreate),
+		86:  syscalls.Supported("timerfd_settime", TimerfdSettime),
+		87:  syscalls.Supported("timerfd_gettime", TimerfdGettime),
+		88:  syscalls.Supported("utimensat", Utimensat),
+		89:  syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+		90:  syscalls.Supported("capget", Capget),
+		91:  syscalls.Supported("capset", Capset),
+		92:  syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+		93:  syscalls.Supported("exit", Exit),
+		94:  syscalls.Supported("exit_group", ExitGroup),
+		95:  syscalls.Supported("waitid", Waitid),
+		96:  syscalls.Supported("set_tid_address", SetTidAddress),
+		97:  syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+		98:  syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+		99:  syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		101: syscalls.Supported("nanosleep", Nanosleep),
+		102: syscalls.Supported("getitimer", Getitimer),
+		103: syscalls.Supported("setitimer", Setitimer),
+		104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+		105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+		106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+		107: syscalls.Supported("timer_create", TimerCreate),
+		108: syscalls.Supported("timer_gettime", TimerGettime),
+		109: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+		110: syscalls.Supported("timer_settime", TimerSettime),
+		111: syscalls.Supported("timer_delete", TimerDelete),
+		112: syscalls.Supported("clock_settime", ClockSettime),
+		113: syscalls.Supported("clock_gettime", ClockGettime),
+		114: syscalls.Supported("clock_getres", ClockGetres),
+		115: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+		116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+		117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+		118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+		119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+		120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+		121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+		122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+		123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+		124: syscalls.Supported("sched_yield", SchedYield),
+		125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+		126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+		127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+		128: syscalls.Supported("restart_syscall", RestartSyscall),
+		129: syscalls.Supported("kill", Kill),
+		130: syscalls.Supported("tkill", Tkill),
+		131: syscalls.Supported("tgkill", Tgkill),
+		132: syscalls.Supported("sigaltstack", Sigaltstack),
+		133: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+		134: syscalls.Supported("rt_sigaction", RtSigaction),
+		135: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+		136: syscalls.Supported("rt_sigpending", RtSigpending),
+		137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+		138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+		139: syscalls.Supported("rt_sigreturn", RtSigreturn),
+		140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
+		141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+		142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+		143: syscalls.Supported("setregid", Setregid),
+		144: syscalls.Supported("setgid", Setgid),
+		145: syscalls.Supported("setreuid", Setreuid),
+		146: syscalls.Supported("setuid", Setuid),
+		147: syscalls.Supported("setresuid", Setresuid),
+		148: syscalls.Supported("getresuid", Getresuid),
+		149: syscalls.Supported("setresgid", Setresgid),
+		150: syscalls.Supported("getresgid", Getresgid),
+		151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		153: syscalls.Supported("times", Times),
+		154: syscalls.Supported("setpgid", Setpgid),
+		155: syscalls.Supported("getpgid", Getpgid),
+		156: syscalls.Supported("getsid", Getsid),
+		157: syscalls.Supported("setsid", Setsid),
+		158: syscalls.Supported("getgroups", Getgroups),
+		159: syscalls.Supported("setgroups", Setgroups),
+		160: syscalls.Supported("uname", Uname),
+		161: syscalls.Supported("sethostname", Sethostname),
+		162: syscalls.Supported("setdomainname", Setdomainname),
+		163: syscalls.Supported("getrlimit", Getrlimit),
+		164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+		165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+		166: syscalls.Supported("umask", Umask),
+		167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+		168: syscalls.Supported("getcpu", Getcpu),
+		169: syscalls.Supported("gettimeofday", Gettimeofday),
+		170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+		171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+		172: syscalls.Supported("getpid", Getpid),
+		173: syscalls.Supported("getppid", Getppid),
+		174: syscalls.Supported("getuid", Getuid),
+		175: syscalls.Supported("geteuid", Geteuid),
+		176: syscalls.Supported("getgid", Getgid),
+		177: syscalls.Supported("getegid", Getegid),
+		178: syscalls.Supported("gettid", Gettid),
+		179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+		180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),         // TODO(b/29354921)
+		181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),    // TODO(b/29354921)
+		183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+		184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),   // TODO(b/29354921)
+		186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		190: syscalls.Supported("semget", Semget),
+		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
+		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+		195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+		196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+		197: syscalls.Supported("shmdt", Shmdt),
+		198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+		199: syscalls.Supported("socketpair", SocketPair),
+		200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+		201: syscalls.Supported("listen", Listen),
+		202: syscalls.Supported("accept", Accept),
+		203: syscalls.Supported("connect", Connect),
+		204: syscalls.Supported("getsockname", GetSockName),
+		205: syscalls.Supported("getpeername", GetPeerName),
+		206: syscalls.Supported("sendto", SendTo),
+		207: syscalls.Supported("recvfrom", RecvFrom),
+		208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+		209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+		210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+		211: syscalls.Supported("sendmsg", SendMsg),
+		212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+		213: syscalls.Supported("readahead", Readahead),
+		214: syscalls.Supported("brk", Brk),
+		215: syscalls.Supported("munmap", Munmap),
+		216: syscalls.Supported("mremap", Mremap),
+		217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+		218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+		219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
+		220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+		221: syscalls.Supported("execve", Execve),
+		222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+		223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+		224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+		225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+		226: syscalls.Supported("mprotect", Mprotect),
+		227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+		228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+		233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+		234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+		235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+		236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+		237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+		238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+		239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+		240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+		241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+		242: syscalls.Supported("accept4", Accept4),
+		243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+		260: syscalls.Supported("wait4", Wait4),
+		261: syscalls.Supported("prlimit64", Prlimit64),
+		262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+		267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+		268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+		269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+		270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+		273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+		274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}),                                           // TODO(b/118902772)
+		277: syscalls.Supported("seccomp", Seccomp),
+		278: syscalls.Supported("getrandom", GetRandom),
+		279: syscalls.Supported("memfd_create", MemfdCreate),
+		280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+		281: syscalls.Supported("execveat", Execveat),
+		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
+		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+		// Syscalls after 284 are "backports" from versions of Linux after 4.4.
+		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+		286: syscalls.Supported("preadv2", Preadv2),
+		287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
+		291: syscalls.Supported("statx", Statx),
+		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+	},
+	Emulate: map[usermem.Addr]uintptr{},
+	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, syserror.ENOSYS
+	},
+}
+
+func init() {
+	kernel.RegisterSyscallTable(AMD64) // Hand the AMD64 table to the kernel package.
+	kernel.RegisterSyscallTable(ARM64) // Likewise for the ARM64 table.
+}
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
new file mode 100644
index 000000000..434559b80
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -0,0 +1,71 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// CopyInSigSet reads a sigset_t from sigSetAddr, failing with EINVAL unless
+// size is linux.SignalSetSize, and clears KILL and STOP in the returned mask.
+//
+// TODO(gvisor.dev/issue/1624): This is only exported because
+// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2
+// syscalls are moved into this package, then they can be unexported.
+func CopyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) {
+	if size != linux.SignalSetSize {
+		return 0, syserror.EINVAL
+	}
+	raw := t.CopyScratchBuffer(8)
+	_, err := t.CopyInBytes(sigSetAddr, raw)
+	if err != nil {
+		return 0, err
+	}
+	return linux.SignalSet(usermem.ByteOrder.Uint64(raw)) &^ kernel.UnblockableSignals, nil
+}
+
+// copyOutSigSet writes mask to sigSetAddr as an 8-byte sigset_t.
+func copyOutSigSet(t *kernel.Task, sigSetAddr usermem.Addr, mask linux.SignalSet) error {
+	out := t.CopyScratchBuffer(8)
+	usermem.ByteOrder.PutUint64(out, uint64(mask))
+	_, copyErr := t.CopyOutBytes(sigSetAddr, out)
+	return copyErr
+}
+
+// copyInSigSetWithSize reads the following structure from addr
+//
+//   struct {
+//       sigset_t* sigset_addr;
+//       size_t sizeof_sigset;
+//   };
+//
+// and returns its sigset_addr and sizeof_sigset fields. Only an
+// Arch().Width() of 8 is supported; any other width yields ENOSYS.
+func copyInSigSetWithSize(t *kernel.Task, addr usermem.Addr) (usermem.Addr, uint, error) {
+	if t.Arch().Width() != 8 {
+		return 0, 0, syserror.ENOSYS
+	}
+	// Two consecutive 8-byte fields: the pointer first, then the size.
+	raw := t.CopyScratchBuffer(16)
+	if _, err := t.CopyInBytes(addr, raw); err != nil {
+		return 0, 0, err
+	}
+	sigSetAddr := usermem.Addr(usermem.ByteOrder.Uint64(raw[0:]))
+	sigSetSize := uint(usermem.ByteOrder.Uint64(raw[8:]))
+	return sigSetAddr, sigSetSize, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
new file mode 100644
index 000000000..ba2557c52
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -0,0 +1,382 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSetup implements linux syscall io_setup(2).
+func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	maxEvents := uint32(args[0].Int())
+	ctxAddr := args[1].Pointer()
+
+	// The aio ID is a native long in Linux, and user space is required to
+	// have zeroed the context pointer before the call.
+	var current uint64
+	if _, err := t.CopyIn(ctxAddr, &current); err != nil {
+		return 0, nil, err
+	}
+	if current != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	id, err := t.MemoryManager().NewAIOContext(t, maxEvents)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Hand the new ID back to user space; if that fails, destroy the context
+	// again so that it is not left behind.
+	if _, err := t.CopyOut(ctxAddr, &id); err != nil {
+		t.MemoryManager().DestroyAIOContext(t, id)
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// IoDestroy implements linux syscall io_destroy(2).
+func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	aioCtx := t.MemoryManager().DestroyAIOContext(t, args[0].Uint64())
+	if aioCtx == nil {
+		// No context exists under this ID.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Alternate between draining completed requests and waiting for pending
+	// ones; the loop ends once the context has no wait channel left to offer.
+	for {
+		aioCtx.Drain()
+
+		pending := aioCtx.WaitChannel()
+		if pending == nil {
+			break
+		}
+
+		// The task cannot be interrupted during the wait. Equivalent to
+		// TASK_UNINTERRUPTIBLE in Linux.
+		t.UninterruptibleSleepStart(true /* deactivate */)
+		<-pending
+		t.UninterruptibleSleepFinish(true /* activate */)
+	}
+
+	return 0, nil, nil
+}
+
+// IoGetevents implements linux syscall io_getevents(2).
+func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	minEvents := args[1].Int()
+	maxEvents := args[2].Int()
+	outAddr := args[3].Pointer()
+	timeoutAddr := args[4].Pointer()
+
+	// minEvents must lie within [0, maxEvents]; this also rejects a negative
+	// maxEvents.
+	if minEvents < 0 || minEvents > maxEvents {
+		return 0, nil, syserror.EINVAL
+	}
+
+	aioCtx, found := t.MemoryManager().LookupAIOContext(t, id)
+	if !found {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A non-null timespec is a relative timeout; turn it into an absolute
+	// deadline on the monotonic clock.
+	var (
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	if timeoutAddr != 0 {
+		timeout, err := copyTimespecIn(t, timeoutAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !timeout.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		haveDeadline = true
+		deadline = t.Kernel().MonotonicClock().Now().Add(timeout.ToDuration())
+	}
+
+	for n := int32(0); n < maxEvents; n++ {
+		var req interface{}
+		if n < minEvents {
+			// Still below the minimum: block until a request completes.
+			var err error
+			req, err = waitForRequest(aioCtx, t, haveDeadline, deadline)
+			switch {
+			case err == nil: // Got a request; fall out to the copy below.
+			case n > 0 || err == syserror.ETIMEDOUT:
+				return uintptr(n), nil, nil
+			default:
+				return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+			}
+		} else {
+			// Minimum satisfied: take only what is already available.
+			var ready bool
+			req, ready = aioCtx.PopRequest()
+			if !ready {
+				return uintptr(n), nil, nil
+			}
+		}
+
+		// Copy out the result. A failure is only reported when no event has
+		// been delivered yet; otherwise the partial count is returned.
+		if _, err := t.CopyOut(outAddr, req.(*linux.IOEvent)); err != nil {
+			if n > 0 {
+				return uintptr(n), nil, nil
+			}
+			return 0, nil, err
+		}
+		outAddr += usermem.Addr(linux.IOEventSize)
+	}
+
+	return uintptr(maxEvents), nil, nil
+}
+
+// waitForRequest returns the next completed request from ctx, blocking t (up
+// to deadline, if haveDeadline is set) while none is available.
+func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
+	for {
+		req, ready := ctx.PopRequest()
+		if ready {
+			return req, nil
+		}
+		// Nothing completed yet; a nil channel is treated as a destroyed context.
+		completion := ctx.WaitChannel()
+		if completion == nil {
+			return nil, syserror.EINVAL
+		}
+		if err := t.BlockWithDeadline(completion, haveDeadline, deadline); err != nil {
+			return nil, err
+		}
+	}
+}
+
+// memoryFor builds the usermem.IOSequence describing the user memory that cb
+// operates on. Commands that move no data get an empty sequence, and an
+// unrecognized opcode or a negative byte count yields EINVAL.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+	length := int(cb.Bytes)
+	if length < 0 {
+		// Linux also requires that this field fit in ssize_t.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+
+	// Since this I/O will be asynchronous with respect to t's task goroutine,
+	// we have no guarantee that t's AddressSpace will be active during the
+	// I/O. The same options are used for the single-buffer and the iovec
+	// cases.
+	opts := usermem.IOOpts{AddressSpaceActive: false}
+	buf := usermem.Addr(cb.Buf)
+
+	switch cb.OpCode {
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+		return t.SingleIOSequence(buf, length, opts)
+	case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+		return t.IovecsIOSequence(buf, length, opts)
+	case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+		// No user memory is involved.
+		return usermem.IOSequence{}, nil
+	default:
+		// Not a supported command.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+}
+
+// IoCancel implements linux syscall io_cancel(2).
+//
+// Cancellation is not presently supported: the call unconditionally fails
+// with ENOSYS (which indicates no support on this architecture).
+func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ENOSYS
+}
+
+// LINT.IfChange
+
+// getAIOCallback returns the callback that performs the request described by
+// cb on file and queues its result on actx. The callback releases the
+// references held on file and eventFile on every path, including cancellation.
+func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback {
+	return func(ctx context.Context) {
+		defer file.DecRef()
+		if eventFile != nil {
+			defer eventFile.DecRef()
+		}
+		if actx.Dead() {
+			actx.CancelPendingRequest()
+			return
+		}
+		ev := &linux.IOEvent{
+			Data: cb.Data,
+			Obj:  uint64(cbAddr),
+		}
+
+		var err error
+		switch cb.OpCode {
+		case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+			ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset)
+		case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+			ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset)
+		case linux.IOCB_CMD_FSYNC:
+			err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+		case linux.IOCB_CMD_FDSYNC:
+			err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData)
+		}
+
+		// On failure, the event result carries the negated errno.
+		if err != nil {
+			err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
+			ev.Result = -int64(kernel.ExtractErrno(err, 0))
+		}
+
+		// Queue the result before signalling the event file to avoid a wakeup race.
+		actx.FinishRequest(ev)
+		if eventFile != nil {
+			eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
+		}
+	}
+}
+
+// submitCallback validates a single callback and queues it for asynchronous
+// execution on the AIO context identified by id. It returns EBADF for an
+// unknown FD, EINVAL for a non-eventfd ResFD, a negative read/write offset or
+// an unknown context, and EAGAIN when the context is busy.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+	target := t.GetFile(cb.FD)
+	if target == nil {
+		// File not found.
+		return syserror.EBADF
+	}
+	defer target.DecRef()
+
+	// If IOCB_FLAG_RESFD is set, ResFD must name an eventfd.
+	var notify *fs.File
+	if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+		if notify = t.GetFile(cb.ResFD); notify == nil {
+			// Bad FD.
+			return syserror.EBADF
+		}
+		defer notify.DecRef()
+
+		if _, isEventFD := notify.FileOperations.(*eventfd.EventOperations); !isEventFD {
+			// Not an event FD.
+			return syserror.EINVAL
+		}
+	}
+
+	ioseq, err := memoryFor(t, cb)
+	if err != nil {
+		return err
+	}
+
+	// Reads and writes may not use a negative offset.
+	switch cb.OpCode {
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+		if cb.Offset < 0 {
+			return syserror.EINVAL
+		}
+	}
+
+	// Prepare the request.
+	aioCtx, found := t.MemoryManager().LookupAIOContext(t, id)
+	if !found {
+		return syserror.EINVAL
+	}
+	if !aioCtx.Prepare() {
+		// Context is busy.
+		return syserror.EAGAIN
+	}
+
+	// The request is set. The deferred DecRefs above drop this function's
+	// references on return, so take extra ones on behalf of the callback,
+	// which is what releases them when it executes on completion.
+	target.IncRef()
+	if notify != nil {
+		notify.IncRef()
+	}
+
+	// Perform the request asynchronously.
+	t.QueueAIO(getAIOCallback(t, target, cbAddr, cb, ioseq, aioCtx, notify))
+
+	// All set.
+	return nil
+}
+
+// IoSubmit implements linux syscall io_submit(2).
+//
+// Callbacks are submitted in order. If one fails after earlier ones were
+// queued, the count queued so far is returned in place of the error.
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	total := args[1].Int()
+	ptrAddr := args[2].Pointer()
+
+	if total < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// stop builds the return values for a failure hit after submitted
+	// callbacks were already queued.
+	stop := func(submitted int32, err error) (uintptr, *kernel.SyscallControl, error) {
+		if submitted > 0 {
+			// Partial success.
+			return uintptr(submitted), nil, nil
+		}
+		// Nothing done.
+		return 0, nil, err
+	}
+
+	// args[2] points at an array of native-width pointers, one for each
+	// callback to submit.
+	for submitted := int32(0); submitted < total; submitted++ {
+		// Copy in the address.
+		native := t.Arch().Native(0)
+		if _, err := t.CopyIn(ptrAddr, native); err != nil {
+			return stop(submitted, err)
+		}
+		cbAddr := usermem.Addr(t.Arch().Value(native))
+
+		// Copy in this callback.
+		var cb linux.IOCallback
+		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+			return stop(submitted, err)
+		}
+
+		// Process this callback.
+		if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+			return stop(submitted, err)
+		}
+
+		// Advance to the next one.
+		ptrAddr += usermem.Addr(t.Arch().Width())
+	}
+
+	return uintptr(total), nil, nil
+}
+
+// LINT.ThenChange(vfs2/aio.go)
diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go
new file mode 100644
index 000000000..adf5ea5f2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_capability.go
@@ -0,0 +1,149 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// lookupCaps returns the capability sets of the task identified by tid in t's
+// PID namespace (t itself if tid is 0): EINVAL if tid < 0, ESRCH if not found.
+func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) {
+	if tid < 0 {
+		return 0, 0, 0, syserror.EINVAL
+	}
+	target := t
+	if tid > 0 {
+		target = t.PIDNamespace().TaskWithID(tid)
+	}
+	if target == nil {
+		return 0, 0, 0, syserror.ESRCH
+	}
+	creds := target.Credentials()
+	return creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps, nil
+}
+
+// Capget implements Linux syscall capget.
+func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	hdrAddr := args[0].Pointer()
+	dataAddr := args[1].Pointer()
+
+	var hdr linux.CapUserHeader
+	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+		return 0, nil, err
+	}
+
+	// An unrecognized version is answered by writing the highest supported
+	// version back into the header. With a null dataAddr that is a successful
+	// "version probe"; otherwise the call fails with EINVAL. hdr.Pid is
+	// deliberately not looked up on this path, as a probe need not supply a
+	// valid one.
+	switch hdr.Version {
+	case linux.LINUX_CAPABILITY_VERSION_1: // Handled below.
+	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: // Handled below.
+	default:
+		hdr.Version = linux.HighestCapabilityVersion
+		if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+			return 0, nil, err
+		}
+		if dataAddr != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		return 0, nil, nil
+	}
+
+	// Recognized version: without a data buffer there is nothing to report.
+	if dataAddr == 0 {
+		return 0, nil, nil
+	}
+
+	permitted, inheritable, effective, err := lookupCaps(t, kernel.ThreadID(hdr.Pid))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The low 32 bits of each set are reported by every version.
+	low := linux.CapUserData{
+		Effective:   uint32(effective),
+		Permitted:   uint32(permitted),
+		Inheritable: uint32(inheritable),
+	}
+	if hdr.Version == linux.LINUX_CAPABILITY_VERSION_1 {
+		// Version 1 carries a single 32-bit element.
+		_, err = t.CopyOut(dataAddr, &low)
+		return 0, nil, err
+	}
+
+	// Versions 2 and 3 add a second element holding the high 32 bits.
+	data := [2]linux.CapUserData{
+		low,
+		{
+			Effective:   uint32(effective >> 32),
+			Permitted:   uint32(permitted >> 32),
+			Inheritable: uint32(inheritable >> 32),
+		},
+	}
+	_, err = t.CopyOut(dataAddr, &data)
+	return 0, nil, err
+}
+
+// Capset implements Linux syscall capset.
+func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	hdrAddr := args[0].Pointer()
+	dataAddr := args[1].Pointer()
+
+	var hdr linux.CapUserHeader
+	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+		return 0, nil, err
+	}
+
+	// Any unrecognized version is rejected with EINVAL after the highest
+	// supported version has been written back into the header.
+	switch hdr.Version {
+	case linux.LINUX_CAPABILITY_VERSION_1, linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
+	default:
+		hdr.Version = linux.HighestCapabilityVersion
+		if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, syserror.EINVAL
+	}
+
+	// hdr.Pid must be 0 or the caller's own thread ID.
+	if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() {
+		return 0, nil, syserror.EPERM
+	}
+
+	// Version 1 fills only the first element; the second then stays zero and
+	// contributes nothing to the high 32 bits below.
+	var data [2]linux.CapUserData
+	var err error
+	if hdr.Version == linux.LINUX_CAPABILITY_VERSION_1 {
+		_, err = t.CopyIn(dataAddr, &data[0])
+	} else {
+		_, err = t.CopyIn(dataAddr, &data)
+	}
+	if err != nil {
+		return 0, nil, err
+	}
+	p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities
+	i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities
+	e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities
+	return 0, nil, t.SetCapabilitySets(p, i, e)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
new file mode 100644
index 000000000..dd43cf18d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors; this is the default linux 3.11 x86_64 one:
+//    sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return clone(t,
+		int(args[0].Int()), // clone_flags
+		args[1].Pointer(),  // newsp
+		args[2].Pointer(),  // parent_tidptr
+		args[3].Pointer(),  // child_tidptr
+		args[4].Pointer(),  // tls_val
+	)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
new file mode 100644
index 000000000..cf68a8949
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2), in the default linux 3.11 arm64
+// flavor (kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file):
+//    sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return clone(t,
+		int(args[0].Int()), // clone_flags
+		args[1].Pointer(),  // newsp
+		args[2].Pointer(),  // parent_tidptr
+		args[4].Pointer(),  // child_tidptr (fifth syscall argument here)
+		args[3].Pointer(),  // tls_val (fourth syscall argument here)
+	)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
new file mode 100644
index 000000000..7f460d30b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -0,0 +1,147 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+// EpollCreate1 implements the epoll_create1(2) linux syscall.
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+
+	// EPOLL_CLOEXEC is the only flag accepted.
+	if flags&^linux.EPOLL_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	fd, err := syscalls.CreateEpoll(t, flags&linux.EPOLL_CLOEXEC != 0)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements the epoll_create(2) linux syscall.
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The size argument is only validated: it must be positive.
+	if size := args[0].Int(); size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// No flags are read here, so close-on-exec is never requested.
+	fd, err := syscalls.CreateEpoll(t, false /* closeOnExec */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements the epoll_ctl(2) linux syscall.
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	op := args[1].Int()
+	fd := args[2].Int()
+	eventAddr := args[3].Pointer()
+
+	// EPOLL_CTL_DEL takes no event; every other op reads one from user
+	// memory before op itself is validated.
+	var (
+		flags epoll.EntryFlags
+		mask  waiter.EventMask
+		data  [2]int32
+	)
+	if op != linux.EPOLL_CTL_DEL {
+		var event linux.EpollEvent
+		if _, err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+
+		if event.Events&linux.EPOLLONESHOT != 0 {
+			flags |= epoll.OneShot
+		}
+
+		if event.Events&linux.EPOLLET != 0 {
+			flags |= epoll.EdgeTriggered
+		}
+
+		// EventHUp and EventErr are always included; see fs/eventpoll.c.
+		mask = waiter.EventMaskFromLinux(event.Events) | waiter.EventHUp | waiter.EventErr
+		data = event.Data
+	}
+
+	// Perform the requested operation.
+	switch op {
+	case linux.EPOLL_CTL_ADD:
+		return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data)
+	case linux.EPOLL_CTL_MOD:
+		return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data)
+	case linux.EPOLL_CTL_DEL:
+		return 0, nil, syscalls.RemoveEpoll(t, epfd, fd)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// EpollWait implements the epoll_wait(2) linux syscall.
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	ready, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout)
+	if err != nil {
+		return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	if len(ready) == 0 { // Nothing to copy out.
+		return 0, nil, nil
+	}
+	if _, err := linux.CopyEpollEventSliceOut(t, eventsAddr, ready); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(len(ready)), nil, nil
+}
+
+// EpollPwait implements the epoll_pwait(2) linux syscall.
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sigAddr := args[4].Pointer()
+	if sigAddr == 0 {
+		// No mask supplied: behave exactly like EpollWait.
+		return EpollWait(t, args)
+	}
+	newMask, err := CopyInSigSet(t, sigAddr, uint(args[5].Uint()))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Wait under the supplied mask, recording the old one as the saved mask.
+	prevMask := t.SignalMask()
+	t.SetSignalMask(newMask)
+	t.SetSavedSignalMask(prevMask)
+	return EpollWait(t, args)
+}
+
+// LINT.ThenChange(vfs2/epoll.go)
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
new file mode 100644
index 000000000..ed3413ca6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Eventfd2 implements linux syscall eventfd2(2).
+//
+// args[0] is the initial counter value and args[1] holds the flags. Flags
+// other than EFD_SEMAPHORE, EFD_NONBLOCK and EFD_CLOEXEC are rejected with
+// EINVAL. On success the newly allocated file descriptor is returned.
+func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The initial value is an unsigned int in the Linux ABI, so it is
+	// zero-extended to 64 bits. Reading it as a signed 32-bit value would
+	// sign-extend it, turning e.g. 0xffffffff into 0xffffffffffffffff.
+	initVal := uint64(args[0].Uint())
+	flags := uint(args[1].Uint())
+	allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)
+
+	if flags & ^allOps != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	event := eventfd.New(t, initVal, flags&linux.EFD_SEMAPHORE != 0)
+	// Release this function's reference on every return path.
+	defer event.DecRef()
+	event.SetFlags(fs.SettableFileFlags{
+		NonBlocking: flags&linux.EFD_NONBLOCK != 0,
+	})
+
+	fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
+		CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// Eventfd implements linux syscall eventfd(2).
+//
+// The flags slot (args[1]) is cleared before the call is handed to Eventfd2,
+// so only the initial value in args[0] is honoured.
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	noFlags := args
+	noFlags[1].Value = 0
+	return Eventfd2(t, noFlags)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
new file mode 100644
index 000000000..2797c6a72
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -0,0 +1,2238 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileOpAt performs an operation on the second last component in the path.
+//
+// The path is split into a parent directory and a final name. fn is invoked
+// with the root dirent, the parent dirent, the final name and the number of
+// symlink traversals still permitted; its error is returned. Two common
+// cases are served without a lookup of the parent; everything else is
+// resolved through fileOpOn.
+func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
+	// Extract the last component.
+	parentPath, name := fs.SplitLast(path)
+
+	switch {
+	case parentPath == "/":
+		// Common case: we are accessing a file in the root.
+		root := t.FSContext().RootDirectory()
+		err := fn(root, root, name, linux.MaxSymlinkTraversals)
+		root.DecRef()
+		return err
+
+	case parentPath == "." && dirFD == linux.AT_FDCWD:
+		// Common case: we are accessing a file relative to the current
+		// working directory; skip the look-up.
+		wd := t.FSContext().WorkingDirectory()
+		root := t.FSContext().RootDirectory()
+		err := fn(root, wd, name, linux.MaxSymlinkTraversals)
+		wd.DecRef()
+		root.DecRef()
+		return err
+	}
+
+	// General case: resolve the parent directory, following symlinks.
+	onParent := func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error {
+		return fn(root, d, name, remainingTraversals)
+	}
+	return fileOpOn(t, dirFD, parentPath, true /* resolve */, onParent)
+}
+
+// fileOpOn performs an operation on the last entry of the path.
+//
+// The path is looked up starting from the root (absolute paths), the current
+// working directory (dirFD == AT_FDCWD) or the directory referred to by dirFD.
+// If resolve is true the lookup uses FindInode, otherwise FindLink. fn is
+// invoked with the root dirent, the dirent that was found and the number of
+// symlink traversals remaining after the lookup; its error is returned.
+//
+// Errors: EBADF if dirFD is not an open file, ENOTDIR if dirFD does not refer
+// to a directory, or whatever the lookup or fn returns.
+func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
+	var (
+		d   *fs.Dirent // The file.
+		wd  *fs.Dirent // The working directory (if required.)
+		rel *fs.Dirent // The relative directory for search (if required.)
+		f   *fs.File   // The file corresponding to dirFD (if required.)
+		err error
+	)
+
+	// Extract the working directory (maybe).
+	if len(path) > 0 && path[0] == '/' {
+		// Absolute path; rel can be nil.
+	} else if dirFD == linux.AT_FDCWD {
+		// Need to reference the working directory.
+		wd = t.FSContext().WorkingDirectory()
+		rel = wd
+	} else {
+		// Need to extract the given FD.
+		f = t.GetFile(dirFD)
+		if f == nil {
+			return syserror.EBADF
+		}
+		rel = f.Dirent
+		if !fs.IsDir(rel.Inode.StableAttr) {
+			// Drop the reference taken by GetFile before bailing
+			// out; otherwise it is leaked.
+			f.DecRef()
+			return syserror.ENOTDIR
+		}
+	}
+
+	// Grab the root (always required.) The reference is held until this
+	// function returns because fn receives root and may use it.
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+
+	// Lookup the node.
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	if resolve {
+		d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals)
+	} else {
+		d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
+	}
+
+	// The lookup is done; the starting points are no longer needed.
+	if wd != nil {
+		wd.DecRef()
+	}
+	if f != nil {
+		f.DecRef()
+	}
+	if err != nil {
+		return err
+	}
+
+	err = fn(root, d, remainingTraversals)
+	d.DecRef()
+	return err
+}
+
+// copyInPath copies a path in.
+//
+// The string at addr is read with linux.PATH_MAX as the limit passed to
+// CopyInString. An empty path yields ENOENT unless allowEmpty is set. The
+// returned path has been passed through fs.TrimTrailingSlashes, and dirPath
+// is the flag that call reports.
+func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) {
+	raw, err := t.CopyInString(addr, linux.PATH_MAX)
+	switch {
+	case err != nil:
+		return "", false, err
+	case raw == "" && !allowEmpty:
+		return "", false, syserror.ENOENT
+	}
+
+	// If the path ends with a /, then checks must be enforced in various
+	// ways in the different callers. We pass this back to the caller.
+	trimmed, trailing := fs.TrimTrailingSlashes(raw)
+	return trimmed, trailing, nil
+}
+
+// LINT.IfChange
+
+// openAt opens the existing file named by the path at addr, interpreted
+// relative to dirFD, and returns a new file descriptor for it.
+//
+// With O_NOFOLLOW the final component is looked up without following
+// symlinks, and a symlink there results in ELOOP. Directories cannot be
+// opened for writing (EISDIR); non-directories are rejected with ENOTDIR when
+// the directory flag is set or the path had a trailing slash. O_TRUNC
+// truncates the file to zero length before it is opened, and an IN_OPEN
+// inotify event is generated on success.
+func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+
+	followLinks := flags&linux.O_NOFOLLOW == 0
+	err = fileOpOn(t, dirFD, path, followLinks, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// First check a few things about the filesystem before trying to
+		// get the file reference.
+		//
+		// It's required that Check does not try to open files that aren't
+		// backed by this dirent (e.g. pipes and sockets) because this would
+		// result in opening these files an extra time just to check
+		// permissions.
+		if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+			return err
+		}
+
+		if !followLinks && fs.IsSymlink(d.Inode.StableAttr) {
+			return syserror.ELOOP
+		}
+
+		fileFlags := linuxToFlags(flags)
+		// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+		fileFlags.LargeFile = true
+
+		isDir := fs.IsDir(d.Inode.StableAttr)
+		switch {
+		case isDir && fileFlags.Write:
+			// Don't allow directories to be opened writable.
+			return syserror.EISDIR
+		case !isDir && (fileFlags.Directory || dirPath):
+			// Either O_DIRECTORY was set or the path ended with a
+			// slash, but the file is not a directory.
+			return syserror.ENOTDIR
+		}
+
+		// Truncate is called when O_TRUNC is specified for any kind of
+		// existing Dirent. Behavior is delegated to the entry's Truncate
+		// implementation.
+		if flags&linux.O_TRUNC != 0 {
+			if err := d.Inode.Truncate(t, d, 0); err != nil {
+				return err
+			}
+		}
+
+		opened, err := d.Inode.GetFile(t, d, fileFlags)
+		if err != nil {
+			return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+		}
+		defer opened.DecRef()
+
+		// Success.
+		newFD, err := t.NewFDFrom(0, opened, kernel.FDFlags{
+			CloseOnExec: flags&linux.O_CLOEXEC != 0,
+		})
+		if err != nil {
+			return err
+		}
+
+		// Set return result in frame.
+		fd = uintptr(newFD)
+
+		// Generate notification for opened file.
+		d.InotifyEvent(linux.IN_OPEN, 0)
+
+		return nil
+	})
+	return fd, err // Use result in frame.
+}
+
+// mknodAt creates a filesystem node at the path stored at addr, interpreted
+// relative to dirFD, for mknod(2) and mknodat(2).
+//
+// A path with a trailing slash is rejected with ENOENT. The caller needs
+// write and execute permission on the parent directory. Regular files (also
+// selected by a zero file type) and named pipes are created with the mode
+// masked by the umask; sockets yield EOPNOTSUPP, character and block devices
+// EPERM, and any other file type EINVAL.
+func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if dirPath {
+		return syserror.ENOENT
+	}
+
+	create := func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Do we have the appropriate permissions on the parent?
+		needed := fs.PermMask{Write: true, Execute: true}
+		if err := d.Inode.CheckPermission(t, needed); err != nil {
+			return err
+		}
+
+		// Attempt a creation.
+		perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+
+		switch mode.FileType() {
+		case 0, linux.ModeRegular:
+			// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+			//
+			// We are not going to return the file, so the actual
+			// flags used don't matter, but they cannot be empty or
+			// Create will complain.
+			created, err := d.Create(t, root, name, fs.FileFlags{Read: true, Write: true}, perms)
+			if err != nil {
+				return err
+			}
+			created.DecRef()
+			return nil
+
+		case linux.ModeNamedPipe:
+			return d.CreateFifo(t, root, name, perms)
+
+		case linux.ModeSocket:
+			// While it is possible to create a unix domain socket file
+			// on linux using mknod(2), in practice this is pretty
+			// useless from an application. Linux internally uses
+			// mknod() to create the socket node during bind(2), but we
+			// implement bind(2) independently. If an application
+			// explicitly creates a socket node using mknod(), you
+			// can't seem to bind() or connect() to the resulting
+			// socket.
+			//
+			// Instead of emulating this seemingly useless behaviour,
+			// we'll indicate that the filesystem doesn't support the
+			// creation of sockets.
+			return syserror.EOPNOTSUPP
+
+		case linux.ModeCharacterDevice, linux.ModeBlockDevice:
+			// TODO(b/72101894): We don't support creating block or character
+			// devices at the moment.
+			//
+			// When we start supporting block and character devices, we'll
+			// need to check for CAP_MKNOD here.
+			return syserror.EPERM
+
+		default:
+			// "EINVAL - mode requested creation of something other than a
+			// regular file, device special file, FIFO or socket." - mknod(2)
+			return syserror.EINVAL
+		}
+	}
+	return fileOpAt(t, dirFD, path, create)
+}
+
+// Mknod implements the linux syscall mknod(2).
+//
+// The node is created relative to the current working directory (AT_FDCWD).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	fileMode := linux.FileMode(args[1].ModeT())
+	// We don't need this argument until we support creation of device nodes.
+	_ = args[2].Uint() // dev
+
+	err := mknodAt(t, linux.AT_FDCWD, pathAddr, fileMode)
+	return 0, nil, err
+}
+
+// Mknodat implements the linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	path := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+	// We don't need this argument until we support creation of device nodes.
+	_ = args[3].Uint() // dev
+
+	return 0, nil, mknodAt(t, dirFD, path, mode)
+}
+
+// createAt opens the file named by the path at addr (relative to dirFD),
+// creating it if it does not exist, and returns a new file descriptor.
+//
+// A path with a trailing slash is rejected with ENOENT. The final component
+// is resolved by hand so that a symlink whose target does not exist leads to
+// creation of the target rather than of the symlink. O_EXCL fails with EEXIST
+// when anything already exists at the name, and O_NOFOLLOW fails with ELOOP
+// when the name is a symlink. A newly created file gets mode masked by the
+// task's umask. An IN_OPEN inotify event is queued on success.
+func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+	if dirPath {
+		return 0, syserror.ENOENT
+	}
+
+	fileFlags := linuxToFlags(flags)
+	// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+	fileFlags.LargeFile = true
+
+	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error {
+		// Resolve the name to see if it exists, and follow any
+		// symlinks along the way. We must do the symlink resolution
+		// manually because if the symlink target does not exist, we
+		// must create the target (and not the symlink itself).
+		//
+		// The loop is left via break with the outcome in err (nil when
+		// found holds the existing dirent), or via a direct return for
+		// errors that must not fall through to the creation path.
+		var (
+			found *fs.Dirent
+			err   error
+		)
+		for {
+			if !fs.IsDir(parent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Start by looking up the dirent at 'name'.
+			found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals)
+			if err != nil {
+				break
+			}
+			// Deferred calls run when this closure returns, so every
+			// dirent found during resolution stays referenced until
+			// then.
+			defer found.DecRef()
+
+			// We found something (possibly a symlink). If the
+			// O_EXCL flag was passed, then we can immediately
+			// return EEXIST.
+			if flags&linux.O_EXCL != 0 {
+				return syserror.EEXIST
+			}
+
+			// If we have a non-symlink, then we can proceed.
+			if !fs.IsSymlink(found.Inode.StableAttr) {
+				break
+			}
+
+			// If O_NOFOLLOW was passed, then don't try to resolve
+			// anything.
+			if flags&linux.O_NOFOLLOW != 0 {
+				return syserror.ELOOP
+			}
+
+			// Try to resolve the symlink directly to a Dirent.
+			var resolved *fs.Dirent
+			resolved, err = found.Inode.Getlink(t)
+			if err == nil {
+				// No more resolution necessary.
+				// NOTE(review): found still refers to the symlink
+				// dirent here, not to resolved — confirm intended.
+				defer resolved.DecRef()
+				break
+			}
+			if err != fs.ErrResolveViaReadlink {
+				return err
+			}
+
+			// Are we able to resolve further?
+			// NOTE(review): this is the only return in this function
+			// that uses syscall.ELOOP rather than syserror.ELOOP —
+			// confirm callers treat the two alike.
+			if remainingTraversals == 0 {
+				return syscall.ELOOP
+			}
+
+			// Resolve the symlink to a path via Readlink.
+			var path string
+			path, err = found.Inode.Readlink(t)
+			if err != nil {
+				break
+			}
+			remainingTraversals--
+
+			// Get the new parent from the target path.
+			var newParent *fs.Dirent
+			newParentPath, newName := fs.SplitLast(path)
+			newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
+			if err != nil {
+				break
+			}
+			defer newParent.DecRef()
+
+			// Repeat the process with the parent and name of the
+			// symlink target.
+			parent = newParent
+			name = newName
+		}
+
+		// Dispatch on the outcome of the resolution loop: open the
+		// existing file, create a missing one, or fail.
+		var newFile *fs.File
+		switch err {
+		case nil:
+			// Like sys_open, check for a few things about the
+			// filesystem before trying to get a reference to the
+			// fs.File. The same constraints on Check apply.
+			if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+				return err
+			}
+
+			// Truncate is called when O_TRUNC is specified for any kind of
+			// existing Dirent. Behavior is delegated to the entry's Truncate
+			// implementation.
+			if flags&linux.O_TRUNC != 0 {
+				if err := found.Inode.Truncate(t, found, 0); err != nil {
+					return err
+				}
+			}
+
+			// Create a new fs.File.
+			newFile, err = found.Inode.GetFile(t, found, fileFlags)
+			if err != nil {
+				return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+			}
+			defer newFile.DecRef()
+		case syserror.ENOENT:
+			// File does not exist. Proceed with creation.
+
+			// Do we have write permissions on the parent?
+			if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+
+			// Attempt a creation.
+			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+			newFile, err = parent.Create(t, root, name, fileFlags, perms)
+			if err != nil {
+				// No luck, bail.
+				return err
+			}
+			defer newFile.DecRef()
+			// The inotify event below is raised on the new dirent.
+			found = newFile.Dirent
+		default:
+			return err
+		}
+
+		// Success.
+		newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{
+			CloseOnExec: flags&linux.O_CLOEXEC != 0,
+		})
+		if err != nil {
+			return err
+		}
+
+		// Set result in frame.
+		fd = uintptr(newFD)
+
+		// Queue the open inotify event. The creation event is
+		// automatically queued when the dirent is found. The open
+		// events are implemented at the syscall layer so we need to
+		// manually queue one here.
+		found.InotifyEvent(linux.IN_OPEN, 0)
+
+		return nil
+	})
+	return fd, err // Use result in frame.
+}
+
+// Open implements linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := uint(args[1].Uint())
+	if flags&linux.O_CREAT != 0 {
+		mode := linux.FileMode(args[2].ModeT())
+		n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode)
+		return n, nil, err
+	}
+	n, err := openAt(t, linux.AT_FDCWD, addr, flags)
+	return n, nil, err
+}
+
+// Openat implements linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	addr := args[1].Pointer()
+	flags := uint(args[2].Uint())
+	if flags&linux.O_CREAT != 0 {
+		mode := linux.FileMode(args[3].ModeT())
+		n, err := createAt(t, dirFD, addr, flags, mode)
+		return n, nil, err
+	}
+	n, err := openAt(t, dirFD, addr, flags)
+	return n, nil, err
+}
+
+// Creat implements linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+	n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode)
+	return n, nil, err
+}
+
+// accessContext is a context that overrides the credentials used, but
+// otherwise carries the same values as the embedded context.
+//
+// accessContext should only be used for access(2).
+type accessContext struct {
+	// Context is the wrapped context; lookups for every key other than the
+	// credentials key are forwarded to it.
+	context.Context
+	// creds is returned for the auth.CtxCredentials key instead of the
+	// wrapped context's credentials.
+	creds *auth.Credentials
+}
+
+// Value implements context.Context.
+//
+// The overriding credentials are returned for auth.CtxCredentials; every
+// other key is answered by the embedded context.
+func (ac accessContext) Value(key interface{}) interface{} {
+	if key == auth.CtxCredentials {
+		return ac.creds
+	}
+	return ac.Context.Value(key)
+}
+
+// accessAt checks whether the file at the path stored at addr (relative to
+// dirFD) may be accessed with the requested mode, for access(2) and
+// faccessat(2).
+//
+// mode may only contain the read (4), write (2) and execute (1) bits; any
+// other bit yields EINVAL. The permission check runs with a forked copy of
+// the task's credentials in which the effective IDs are replaced by the real
+// ones.
+func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error {
+	const (
+		rOK = 4
+		wOK = 2
+		xOK = 1
+	)
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Sanity check the mode.
+	if unknown := mode &^ (rOK | wOK | xOK); unknown != 0 {
+		return syserror.EINVAL
+	}
+
+	check := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// access(2) and faccessat(2) check permissions using real
+		// UID/GID, not effective UID/GID.
+		//
+		// "access() needs to use the real uid/gid, not the effective
+		// uid/gid. We do this by temporarily clearing all FS-related
+		// capabilities and switching the fsuid/fsgid around to the
+		// real ones." -fs/open.c:faccessat
+		creds := t.Credentials().Fork()
+		creds.EffectiveKUID = creds.RealKUID
+		creds.EffectiveKGID = creds.RealKGID
+
+		// Capabilities are dropped unless the real user is root in the
+		// credentials' user namespace, in which case the permitted set
+		// becomes effective.
+		creds.EffectiveCaps = 0
+		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+			creds.EffectiveCaps = creds.PermittedCaps
+		}
+
+		wanted := fs.PermMask{
+			Read:    mode&rOK != 0,
+			Write:   mode&wOK != 0,
+			Execute: mode&xOK != 0,
+		}
+		return d.Inode.CheckPermission(&accessContext{Context: t, creds: creds}, wanted)
+	}
+	return fileOpOn(t, dirFD, path, true /* resolve */, check)
+}
+
+// Access implements linux syscall access(2).
+//
+// The path is interpreted relative to the current working directory.
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, mode := args[0].Pointer(), args[1].ModeT()
+	err := accessAt(t, linux.AT_FDCWD, pathAddr, mode)
+	return 0, nil, err
+}
+
+// Faccessat implements linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat().  If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+
+	return 0, nil, accessAt(t, dirFD, addr, mode)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
+// Ioctl implements linux syscall ioctl(2).
+//
+// args[0] is the file descriptor and args[1] the request. Requests that are
+// shared between files and sockets (close-on-exec, non-blocking, async and
+// owner control) are handled here; every other request is forwarded to the
+// file's own Ioctl implementation. EBADF is returned if the descriptor is
+// not open.
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	request := int(args[1].Int())
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Shared flags between file and socket.
+	switch request {
+	case linux.FIONCLEX:
+		t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: false,
+		})
+		return 0, nil, nil
+	case linux.FIOCLEX:
+		t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: true,
+		})
+		return 0, nil, nil
+
+	case linux.FIONBIO:
+		// The argument points to an int; non-zero enables non-blocking
+		// mode, zero disables it.
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.Flags()
+		flags.NonBlocking = set != 0
+		file.SetFlags(flags.Settable())
+		return 0, nil, nil
+
+	case linux.FIOASYNC:
+		// Same convention as FIONBIO, applied to the async flag.
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.Flags()
+		flags.Async = set != 0
+		file.SetFlags(flags.Settable())
+		return 0, nil, nil
+
+	case linux.FIOSETOWN, linux.SIOCSPGRP:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		// fSetOwn can fail (EINVAL for a process group ID that cannot be
+		// negated); report that instead of claiming success.
+		return 0, nil, fSetOwn(t, file, set)
+
+	case linux.FIOGETOWN, linux.SIOCGPGRP:
+		who := fGetOwn(t, file)
+		_, err := t.CopyOut(args[2].Pointer(), &who)
+		return 0, nil, err
+
+	default:
+		ret, err := file.FileOperations.Ioctl(t, file, t.MemoryManager(), args)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		return ret, nil, nil
+	}
+}
+
+// LINT.ThenChange(vfs2/ioctl.go)
+
+// LINT.IfChange
+
+// Getcwd implements the linux syscall getcwd(2).
+//
+// args[0] is the user buffer and args[1] its size. The working directory's
+// name relative to the task's root is written to the buffer followed by a
+// NUL byte, and the number of bytes written including the NUL is returned.
+// ERANGE is returned when the buffer cannot hold the name and its terminator.
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+
+	cwd := t.FSContext().WorkingDirectory()
+	defer cwd.DecRef()
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+
+	// Get our full name from the root and prepend "(unreachable)" if the
+	// root was unreachable from our current dirent; this is the same
+	// behavior as on linux.
+	name, reachable := cwd.FullName(root)
+	if !reachable {
+		name = "(unreachable)" + name
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(name)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Copy out the path name for the node.
+	n, err := t.CopyOutBytes(addr, []byte(name))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Top it off with a terminator.
+	terminatorAddr := addr + usermem.Addr(n)
+	_, err = t.CopyOut(terminatorAddr, []byte("\x00"))
+	return uintptr(n + 1), nil, err
+}
+
+// Chroot implements the linux syscall chroot(2).
+//
+// The caller needs CAP_SYS_CHROOT (EPERM otherwise). The target must be a
+// directory (ENOTDIR) on which the caller has execute permission; it then
+// becomes the root directory of the task's filesystem context.
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	setRoot := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// Is it a directory?
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does it have execute permissions?
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+			return err
+		}
+
+		t.FSContext().SetRootDirectory(d)
+		return nil
+	}
+	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, setRoot)
+	return 0, nil, err
+}
+
+// Chdir implements the linux syscall chdir(2).
+//
+// The target must be a directory (ENOTDIR) on which the caller has execute
+// permission; it then becomes the working directory of the task's filesystem
+// context.
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	setCwd := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// Is it a directory?
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does it have execute permissions?
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+			return err
+		}
+
+		t.FSContext().SetWorkingDirectory(d)
+		return nil
+	}
+	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, setCwd)
+	return 0, nil, err
+}
+
+// Fchdir implements the linux syscall fchdir(2).
+//
+// The descriptor must be open (EBADF) and refer to a directory (ENOTDIR) on
+// which the caller has execute permission; its dirent then becomes the
+// working directory of the task's filesystem context.
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFile(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	dir := file.Dirent
+
+	// Is it a directory?
+	if !fs.IsDir(dir.Inode.StableAttr) {
+		return 0, nil, syserror.ENOTDIR
+	}
+
+	// Does it have execute permissions?
+	if err := dir.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+		return 0, nil, err
+	}
+
+	t.FSContext().SetWorkingDirectory(dir)
+	return 0, nil, nil
+}
+
+// LINT.ThenChange(vfs2/fscontext.go)
+
+// LINT.IfChange
+
+// Close implements linux syscall close(2).
+//
+// The descriptor is removed from the task's FD table (EBADF if it was not
+// open), the file is flushed, and the flush result is passed through
+// handleIOError.
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Note that Remove provides a reference on the file that we may use to
+	// flush. It is still active until we drop the final reference below
+	// (and other reference-holding operations complete).
+	file, _ := t.FDTable().Remove(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	flushErr := file.Flush(t)
+	return 0, nil, handleIOError(t, false /* partial */, flushErr, syserror.EINTR, "close", file)
+}
+
+// Dup implements linux syscall dup(2).
+//
+// A new descriptor for the same file is allocated, searching from 0, with no
+// descriptor flags set. EBADF is returned if fd is not open; any failure to
+// allocate the new descriptor is reported as EMFILE.
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	src := t.GetFile(args[0].Int())
+	if src == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer src.DecRef()
+
+	if dupFD, err := t.NewFDFrom(0, src, kernel.FDFlags{}); err == nil {
+		return uintptr(dupFD), nil, nil
+	}
+	return 0, nil, syserror.EMFILE
+}
+
+// Dup2 implements linux syscall dup2(2).
+//
+// When oldfd and newfd are equal, the call only verifies that oldfd is open
+// (EBADF otherwise) and returns newfd. In every other case the work is done
+// by Dup3 with the flags argument cleared.
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+
+	if oldfd != newfd {
+		// Zero out flags arg to be used by Dup3.
+		args[2].Value = 0
+		return Dup3(t, args)
+	}
+
+	// If oldfd is a valid file descriptor, and newfd has the same value as
+	// oldfd, then dup2() does nothing, and returns newfd.
+	existing := t.GetFile(oldfd)
+	if existing == nil {
+		return 0, nil, syserror.EBADF
+	}
+	existing.DecRef()
+	return uintptr(newfd), nil, nil
+}
+
+// Dup3 implements linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	oldFile := t.GetFile(oldfd)
+	if oldFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer oldFile.DecRef()
+
+	err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newfd), nil, nil
+}
+
+func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
+	ma := file.Async(nil)
+	if ma == nil {
+		return linux.FOwnerEx{}
+	}
+	a := ma.(*fasync.FileAsync)
+	ot, otg, opg := a.Owner()
+	switch {
+	case ot != nil:
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_TID,
+			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
+		}
+	case otg != nil:
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PID,
+			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+		}
+	case opg != nil:
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PGRP,
+			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+		}
+	default:
+		return linux.FOwnerEx{}
+	}
+}
+
+// fGetOwn returns the file's owner as a single integer: the ID reported by
+// fGetOwnEx, negated when the owner is a process group.
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+	switch owner := fGetOwnEx(t, file); owner.Type {
+	case linux.F_OWNER_PGRP:
+		return -owner.PID
+	default:
+		return owner.PID
+	}
+}
+
+// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
+//
+// If who is positive, it represents a PID. If negative, it represents a PGID.
+// If the PID or PGID is invalid, the owner is silently unset.
+//
+// EINVAL is returned when who is negative and cannot be negated.
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) error {
+	async := file.Async(fasync.New).(*fasync.FileAsync)
+
+	if who >= 0 {
+		async.SetOwnerThreadGroup(t, t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)))
+		return nil
+	}
+
+	// Check for overflow before flipping the sign. Only the smallest int32
+	// satisfies this, because who-1 wraps around for it.
+	if who-1 > who {
+		return syserror.EINVAL
+	}
+	async.SetOwnerProcessGroup(t, t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)))
+	return nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	cmd := args[1].Int()
+
+	file, flags := t.FDTable().Get(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		from := args[2].Int()
+		fd, err := t.NewFDFrom(from, file, kernel.FDFlags{
+			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+		})
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		err := t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+		return 0, nil, err
+	case linux.F_GETFL:
+		return uintptr(file.Flags().ToLinux()), nil, nil
+	case linux.F_SETFL:
+		flags := uint(args[2].Uint())
+		file.SetFlags(linuxToFlags(flags).Settable())
+		return 0, nil, nil
+	case linux.F_SETLK, linux.F_SETLKW:
+		// In Linux the file system can choose to provide lock operations for an inode.
+		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
+		// hammer by only allowing locks on files and directories.
+		if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) {
+			return 0, nil, syserror.EBADF
+		}
+
+		// Copy in the lock request.
+		flockAddr := args[2].Pointer()
+		var flock linux.Flock
+		if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+			return 0, nil, err
+		}
+
+		// Compute the lock whence.
+		var sw fs.SeekWhence
+		switch flock.Whence {
+		case 0:
+			sw = fs.SeekSet
+		case 1:
+			sw = fs.SeekCurrent
+		case 2:
+			sw = fs.SeekEnd
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Compute the lock offset.
+		var off int64
+		switch sw {
+		case fs.SeekSet:
+			off = 0
+		case fs.SeekCurrent:
+			// Note that Linux does not hold any mutexes while retrieving the file offset,
+			// see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+			off = file.Offset()
+		case fs.SeekEnd:
+			uattr, err := file.Dirent.Inode.UnstableAttr(t)
+			if err != nil {
+				return 0, nil, err
+			}
+			off = uattr.Size
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Compute the lock range.
+		rng, err := lock.ComputeRange(flock.Start, flock.Len, off)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		// These locks don't block; execute the non-blocking operation using the inode's lock
+		// context directly.
+		switch flock.Type {
+		case linux.F_RDLCK:
+			if !file.Flags().Read {
+				return 0, nil, syserror.EBADF
+			}
+			if cmd == linux.F_SETLK {
+				// Non-blocking lock, provide a nil lock.Blocker.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
+					return 0, nil, syserror.EAGAIN
+				}
+			} else {
+				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
+					return 0, nil, syserror.EINTR
+				}
+			}
+			return 0, nil, nil
+		case linux.F_WRLCK:
+			if !file.Flags().Write {
+				return 0, nil, syserror.EBADF
+			}
+			if cmd == linux.F_SETLK {
+				// Non-blocking lock, provide a nil lock.Blocker.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
+					return 0, nil, syserror.EAGAIN
+				}
+			} else {
+				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
+					return 0, nil, syserror.EINTR
+				}
+			}
+			return 0, nil, nil
+		case linux.F_UNLCK:
+			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
+			return 0, nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+	case linux.F_GETOWN:
+		return uintptr(fGetOwn(t, file)), nil, nil
+	case linux.F_SETOWN:
+		return 0, nil, fSetOwn(t, file, args[2].Int())
+	case linux.F_GETOWN_EX:
+		addr := args[2].Pointer()
+		owner := fGetOwnEx(t, file)
+		_, err := t.CopyOut(addr, &owner)
+		return 0, nil, err
+	case linux.F_SETOWN_EX:
+		addr := args[2].Pointer()
+		var owner linux.FOwnerEx
+		n, err := t.CopyIn(addr, &owner)
+		if err != nil {
+			return 0, nil, err
+		}
+		a := file.Async(fasync.New).(*fasync.FileAsync)
+		switch owner.Type {
+		case linux.F_OWNER_TID:
+			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
+			if task == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerTask(t, task)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PID:
+			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
+			if tg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerThreadGroup(t, tg)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PGRP:
+			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
+			if pg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerProcessGroup(t, pg)
+			return uintptr(n), nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+	case linux.F_GET_SEALS:
+		val, err := tmpfs.GetSeals(file.Dirent.Inode)
+		return uintptr(val), nil, err
+	case linux.F_ADD_SEALS:
+		if !file.Flags().Write {
+			return 0, nil, syserror.EPERM
+		}
+		err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint())
+		return 0, nil, err
+	case linux.F_GETPIPE_SZ:
+		sz, ok := file.FileOperations.(fs.FifoSizer)
+		if !ok {
+			return 0, nil, syserror.EINVAL
+		}
+		size, err := sz.FifoSize(t, file)
+		return uintptr(size), nil, err
+	case linux.F_SETPIPE_SZ:
+		sz, ok := file.FileOperations.(fs.FifoSizer)
+		if !ok {
+			return 0, nil, syserror.EINVAL
+		}
+		n, err := sz.SetFifoSize(int64(args[2].Int()))
+		return uintptr(n), nil, err
+	default:
+		// Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// Fadvise64 implements linux syscall fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	length := args[2].Int64()
+	advice := args[3].Int()
+
+	// Note: offset is allowed to be negative.
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// If the FD refers to a pipe or FIFO, return error.
+	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	switch advice {
+	case linux.POSIX_FADV_NORMAL:
+	case linux.POSIX_FADV_RANDOM:
+	case linux.POSIX_FADV_SEQUENTIAL:
+	case linux.POSIX_FADV_WILLNEED:
+	case linux.POSIX_FADV_DONTNEED:
+	case linux.POSIX_FADV_NOREUSE:
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Sure, whatever.
+	return 0, nil, nil
+}
+
+// mkdirAt creates a directory at the path copied in from addr. The path is
+// handed to fileOpAt together with dirFD, and the new directory's permissions
+// are mode with the task's umask bits cleared.
+func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	create := func(root *fs.Dirent, parent *fs.Dirent, name string, _ uint) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Look the name up first so an already-existing entry is reported.
+		remainingTraversals := uint(linux.MaxSymlinkTraversals)
+		existing, err := t.MountNamespace().FindInode(t, root, parent, name, &remainingTraversals)
+		if err == nil {
+			// Something already lives at this name.
+			existing.DecRef()
+			return syserror.EEXIST
+		}
+		if err == syserror.EACCES {
+			// The walk to the name was denied; report that as-is.
+			return err
+		}
+
+		// Any other lookup failure is treated as "not there yet". Creating
+		// an entry requires write and execute permission on the parent.
+		if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			return err
+		}
+
+		umask := linux.FileMode(t.FSContext().Umask())
+		perms := fs.FilePermsFromMode(mode &^ umask)
+		return parent.CreateDirectory(t, root, name, perms)
+	}
+
+	return fileOpAt(t, dirFD, path, create)
+}
+
+// Mkdir implements linux syscall mkdir(2).
+//
+// It is mkdirAt with AT_FDCWD as the directory FD.
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, perm := args[0].Pointer(), linux.FileMode(args[1].ModeT())
+	err := mkdirAt(t, linux.AT_FDCWD, pathAddr, perm)
+	return 0, nil, err
+}
+
+// Mkdirat implements linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	addr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+
+	return 0, nil, mkdirAt(t, dirFD, addr, mode)
+}
+
+// rmdirAt removes the directory named by the path copied in from addr. The
+// path is handed to fileOpAt together with dirFD.
+func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Removing the root is always refused with EBUSY.
+	if path == "/" {
+		return syserror.EBUSY
+	}
+
+	remove := func(root *fs.Dirent, parent *fs.Dirent, name string, _ uint) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Linux reports a different errno for a path ending in "." than
+		// for one ending in "..".
+		if name == "." {
+			return syserror.EINVAL
+		}
+		if name == ".." {
+			return syserror.ENOTEMPTY
+		}
+
+		if err := parent.MayDelete(t, root, name); err != nil {
+			return err
+		}
+		return parent.RemoveDirectory(t, root, name)
+	}
+
+	return fileOpAt(t, dirFD, path, remove)
+}
+
+// Rmdir implements linux syscall rmdir(2).
+//
+// It is rmdirAt with AT_FDCWD as the directory FD.
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := rmdirAt(t, linux.AT_FDCWD, args[0].Pointer())
+	return 0, nil, err
+}
+
+// symlinkAt creates a symbolic link whose location is the path at newAddr
+// (handed to fileOpAt together with dirFD) and whose contents are the string
+// at oldAddr.
+func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error {
+	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	switch {
+	case err != nil:
+		return err
+	case dirPath:
+		return syserror.ENOENT
+	}
+
+	// The link contents are read as a plain string rather than through
+	// copyInPath: the symlink stores them verbatim, trailing slashes
+	// included.
+	oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX)
+	switch {
+	case err != nil:
+		return err
+	case oldPath == "":
+		return syserror.ENOENT
+	}
+
+	create := func(root *fs.Dirent, parent *fs.Dirent, name string, _ uint) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Adding an entry needs write and execute permission on the parent.
+		if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			return err
+		}
+		return parent.CreateLink(t, root, oldPath, name)
+	}
+
+	return fileOpAt(t, dirFD, newPath, create)
+}
+
+// Symlink implements linux syscall symlink(2).
+//
+// Arguments, in order: address of the link contents, address of the new path.
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr, linkAddr := args[0].Pointer(), args[1].Pointer()
+	err := symlinkAt(t, linux.AT_FDCWD, linkAddr, targetAddr)
+	return 0, nil, err
+}
+
+// Symlinkat implements linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	dirFD := args[1].Int()
+	newAddr := args[2].Pointer()
+
+	return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
+}
+
+// mayLinkAt reports whether t is allowed to create a hard link to target,
+// returning nil if so and EPERM otherwise.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
+	// In Linux these restrictions apply only when
+	// sysctl_protected_hardlinks is enabled. The kernel leaves the setting
+	// off by default for backward compatibility (see commit 561ec64ae67e)
+	// while recommending that distributions turn it on (Debian does:
+	// https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098).
+	//
+	// gVisor currently acts as if sysctl_protected_hardlinks were always
+	// enabled, so the restrictions below are always imposed.
+	switch {
+	case target.CheckOwnership(t):
+		// fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER)
+		// can hardlink all they like."
+		return nil
+	case !fs.IsRegular(target.StableAttr):
+		// Non-owners may only link regular files...
+		return syserror.EPERM
+	case target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil:
+		// ...and only ones they can both read and write.
+		return syserror.EPERM
+	default:
+		return nil
+	}
+}
+
+// linkAt creates a hard link at the location given by newDirFD and newAddr
+// that refers to the target given by oldDirFD and oldAddr. If resolve is
+// true, symlinks are followed when evaluating the target. If allowEmpty is
+// true and the old path is empty, the file open at oldDirFD is the target.
+func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error {
+	oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
+	if err != nil {
+		return err
+	}
+	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	switch {
+	case err != nil:
+		return err
+	case dirPath:
+		return syserror.ENOENT
+	}
+
+	// linkInto resolves newDirFD and newPath to a parent dirent and a name,
+	// then creates the hard link to target under that name.
+	linkInto := func(target *fs.Dirent) error {
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Adding an entry needs write and execute permission on
+			// the parent directory.
+			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+			return newParent.CreateHardLink(t, root, target, newName)
+		})
+	}
+
+	if allowEmpty && oldPath == "" {
+		// Empty old path: the link target is the file behind oldDirFD.
+		file := t.GetFile(oldDirFD)
+		if file == nil {
+			return syserror.EBADF
+		}
+		defer file.DecRef()
+
+		if err := mayLinkAt(t, file.Dirent.Inode); err != nil {
+			return err
+		}
+		return linkInto(file.Dirent)
+	}
+
+	// Otherwise resolve oldDirFD and oldPath to a dirent. The "resolve"
+	// argument applies only to this name.
+	return fileOpOn(t, oldDirFD, oldPath, resolve, func(_ *fs.Dirent, target *fs.Dirent, _ uint) error {
+		if err := mayLinkAt(t, target.Inode); err != nil {
+			return err
+		}
+		return linkInto(target)
+	})
+}
+
+// Link implements linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	newAddr := args[1].Pointer()
+
+	// man link(2):
+	// POSIX.1-2001 says that link() should dereference oldpath if it is a
+	// symbolic link. However, since kernel 2.0, Linux does not do so: if
+	// oldpath is a symbolic link, then newpath is created as a (hard) link
+	// to the same symbolic link file (i.e., newpath becomes a symbolic
+	// link to the same file that oldpath refers to).
+	resolve := false
+	return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
+}
+
+// Linkat implements linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldDirFD := args[0].Int()
+	oldAddr := args[1].Pointer()
+	newDirFD := args[2].Int()
+	newAddr := args[3].Pointer()
+
+	// man linkat(2):
+	// By default, linkat(), does not dereference oldpath if it is a
+	// symbolic link (like link(2)). Since Linux 2.6.18, the flag
+	// AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
+	// dereferenced if it is a symbolic link.
+	flags := args[4].Int()
+
+	// Sanity check flags.
+	if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
+	allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH
+
+	if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
+		return 0, nil, syserror.ENOENT
+	}
+
+	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
+// readlinkAt copies the contents of the symlink named by the path at addr
+// into the user buffer at bufAddr, truncating to at most size bytes. It
+// returns the number of bytes copied out.
+func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	switch {
+	case err != nil:
+		return 0, err
+	case dirPath:
+		return 0, syserror.ENOENT
+	}
+
+	readLink := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// Read permission on the inode is required.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
+			return err
+		}
+
+		target, err := d.Inode.Readlink(t)
+		switch {
+		case err == syserror.ENOLINK:
+			// ENOLINK from Readlink is surfaced to the caller as EINVAL.
+			return syserror.EINVAL
+		case err != nil:
+			return err
+		}
+
+		// Silently truncate to the caller's buffer size.
+		out := []byte(target)
+		if size < uint(len(out)) {
+			out = out[:size]
+		}
+
+		n, err := t.CopyOutBytes(bufAddr, out)
+
+		// Publish the byte count through the enclosing named result.
+		copied = uintptr(n)
+		return err
+	}
+
+	err = fileOpOn(t, dirFD, path, false /* resolve */, readLink)
+	return copied, err
+}
+
+// Readlink implements linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
+	return n, nil, err
+}
+
+// Readlinkat implements linux syscall readlinkat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	addr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
+	return n, nil, err
+}
+
+// LINT.ThenChange(vfs2/stat.go)
+
+// LINT.IfChange
+
+// unlinkAt removes the entry named by the path copied in from addr. The
+// dirPath result of copyInPath is forwarded to Dirent.Remove.
+func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	remove := func(root *fs.Dirent, parent *fs.Dirent, name string, _ uint) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		if err := parent.MayDelete(t, root, name); err != nil {
+			return err
+		}
+		return parent.Remove(t, root, name, dirPath)
+	}
+
+	return fileOpAt(t, dirFD, path, remove)
+}
+
+// Unlink implements linux syscall unlink(2).
+//
+// It is unlinkAt with AT_FDCWD as the directory FD.
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := unlinkAt(t, linux.AT_FDCWD, args[0].Pointer())
+	return 0, nil, err
+}
+
+// Unlinkat implements linux syscall unlinkat(2).
+//
+// Arguments, in order: directory FD, path address, flags. With AT_REMOVEDIR
+// set the call behaves like rmdir(2), otherwise like unlink(2). Any other
+// flag bit is rejected with EINVAL, as unlinkat(2) specifies.
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+
+	// AT_REMOVEDIR is the only flag unlinkat(2) defines.
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirAt(t, dirFD, addr)
+	}
+	return 0, nil, unlinkAt(t, dirFD, addr)
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
+// Truncate implements linux syscall truncate(2).
+//
+// Arguments, in order: path address, new length.
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, length := args[0].Pointer(), args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	switch {
+	case err != nil:
+		return 0, nil, err
+	case dirPath:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A length at or beyond the file size limit raises SIGXFSZ on the task
+	// and fails with EFBIG.
+	if limit := t.ThreadGroup().Limits().Get(limits.FileSize).Cur; uint64(length) >= limit {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(linux.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	truncate := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// Directories get EISDIR; anything else that is not a file type
+		// gets EINVAL. Unlike open(O_TRUNC), truncate(2) is only valid
+		// for file types.
+		switch {
+		case fs.IsDir(d.Inode.StableAttr):
+			return syserror.EISDIR
+		case !fs.IsFile(d.Inode.StableAttr):
+			return syserror.EINVAL
+		}
+
+		// Truncation by path is gated on the inode's write permission.
+		// ftruncate behaves differently in this respect.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+			return err
+		}
+
+		if err := d.Inode.Truncate(t, d, length); err != nil {
+			return err
+		}
+
+		// The file length changed: notify inotify watchers.
+		d.InotifyEvent(linux.IN_MODIFY, 0)
+		return nil
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, truncate)
+}
+
+// Ftruncate implements linux syscall ftruncate(2).
+//
+// Arguments, in order: file descriptor, new length.
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, length := args[0].Int(), args[1].Int64()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch {
+	case !file.Flags().Write:
+		// The descriptor must have been opened for writing. This is
+		// where ftruncate differs from truncate(2), which checks the
+		// inode's permissions instead.
+		return 0, nil, syserror.EINVAL
+	case !fs.IsFile(file.Dirent.Inode.StableAttr):
+		// Unlike open(O_TRUNC), truncation is only valid for file
+		// types. Also unlike truncate(2), a directory is reported here
+		// as EINVAL rather than EISDIR.
+		return 0, nil, syserror.EINVAL
+	case length < 0:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A length at or beyond the file size limit raises SIGXFSZ on the task
+	// and fails with EFBIG.
+	if limit := t.ThreadGroup().Limits().Get(limits.FileSize).Cur; uint64(length) >= limit {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(linux.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	dirent := file.Dirent
+	if err := dirent.Inode.Truncate(t, dirent, length); err != nil {
+		return 0, nil, err
+	}
+
+	// The file length changed: notify inotify watchers.
+	dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	return 0, nil, nil
+}
+
+// LINT.ThenChange(vfs2/setstat.go)
+
+// Umask implements linux syscall umask(2).
+//
+// Only the low nine permission bits of the argument are installed; the
+// value handed back by SwapUmask is returned to the caller.
+func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	requested := args[0].ModeT() & 0777
+	previous := t.FSContext().SwapUmask(requested)
+	return uintptr(previous), nil, nil
+}
+
+// LINT.IfChange
+
+// chown changes the ownership of the file behind d.
+//
+// uid and gid may each be -1 (i.e. fail their Ok check), in which case the
+// corresponding owner field is left unchanged.
+func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
+	uattr, err := d.Inode.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+
+	creds := t.Credentials()
+	privileged := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
+	ownsFile := uattr.Owner.UID == creds.EffectiveKUID
+
+	// Fields left at NoID are the ones that are not being changed.
+	newOwner := fs.FileOwner{
+		UID: auth.NoID,
+		GID: auth.NoID,
+	}
+
+	if uid.Ok() {
+		// A UID change needs a UID that maps into the kernel namespace.
+		kuid := creds.UserNamespace.MapToKUID(uid)
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+
+		// "Only a privileged process (CAP_CHOWN) may change the owner
+		// of a file." -chown(2)
+		//
+		// Linux additionally lets the file's owner "change" the UID to
+		// the value it already has.
+		sameUID := uattr.Owner.UID == kuid
+		if !privileged && !(ownsFile && sameUID) {
+			return syserror.EPERM
+		}
+
+		newOwner.UID = kuid
+	}
+
+	if gid.Ok() {
+		// A GID change needs a GID that maps into the kernel namespace.
+		kgid := creds.UserNamespace.MapToKGID(gid)
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+
+		// "The owner of a file may change the group of the file to any
+		// group of which that owner is a member. A privileged process
+		// (CAP_CHOWN) may change the group arbitrarily." -chown(2)
+		sameGID := uattr.Owner.GID == kgid
+		inGroup := creds.InGroup(kgid)
+		if !privileged && !(ownsFile && (sameGID || inGroup)) {
+			return syserror.EPERM
+		}
+
+		newOwner.GID = kgid
+	}
+
+	// FIXME(b/62949101): This is racy; the inode's owner may have changed in
+	// the meantime. (Linux holds i_mutex while calling
+	// fs/attr.c:notify_change() => inode_operations::setattr =>
+	// inode_change_ok().)
+	//
+	// When an unprivileged user changes the owner or group, chown(2) also
+	// clears the set-user-ID and set-group-ID bits; those bits are not
+	// supported here, so nothing further is done after SetOwner.
+	return d.Inode.SetOwner(t, d, newOwner)
+}
+
+// chownAt applies chown to the file named by the path at addr, or to the
+// file open at fd when the copied-in path is empty (only possible when
+// allowEmpty is set). resolve is forwarded to fileOpOn.
+func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
+	path, _, err := copyInPath(t, addr, allowEmpty)
+	if err != nil {
+		return err
+	}
+
+	if path != "" {
+		return fileOpOn(t, fd, path, resolve, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+			return chown(t, d, uid, gid)
+		})
+	}
+
+	// Empty path: operate directly on the file behind fd, as fchown does.
+	file := t.GetFile(fd)
+	if file == nil {
+		return syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return chown(t, file.Dirent, uid, gid)
+}
+
+// Chown implements linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Lchown implements linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Fchown implements linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, chown(t, file.Dirent, uid, gid)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	addr := args[1].Pointer()
+	uid := auth.UID(args[2].Uint())
+	gid := auth.GID(args[3].Uint())
+	flags := args[4].Int()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid)
+}
+
+// chmod sets the permissions of the file behind d from mode. It returns
+// EPERM if the ownership check fails or if SetPermissions reports failure.
+func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
+	inode := d.Inode
+
+	// Only the owner may change the mode.
+	if !inode.CheckOwnership(t) {
+		return syserror.EPERM
+	}
+
+	if ok := inode.SetPermissions(t, d, fs.FilePermsFromMode(mode)); !ok {
+		return syserror.EPERM
+	}
+
+	// An attribute changed: notify inotify watchers.
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
+}
+
+// chmodAt applies chmod to the file named by the path copied in from addr.
+// The path is handed to fileOpOn together with fd, with resolve set.
+func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	apply := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return chmod(t, d, mode)
+	}
+	return fileOpOn(t, fd, path, true /* resolve */, apply)
+}
+
+// Chmod implements linux syscall chmod(2).
+//
+// It is chmodAt with AT_FDCWD as the directory FD.
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, perm := args[0].Pointer(), linux.FileMode(args[1].ModeT())
+	err := chmodAt(t, linux.AT_FDCWD, pathAddr, perm)
+	return 0, nil, err
+}
+
+// Fchmod implements linux syscall fchmod(2).
+//
+// Arguments, in order: file descriptor, mode.
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, perm := args[0].Int(), linux.FileMode(args[1].ModeT())
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := chmod(t, file.Dirent, perm)
+	return 0, nil, err
+}
+
+// Fchmodat implements linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+
+	return 0, nil, chmodAt(t, fd, addr, mode)
+}
+
+// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime
+// to the system time.
+func defaultSetToSystemTimeSpec() fs.TimeSpec {
+	return fs.TimeSpec{
+		ATimeSetSystemTime: true,
+		MTimeSetSystemTime: true,
+	}
+}
+
+// utimes applies the timestamps described by ts to a file. When addr is null
+// and dirFD is not AT_FDCWD, the target is the file open at dirFD; otherwise
+// the target is found by handing the path at addr to fileOpOn with dirFD and
+// resolve.
+func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
+	apply := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if !d.Inode.CheckOwnership(t) {
+			// A non-owner may not set a specific time: if neither
+			// timestamp is being set to the system time, refuse.
+			atimeNotNow := ts.ATimeOmit || !ts.ATimeSetSystemTime
+			mtimeNotNow := ts.MTimeOmit || !ts.MTimeSetSystemTime
+			if atimeNotNow && mtimeNotNow {
+				return syserror.EPERM
+			}
+
+			// Setting to the current system time requires write
+			// access instead.
+			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+				return err
+			}
+		}
+
+		if err := d.Inode.SetTimestamps(t, d, ts); err != nil {
+			return err
+		}
+
+		// An attribute changed: notify inotify watchers.
+		d.InotifyEvent(linux.IN_ATTRIB, 0)
+		return nil
+	}
+
+	// From utimes.c:
+	// "If filename is NULL and dfd refers to an open file, then operate on
+	// the file.  Otherwise look up filename, possibly using dfd as a
+	// starting point."
+	operateOnFD := addr == 0 && dirFD != linux.AT_FDCWD
+	if !operateOnFD {
+		path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+		if err != nil {
+			return err
+		}
+		return fileOpOn(t, dirFD, path, resolve, apply)
+	}
+
+	if !resolve {
+		// Linux returns EINVAL in this case. See utimes.c.
+		return syserror.EINVAL
+	}
+
+	file := t.GetFile(dirFD)
+	if file == nil {
+		return syserror.EBADF
+	}
+	defer file.DecRef()
+
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+
+	return apply(root, file.Dirent, linux.MaxSymlinkTraversals)
+}
+
+// Utime implements linux syscall utime(2).
+//
+// Arguments, in order: path address, address of a utimbuf (may be null).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, timesAddr := args[0].Pointer(), args[1].Pointer()
+
+	// A null times pointer means "use the current system time".
+	spec := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var buf linux.Utime
+		if _, err := t.CopyIn(timesAddr, &buf); err != nil {
+			return 0, nil, err
+		}
+		spec = fs.TimeSpec{
+			ATime: ktime.FromSeconds(buf.Actime),
+			MTime: ktime.FromSeconds(buf.Modtime),
+		}
+	}
+
+	err := utimes(t, linux.AT_FDCWD, pathAddr, spec, true)
+	return 0, nil, err
+}
+
+// Utimes implements linux syscall utimes(2).
+//
+// Arguments, in order: path address, address of a two-element timeval array
+// (access time, then modification time; may be null). A null times pointer
+// sets both timestamps to the current system time. A microseconds field
+// outside [0, 10^6) is rejected with EINVAL, matching Futimesat.
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	filenameAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		// Validate the sub-second parts before converting, as
+		// futimesat(2) does for the same timeval layout.
+		if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
+			times[1].Usec >= 1e6 || times[1].Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		ts = fs.TimeSpec{
+			ATime: ktime.FromTimeval(times[0]),
+			MTime: ktime.FromTimeval(times[1]),
+		}
+	}
+	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// timespecIsValid checks that the timespec is valid for use in utimensat.
+//
+// Nsec must be UTIME_OMIT, UTIME_NOW, or lie in the range [0, 10^9).
+func timespecIsValid(ts linux.Timespec) bool {
+	if ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW {
+		return true
+	}
+	// A negative nanosecond count is as invalid as one that reaches a full
+	// second.
+	return ts.Nsec >= 0 && ts.Nsec < 1e9
+}
+
+// Utimensat implements linux syscall utimensat(2).
+//
+// Arguments, in order: directory FD, path address, address of a two-element
+// timespec array (access time, then modification time; may be null), flags.
+// A null times pointer sets both timestamps to the current system time. Each
+// timespec may carry UTIME_NOW or UTIME_OMIT in Nsec; if both are UTIME_OMIT
+// the call succeeds without touching the file. Symlinks are followed unless
+// AT_SYMLINK_NOFOLLOW is set in flags.
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timespec
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
+			return 0, nil, syserror.EINVAL
+		}
+
+		// If both are UTIME_OMIT, this is a noop.
+		if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT {
+			return 0, nil, nil
+		}
+
+		// times[0] describes the access time and times[1] the
+		// modification time; each flag must come from its own entry.
+		ts = fs.TimeSpec{
+			ATime:              ktime.FromTimespec(times[0]),
+			ATimeOmit:          times[0].Nsec == linux.UTIME_OMIT,
+			ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
+			MTime:              ktime.FromTimespec(times[1]),
+			MTimeOmit:          times[1].Nsec == linux.UTIME_OMIT,
+			MTimeSetSystemTime: times[1].Nsec == linux.UTIME_NOW,
+		}
+	}
+	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
+}
+
+// Futimesat implements linux syscall futimesat(2).
+//
+// Arguments, in order: directory FD, path address, address of a two-element
+// timeval array (may be null).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		dirFD     = args[0].Int()
+		pathAddr  = args[1].Pointer()
+		timesAddr = args[2].Pointer()
+	)
+
+	// A null times pointer means "use the current system time".
+	spec := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var tv [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &tv); err != nil {
+			return 0, nil, err
+		}
+
+		// Each microseconds field must lie in [0, 10^6).
+		for _, v := range tv {
+			if v.Usec >= 1e6 || v.Usec < 0 {
+				return 0, nil, syserror.EINVAL
+			}
+		}
+
+		spec = fs.TimeSpec{
+			ATime: ktime.FromTimeval(tv[0]),
+			MTime: ktime.FromTimeval(tv[1]),
+		}
+	}
+
+	err := utimes(t, dirFD, pathAddr, spec, true)
+	return 0, nil, err
+}
+
+// LINT.ThenChange(vfs2/setstat.go)
+
+// LINT.IfChange
+
+// renameAt renames the entry named by oldDirFD and the path at oldAddr to
+// the name given by newDirFD and the path at newAddr. Both paths are split
+// into a parent dirent and a final name by fileOpAt before fs.Rename is
+// called.
+func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
+	// The new path is copied in first, then the old one.
+	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Rename rejects final components of ".", "..", or empty (i.e. the
+	// root) with EBUSY.
+	unrenameable := func(name string) bool {
+		return name == "" || name == "." || name == ".."
+	}
+
+	return fileOpAt(t, oldDirFD, oldPath, func(_ *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error {
+		if !fs.IsDir(oldParent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		if unrenameable(oldName) {
+			return syserror.EBUSY
+		}
+
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+			if unrenameable(newName) {
+				return syserror.EBUSY
+			}
+
+			return fs.Rename(t, root, oldParent, oldName, newParent, newName)
+		})
+	})
+}
+
+// Rename implements linux syscall rename(2).
+//
+// It is renameAt with AT_FDCWD as both directory FDs.
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr, newAddr := args[0].Pointer(), args[1].Pointer()
+	err := renameAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr)
+	return 0, nil, err
+}
+
+// Renameat implements linux syscall renameat(2): each path comes with its own directory FD.
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Argument layout: (olddirfd, oldpath, newdirfd, newpath).
+	srcDirFD, srcAddr := args[0].Int(), args[1].Pointer()
+	dstDirFD, dstAddr := args[2].Int(), args[3].Pointer()
+	err := renameAt(t, srcDirFD, srcAddr, dstDirFD, dstAddr)
+	return 0, nil, err
+}
+
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// Fallocate implements linux system call fallocate(2). Only mode 0 is handled;
+// any other mode emits an unimplemented event and fails with ENOTSUP.
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].Int64()
+	offset := args[2].Int64()
+	length := args[3].Int64()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch {
+	case offset < 0 || length <= 0:
+		return 0, nil, syserror.EINVAL
+	case mode != 0:
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOTSUP
+	case !file.Flags().Write:
+		return 0, nil, syserror.EBADF
+	}
+
+	// Only regular files are accepted; pipes and directories get their own errors.
+	attr := file.Dirent.Inode.StableAttr
+	switch {
+	case fs.IsPipe(attr):
+		return 0, nil, syserror.ESPIPE
+	case fs.IsDir(attr):
+		return 0, nil, syserror.EISDIR
+	case !fs.IsRegular(attr):
+		return 0, nil, syserror.ENODEV
+	}
+
+	// A negative sum means offset+length overflowed int64.
+	end := offset + length
+	if end < 0 {
+		return 0, nil, syserror.EFBIG
+	}
+	if limit := t.ThreadGroup().Limits().Get(limits.FileSize).Cur; uint64(end) >= limit {
+		t.SendSignal(&arch.SignalInfo{Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser})
+		return 0, nil, syserror.EFBIG
+	}
+
+	if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil {
+		return 0, nil, err
+	}
+
+	// File length modified, generate notification.
+	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	return 0, nil, nil
+}
+
+// Flock implements linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	operation := args[1].Int()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		// flock(2): EBADF fd is not an open file descriptor.
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	nonblocking := operation&linux.LOCK_NB != 0
+	operation &^= linux.LOCK_NB
+
+	// A BSD style lock spans the entire file.
+	rng := lock.LockRange{
+		Start: 0,
+		End:   lock.LockEOF,
+	}
+
+	switch operation {
+	case linux.LOCK_EX:
+		if nonblocking {
+			// Since we're nonblocking we pass a nil lock.Blocker implementation.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
+				return 0, nil, syserror.EWOULDBLOCK
+			}
+		} else {
+			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
+				return 0, nil, syserror.EINTR
+			}
+		}
+	case linux.LOCK_SH:
+		if nonblocking {
+			// Since we're nonblocking we pass a nil lock.Blocker implementation.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
+				return 0, nil, syserror.EWOULDBLOCK
+			}
+		} else {
+			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
+				return 0, nil, syserror.EINTR
+			}
+		}
+	case linux.LOCK_UN:
+		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
+	default:
+		// flock(2): EINVAL operation is invalid.
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, nil
+}
+
+const (
+	memfdPrefix     = "/memfd:"                                           // prepended by MemfdCreate to the caller-supplied name
+	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) // the only flag bits MemfdCreate accepts
+	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1               // longest caller-supplied name MemfdCreate accepts
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2). The new file is
+// named "/memfd:" followed by the caller's name and is opened read-write.
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+	if flags&^memfdAllFlags != 0 {
+		// Unknown bits in flags.
+		return 0, nil, syserror.EINVAL
+	}
+
+	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+	cloExec := flags&linux.MFD_CLOEXEC != 0
+
+	name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix))
+	if err != nil {
+		return 0, nil, err
+	}
+	if len(name) > memfdMaxNameLen {
+		return 0, nil, syserror.EINVAL
+	}
+	name = memfdPrefix + name
+
+	inode := tmpfs.NewMemfdInode(t, allowSeals)
+	dirent := fs.NewDirent(t, inode, name)
+	// Registered before GetFile so the dirent reference is also dropped when GetFile fails.
+	defer dirent.DecRef()
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+	// FMODE_READ | FMODE_WRITE.
+	file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+		CloseOnExec: cloExec,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newFD), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
new file mode 100644
index 000000000..b68261f72
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -0,0 +1,288 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// futexWaitRestartBlock holds the arguments futexWaitDuration needs when an
+// interrupted relative-timeout futex(2) wait is resumed via restart_syscall(2).
+//
+// +stateify savable
+type futexWaitRestartBlock struct {
+	duration time.Duration // timeout left when the wait was interrupted
+
+	// addr stored as uint64 since uintptr is not save-able.
+	addr    uint64
+	private bool   // private argument of the interrupted wait
+	val     uint32 // val argument of the interrupted wait
+	mask    uint32 // mask argument of the interrupted wait
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart by calling futexWaitDuration with the saved state and forever=false.
+func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return futexWaitDuration(t, f.duration, false, usermem.Addr(f.addr), f.private, f.val, f.mask)
+}
+
+// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
+// complete.
+//
+// The wait blocks forever if forever is true, otherwise it blocks until ts.
+//
+// If blocking is interrupted, the syscall is restarted with the original
+// arguments.
+func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) {
+	w := t.FutexWaiter()
+	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
+	if err != nil {
+		return 0, err
+	}
+
+	if forever {
+		err = t.Block(w.C)
+	} else if clockRealtime {
+		notifier, tchan := ktime.NewChannelNotifier()
+		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
+		timer.Swap(ktime.Setting{
+			Enabled: true,
+			Next:    ktime.FromTimespec(ts),
+		})
+		err = t.BlockWithTimer(w.C, tchan)
+		timer.Destroy()
+	} else {
+		err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
+	}
+
+	t.Futex().WaitComplete(w)
+	return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
+// complete.
+//
+// The wait blocks forever if forever is true, otherwise it blocks for
+// duration.
+//
+// If blocking is interrupted, forever determines how to restart the
+// syscall. If forever is true, the syscall is restarted with the original
+// arguments. If forever is false, duration is a relative timeout and the
+// syscall is restarted with the remaining timeout.
+func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) {
+	waiter := t.FutexWaiter()
+	if err := t.Futex().WaitPrepare(waiter, t, addr, private, val, mask); err != nil {
+		return 0, err
+	}
+
+	remaining, err := t.BlockWithTimeout(waiter.C, !forever, duration)
+	t.Futex().WaitComplete(waiter)
+
+	switch {
+	case err == nil:
+		return 0, nil
+
+	case err != syserror.ErrInterrupted:
+		// The wait was unsuccessful for some reason other than
+		// interruption. Simply forward the error.
+		return 0, err
+
+	case forever:
+		// Interrupted with no timeout to adjust: restart with the
+		// original arguments.
+		return 0, kernel.ERESTARTSYS
+	}
+
+	// Interrupted while waiting with a relative timeout: save a restart
+	// block so that the restart resumes with the remaining duration.
+	t.SetSyscallRestartBlock(&futexWaitRestartBlock{
+		duration: remaining,
+		addr:     uint64(addr),
+		private:  private,
+		val:      val,
+		mask:     mask,
+	})
+	return 0, kernel.ERESTART_RESTARTBLOCK
+}
+
+// futexLockPI calls LockPI on the futex at addr and, if it is not acquired at once,
+// blocks: indefinitely when forever is set, else until ts on the realtime clock.
+func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error {
+	waiter := t.FutexWaiter()
+	locked, err := t.Futex().LockPI(waiter, t, addr, uint32(t.ThreadID()), private, false)
+	switch {
+	case err != nil:
+		return err
+	case locked:
+		// Futex acquired, we're done!
+		return nil
+	}
+
+	if !forever {
+		notifier, tchan := ktime.NewChannelNotifier()
+		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
+		timer.Swap(ktime.Setting{Enabled: true, Next: ktime.FromTimespec(ts)})
+		err = t.BlockWithTimer(waiter.C, tchan)
+		timer.Destroy()
+	} else {
+		err = t.Block(waiter.C)
+	}
+
+	t.Futex().WaitComplete(waiter)
+	// An interruption is reported to the caller as ERESTARTSYS.
+	return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// tryLockPI makes one LockPI attempt without blocking; a futex that is not acquired yields EWOULDBLOCK.
+func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error {
+	locked, err := t.Futex().LockPI(t.FutexWaiter(), t, addr, uint32(t.ThreadID()), private, true)
+	switch {
+	case err != nil:
+		return err
+	case !locked:
+		return syserror.EWOULDBLOCK
+	}
+	return nil
+}
+
+// Futex implements linux syscall futex(2): waiting on, waking, requeueing and
+// PI-locking futexes. args[3] is decoded twice, as the count nreq and as the
+// timeout pointer; each command uses whichever of the two it needs.
+func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	futexOp := args[1].Int()
+	val := int(args[2].Int())
+	nreq := int(args[3].Int())
+	timeout := args[3].Pointer()
+	naddr := args[4].Pointer()
+	val3 := args[5].Int()
+
+	cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
+	private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0
+	clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
+	mask := uint32(val3)
+
+	switch cmd {
+	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
+		// WAIT{_BITSET} wait forever if no timeout pointer was passed.
+		forever := (timeout == 0)
+
+		var timespec linux.Timespec
+		if !forever {
+			var err error
+			timespec, err = copyTimespecIn(t, timeout)
+			if err != nil {
+				return 0, nil, err
+			}
+		}
+
+		switch cmd {
+		case linux.FUTEX_WAIT:
+			// WAIT uses a relative timeout and matches every bit of the mask.
+			mask = ^uint32(0)
+			var timeoutDur time.Duration
+			if !forever {
+				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
+			}
+			n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask)
+			return n, nil, err
+
+		case linux.FUTEX_WAIT_BITSET:
+			// WAIT_BITSET uses an absolute timeout which is either
+			// CLOCK_MONOTONIC or CLOCK_REALTIME.
+			if mask == 0 {
+				return 0, nil, syserror.EINVAL
+			}
+			n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask)
+			return n, nil, err
+		default:
+			panic("unreachable")
+		}
+
+	case linux.FUTEX_WAKE:
+		mask = ^uint32(0)
+		fallthrough
+
+	case linux.FUTEX_WAKE_BITSET:
+		if mask == 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
+		n, err := t.Futex().Wake(t, addr, private, mask, val)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_REQUEUE:
+		n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_CMP_REQUEUE:
+		// 'val3' contains the value to be checked at 'addr' and
+		// 'val' is the number of waiters that should be woken up.
+		nval := uint32(val3)
+		n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_WAKE_OP:
+		op := uint32(val3)
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
+		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_LOCK_PI:
+		forever := (timeout == 0)
+
+		var timespec linux.Timespec
+		if !forever {
+			var err error
+			timespec, err = copyTimespecIn(t, timeout)
+			if err != nil {
+				return 0, nil, err
+			}
+		}
+		err := futexLockPI(t, timespec, forever, addr, private)
+		return 0, nil, err
+
+	case linux.FUTEX_TRYLOCK_PI:
+		err := tryLockPI(t, addr, private)
+		return 0, nil, err
+
+	case linux.FUTEX_UNLOCK_PI:
+		err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
+		return 0, nil, err
+
+	case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+
+	default:
+		// Unrecognized command: ENOSYS, with no unimplemented event emitted.
+		return 0, nil, syserror.ENOSYS
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
new file mode 100644
index 000000000..b126fecc0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -0,0 +1,250 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"bytes"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// Getdents implements linux syscall getdents(2) for 64bit systems, emitting
+// entries in the old linux dirent format (see dirent.Serialize).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, addr := args[0].Int(), args[1].Pointer()
+	bufLen := int(args[2].Uint())
+
+	// Reject a buffer that cannot hold even the smallest possible dirent
+	// before touching the file.
+	if bufLen < int(smallestDirent(t.Arch())) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	written, err := getdents(t, fd, addr, bufLen, (*dirent).Serialize)
+	return written, nil, err
+}
+
+// Getdents64 implements linux syscall getdents64(2), emitting entries in the
+// new linux dirent format (see dirent.Serialize64).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, addr := args[0].Int(), args[1].Pointer()
+	bufLen := int(args[2].Uint())
+
+	// Reject a buffer that cannot hold even the smallest possible dirent
+	// before touching the file.
+	if bufLen < int(smallestDirent64(t.Arch())) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	written, err := getdents(t, fd, addr, bufLen, (*dirent).Serialize64)
+	return written, nil, err
+}
+
+// getdents implements the core of getdents(2)/getdents64(2): it reads the
+// directory behind fd and serializes each entry to addr using f.
+func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
+	dir := t.GetFile(fd)
+	if dir == nil {
+		return 0, syserror.EBADF
+	}
+	defer dir.DecRef()
+
+	out := &usermem.IOReadWriter{
+		Ctx:  t,
+		IO:   t.MemoryManager(),
+		Addr: addr,
+		Opts: usermem.IOOpts{
+			AddressSpaceActive: true,
+		},
+	}
+
+	ds := newDirentSerializer(f, out, t.Arch(), size)
+	rerr := dir.Readdir(t, ds)
+
+	err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir)
+	if err == io.EOF {
+		return 0, nil
+	}
+	if err != nil {
+		return 0, err
+	}
+	dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+	return uintptr(ds.Written()), nil
+}
+
+// oldDirentHdr is a fixed sized header matching the fixed size
+// fields found in the old linux dirent struct.
+type oldDirentHdr struct {
+	Ino    uint64 // set by newDirent from fs.DentAttr.InodeID
+	Off    uint64 // set by newDirent from its offset argument
+	Reclen uint16 // padded record length returned by padRec
+}
+
+// direntHdr is a fixed sized header matching the fixed size
+// fields found in the new linux dirent struct.
+type direntHdr struct {
+	OldHdr oldDirentHdr // fields also present in the old format
+	Typ    uint8        // set by newDirent from fs.ToDirentType
+}
+
+// dirent contains the data pointed to by a new linux dirent struct.
+type dirent struct {
+	Hdr  direntHdr // fixed-size fields
+	Name []byte    // entry name; padRec appends zero bytes to it
+}
+
+// newDirent builds the dirent for one directory entry from its name, its
+// fs.DentAttr and its offset. The name is padded via padRec, whose result
+// becomes the record length.
+func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent {
+	hdr := direntHdr{Typ: fs.ToDirentType(attr.Type)}
+	hdr.OldHdr.Ino = attr.InodeID
+	hdr.OldHdr.Off = offset
+
+	ent := &dirent{Hdr: hdr, Name: []byte(name)}
+
+	// Reclen is set last because padRec needs the name in place in order
+	// to pad it.
+	ent.Hdr.OldHdr.Reclen = ent.padRec(int(width))
+	return ent
+}
+
+// smallestDirent returns the size of the smallest possible dirent using
+// the old linux dirent format.
+func smallestDirent(a arch.Context) uint {
+	d := dirent{}
+	return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1
+}
+
+// smallestDirent64 returns the size of the smallest possible dirent using
+// the new linux dirent format.
+func smallestDirent64(a arch.Context) uint {
+	d := dirent{}
+	return uint(binary.Size(d.Hdr)) + a.Width()
+}
+
+// padRec pads the name field until the rec length is a multiple of the width,
+// which must be a power of 2. It returns the padded rec length.
+func (d *dirent) padRec(width int) uint16 {
+	unpadded := int(binary.Size(d.Hdr)) + len(d.Name)
+	// Adding a full width before masking means at least one zero byte is always appended.
+	padded := (unpadded + width) &^ (width - 1)
+	d.Name = append(d.Name, make([]byte, padded-unpadded)...)
+	return uint16(padded)
+}
+
+// Serialize64 writes the dirent to w in the new linux dirent format: the full
+// header, then the name. Returns the number of bytes written or an error.
+func (d *dirent) Serialize64(w io.Writer) (int, error) {
+	total := 0
+	for _, chunk := range [][]byte{binary.Marshal(nil, usermem.ByteOrder, d.Hdr), d.Name} {
+		n, err := w.Write(chunk)
+		if err != nil {
+			return 0, err
+		}
+		total += n
+	}
+	return total, nil
+}
+
+// Serialize writes the dirent to w using the old linux dirent format: the old
+// header, then the name, then the type byte.
+// Returns the number of bytes written or an error.
+func (d *dirent) Serialize(w io.Writer) (int, error) {
+	hdr := binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr)
+
+	// The pieces are written in order; the first failing write aborts
+	// with a zero count.
+	total := 0
+	for _, chunk := range [][]byte{hdr, d.Name, {d.Hdr.Typ}} {
+		n, err := w.Write(chunk)
+		if err != nil {
+			return 0, err
+		}
+		total += n
+	}
+	return total, nil
+}
+
+// direntSerializer serializes dirents to an io.Writer; getdents hands it to Readdir.
+// NOTE(review): older comments name its interface fs.InodeOperationsInfoSerializer — confirm.
+type direntSerializer struct {
+	serialize func(*dirent, io.Writer) (int, error)
+	w         io.Writer
+	// width is the arch native value width.
+	width uint
+	// offset is the offset given to the most recently written dirent.
+	offset uint64
+	// written is the total bytes serialized.
+	written int
+	// size is the size of the buffer to serialize into.
+	size int
+}
+
+// newDirentSerializer returns a direntSerializer that encodes entries with f and
+// writes at most size bytes to w; offset and written start at zero.
+func newDirentSerializer(f func(d *dirent, w io.Writer) (int, error), w io.Writer, ac arch.Context, size int) *direntSerializer {
+	ds := new(direntSerializer)
+	ds.serialize, ds.w = f, w
+	ds.width, ds.size = ac.Width(), size
+	return ds
+}
+
+// CopyOut implements fs.InodeOperationsInfoSerializer.CopyOut.
+// It serializes and writes the fs.DentAttr to the direntSerializer io.Writer.
+func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
+	// The entry is numbered one past the last emitted offset; ds.offset only
+	// advances once the entry has actually been written out.
+	next := ds.offset + 1
+	d := newDirent(ds.width, name, attr, next)
+
+	// Serialize dirent into a temp buffer.
+	var scratch bytes.Buffer
+	n, err := ds.serialize(d, &scratch)
+	if err != nil {
+		return err
+	}
+
+	// Check that we have enough room remaining to write the dirent.
+	if remaining := ds.size - ds.written; n > remaining {
+		return io.EOF
+	}
+
+	// Write out the temp buffer.
+	if _, err := scratch.WriteTo(ds.w); err != nil {
+		return err
+	}
+
+	// Success: commit the new offset and byte count.
+	ds.offset = next
+	ds.written += n
+	return nil
+}
+
+// Written returns the running total of bytes that successful CopyOut calls have written.
+func (ds *direntSerializer) Written() int {
+	return ds.written
+}
+
+// LINT.ThenChange(vfs2/getdents.go)
diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go
new file mode 100644
index 000000000..715ac45e6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_identity.go
@@ -0,0 +1,180 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	// maxNGroups is the largest size Setgroups accepts; as NGROUPS_MAX in include/uapi/linux/limits.h.
+	maxNGroups = 65536
+)
+
+// Getuid implements the Linux syscall getuid: it returns the real UID
+// translated into the credentials' user namespace, with OrOverflow applied.
+func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.RealKUID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Geteuid implements the Linux syscall geteuid: it returns the effective UID
+// translated into the credentials' user namespace, with OrOverflow applied.
+func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.EffectiveKUID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Getresuid implements the Linux syscall getresuid: it copies the real,
+// effective and saved UIDs out to the addresses in args[0], args[1], args[2].
+func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	ns := creds.UserNamespace
+	// Each ID is translated into the credentials' user namespace first.
+	uids := [3]auth.UID{
+		creds.RealKUID.In(ns).OrOverflow(),
+		creds.EffectiveKUID.In(ns).OrOverflow(),
+		creds.SavedKUID.In(ns).OrOverflow(),
+	}
+
+	// uids[i] goes to the address in args[i]; stop at the first failed copy.
+	for i, uid := range uids {
+		if _, err := t.CopyOut(args[i].Pointer(), uid); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Getgid implements the Linux syscall getgid: it returns the real GID
+// translated into the credentials' user namespace, with OrOverflow applied.
+func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.RealKGID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Getegid implements the Linux syscall getegid: it returns the effective GID
+// translated into the credentials' user namespace, with OrOverflow applied.
+func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.EffectiveKGID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Getresgid implements the Linux syscall getresgid: it copies the real,
+// effective and saved GIDs out to the addresses in args[0], args[1], args[2].
+func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	ns := creds.UserNamespace
+	// Each ID is translated into the credentials' user namespace first.
+	gids := [3]auth.GID{
+		creds.RealKGID.In(ns).OrOverflow(),
+		creds.EffectiveKGID.In(ns).OrOverflow(),
+		creds.SavedKGID.In(ns).OrOverflow(),
+	}
+
+	// gids[i] goes to the address in args[i]; stop at the first failed copy.
+	for i, gid := range gids {
+		if _, err := t.CopyOut(args[i].Pointer(), gid); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Setuid implements the Linux syscall setuid by handing args[0] to t.SetUID.
+func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The requested UID is the syscall's only argument.
+	return 0, nil, t.SetUID(auth.UID(args[0].Int()))
+}
+
+// Setreuid implements the Linux syscall setreuid; args are (ruid, euid).
+func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	realUID, effUID := auth.UID(args[0].Int()), auth.UID(args[1].Int())
+	err := t.SetREUID(realUID, effUID)
+	return 0, nil, err
+}
+
+// Setresuid implements the Linux syscall setresuid; args are (ruid, euid, suid).
+func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	realUID, effUID := auth.UID(args[0].Int()), auth.UID(args[1].Int())
+	savedUID := auth.UID(args[2].Int())
+	err := t.SetRESUID(realUID, effUID, savedUID)
+	return 0, nil, err
+}
+
+// Setgid implements the Linux syscall setgid by handing args[0] to t.SetGID.
+func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The requested GID is the syscall's only argument.
+	return 0, nil, t.SetGID(auth.GID(args[0].Int()))
+}
+
+// Setregid implements the Linux syscall setregid; args are (rgid, egid).
+func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	realGID, effGID := auth.GID(args[0].Int()), auth.GID(args[1].Int())
+	err := t.SetREGID(realGID, effGID)
+	return 0, nil, err
+}
+
+// Setresgid implements the Linux syscall setresgid; args are (rgid, egid, sgid).
+func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	realGID, effGID := auth.GID(args[0].Int()), auth.GID(args[1].Int())
+	savedGID := auth.GID(args[2].Int())
+	err := t.SetRESGID(realGID, effGID, savedGID)
+	return 0, nil, err
+}
+
+// Getgroups implements the Linux syscall getgroups.
+func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := int(args[0].Int())
+	if size < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	kgids := t.Credentials().ExtraKGIDs
+	switch {
+	case size == 0:
+		// "If size is zero, list is not modified, but the total number of
+		// supplementary group IDs for the process is returned." - getgroups(2)
+		return uintptr(len(kgids)), nil, nil
+	case size < len(kgids):
+		return 0, nil, syserror.EINVAL
+	}
+	gids := make([]auth.GID, 0, len(kgids))
+	for _, kgid := range kgids {
+		gids = append(gids, kgid.In(t.UserNamespace()).OrOverflow())
+	}
+	if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(len(gids)), nil, nil
+}
+
+// Setgroups implements the Linux syscall setgroups; a size of zero clears the list.
+func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+	switch {
+	case size < 0 || size > maxNGroups:
+		return 0, nil, syserror.EINVAL
+	case size == 0:
+		return 0, nil, t.SetExtraGIDs(nil)
+	}
+	gids := make([]auth.GID, size)
+	if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, t.SetExtraGIDs(gids)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
new file mode 100644
index 000000000..b2c7b3444
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -0,0 +1,133 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) // the only flag bits InotifyInit1 accepts
+
+// InotifyInit1 implements the inotify_init1() syscalls: it creates an inotify
+// file and installs it as a new FD, honoring IN_NONBLOCK and IN_CLOEXEC.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := int(args[0].Int())
+	if flags&^allFlags != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	nonBlocking := flags&linux.IN_NONBLOCK != 0
+	closeOnExec := flags&linux.IN_CLOEXEC != 0
+
+	// The inotify object is exposed as a read-write file on an anon inode.
+	dirent := fs.NewDirent(t, anon.NewInode(t), "inotify")
+	file := fs.NewFile(t, dirent, fs.FileFlags{
+		Read:        true,
+		Write:       true,
+		NonBlocking: nonBlocking,
+	}, fs.NewInotify(t))
+	defer file.DecRef()
+
+	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{CloseOnExec: closeOnExec})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls by calling InotifyInit1 with the flags argument forced to zero.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) {
+	file := t.GetFile(fd)
+	if file == nil {
+		// Invalid fd.
+		return nil, nil, syserror.EBADF
+	}
+
+	if ino, ok := file.FileOperations.(*fs.Inotify); ok {
+		return ino, file, nil
+	}
+
+	// Not an inotify fd: drop the reference taken by GetFile before
+	// reporting the failure.
+	file.DecRef()
+	return nil, nil, syserror.EINVAL
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	mask := args[2].Uint()
+
+	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+	//  -- inotify(7)
+	followLinks := mask&linux.IN_DONT_FOLLOW == 0
+
+	// "EINVAL: The given event mask contains no valid events."
+	// -- inotify_add_watch(2)
+	if mask&linux.ALL_INOTIFY_BITS == 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ino, file, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// ret starts as the fd argument and becomes the watch descriptor only if
+	// the callback reaches AddWatch; it is returned alongside err either way.
+	ret := fd
+	err = fileOpOn(t, linux.AT_FDCWD, path, followLinks, func(root *fs.Dirent, target *fs.Dirent, _ uint) error {
+		// "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7)
+		if mask&linux.IN_ONLYDIR != 0 && !fs.IsDir(target.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		ret = ino.AddWatch(target, mask)
+		return nil
+	})
+	return uintptr(ret), nil, err
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, wd := args[0].Int(), args[1].Int()
+
+	ino, file, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	// fdToInotify returned the file with a reference that must be released here.
+	defer file.DecRef()
+	return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
new file mode 100644
index 000000000..3f7691eae
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -0,0 +1,58 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// LINT.IfChange
+
+// Lseek implements linux syscall lseek(2). args holds the fd, the offset and
+// the whence value (0, 1 or 2, mapped to fs.SeekSet/SeekCurrent/SeekEnd).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	requested := args[1].Int64()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Translate the numeric whence argument; any other value is EINVAL.
+	var sw fs.SeekWhence
+	switch whence := args[2].Int(); whence {
+	case 0:
+		sw = fs.SeekSet
+	case 1:
+		sw = fs.SeekCurrent
+	case 2:
+		sw = fs.SeekEnd
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	pos, seekErr := file.Seek(t, sw, requested)
+	if err := handleIOError(t, false /* partialResult */, seekErr, kernel.ERESTARTSYS, "lseek", file); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(pos), nil, nil
+}
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
new file mode 100644
index 000000000..9b4a5c3f1
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -0,0 +1,312 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// We unconditionally report a single NUMA node. This also means that our
+// "nodemask_t" is a single unsigned long (uint64).
+const (
+	maxNodes        = 1                   // number of NUMA nodes reported
+	allowedNodemask = (1 << maxNodes) - 1 // low maxNodes bits set
+)
+
+// copyInNodemask reads a nodemask of maxnode-1 bits from addr and returns its
+// first uint64. It fails with EINVAL if the mask is larger than a page of
+// bits or has any bit set outside allowedNodemask.
+func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) {
+	// set_mempolicy(2) describes nodemask as a bit mask "that contains up to
+	// maxnode bits", with its size "rounded to the next multiple of
+	// sizeof(unsigned long)". Unfortunately that is inaccurate because of
+	// what appears to be a bug: mm/mempolicy.c:get_nodes() uses maxnode-1,
+	// not maxnode, as the number of bits.
+	nbits := maxnode - 1
+	switch {
+	case nbits > usermem.PageSize*8: // also handles overflow from maxnode == 0
+		return 0, syserror.EINVAL
+	case nbits == 0:
+		return 0, nil
+	}
+
+	// Copy in the whole nodemask, one uint64 per 64 bits (rounded up).
+	words := (nbits + 63) / 64
+	buf := t.CopyScratchBuffer(int(words) * 8)
+	if _, err := t.CopyInBytes(addr, buf); err != nil {
+		return 0, err
+	}
+	// Only bits in allowedNodemask may be set in the first unsigned long.
+	first := usermem.ByteOrder.Uint64(buf)
+	if first&^allowedNodemask != 0 {
+		return 0, syserror.EINVAL
+	}
+	// Every byte past the first unsigned long must be zero.
+	for _, b := range buf[8:] {
+		if b != 0 {
+			return 0, syserror.EINVAL
+		}
+	}
+	return first, nil
+}
+
+// copyOutNodemask writes val to addr as the first unsigned long of a nodemask
+// of maxnode-1 bits and zeroes any unsigned longs that follow it.
+func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error {
+	// mm/mempolicy.c:copy_nodes_to_user() uses maxnode-1 as the bit count.
+	nbits := maxnode - 1
+	switch {
+	case nbits > usermem.PageSize*8: // also handles overflow from maxnode == 0
+		return syserror.EINVAL
+	case nbits == 0:
+		return nil
+	}
+	// Copy out the first unsigned long in the nodemask.
+	first := t.CopyScratchBuffer(8)
+	usermem.ByteOrder.PutUint64(first, val)
+	if _, err := t.CopyOutBytes(addr, first); err != nil {
+		return err
+	}
+	if nbits <= 64 {
+		return nil
+	}
+	// More than one unsigned long: zero out the remaining ones.
+	restAddr, ok := addr.AddLength(8)
+	if !ok {
+		return syserror.EFAULT
+	}
+	restWords := (nbits - 1) / 64
+	_, err := t.MemoryManager().ZeroOut(t, restAddr, int64(restWords)*8, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+// GetMempolicy implements the syscall get_mempolicy(2).
+func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mode := args[0].Pointer()     // user address that receives the policy or node ID
+	nodemask := args[1].Pointer() // user address that receives the nodemask
+	maxnode := args[2].Uint()     // sizes the user nodemask (see copyOutNodemask)
+	addr := args[3].Pointer()     // address inspected when MPOL_F_ADDR is set
+	flags := args[4].Uint()       // MPOL_F_* flags
+
+	if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	nodeFlag := flags&linux.MPOL_F_NODE != 0
+	addrFlag := flags&linux.MPOL_F_ADDR != 0
+	memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
+
+	// "EINVAL: The value specified by maxnode is less than the number of node
+	// IDs supported by the system." - get_mempolicy(2)
+	if nodemask != 0 && maxnode < maxNodes {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
+	// ignored and the set of nodes (memories) that the thread is allowed to
+	// specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
+	// absence of any mode flags) is returned in nodemask."
+	if memsAllowed {
+		// "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
+		// MPOL_F_ADDR or MPOL_F_NODE."
+		if nodeFlag || addrFlag {
+			return 0, nil, syserror.EINVAL
+		}
+		if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, nil
+	}
+
+	// "If flags specifies MPOL_F_ADDR, then information is returned about the
+	// policy governing the memory address given in addr. ... If the mode
+	// argument is not NULL, then get_mempolicy() will store the policy mode
+	// and any optional mode flags of the requested NUMA policy in the location
+	// pointed to by this argument. If nodemask is not NULL, then the nodemask
+	// associated with the policy will be stored in the location pointed to by
+	// this argument."
+	if addrFlag {
+		policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if nodeFlag {
+			// "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
+			// get_mempolicy() will return the node ID of the node on which the
+			// address addr is allocated into the location pointed to by mode.
+			// If no page has yet been allocated for the specified address,
+			// get_mempolicy() will allocate a page as if the thread had
+			// performed a read (load) access to that address, and return the
+			// ID of the node where that page was allocated."
+			buf := t.CopyScratchBuffer(1)
+			_, err := t.CopyInBytes(addr, buf) // one-byte read of addr, as quoted above
+			if err != nil {
+				return 0, nil, err
+			}
+			policy = linux.MPOL_DEFAULT // maxNodes == 1; presumably doubles as node ID 0 (confirm)
+		}
+		if mode != 0 {
+			if _, err := policy.CopyOut(t, mode); err != nil {
+				return 0, nil, err
+			}
+		}
+		if nodemask != 0 {
+			if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+				return 0, nil, err
+			}
+		}
+		return 0, nil, nil
+	}
+
+	// "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
+	// not specify MPOL_F_ADDR and addr is not NULL." This is partially
+	// inaccurate: if flags specifies MPOL_F_ADDR,
+	// mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
+	// just (usually) fail to find a VMA at address 0 and return EFAULT.
+	if addr != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "If flags is specified as 0, then information about the calling thread's
+	// default policy (as set by set_mempolicy(2)) is returned, in the buffers
+	// pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
+	// not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
+	// then get_mempolicy() will return in the location pointed to by a
+	// non-NULL mode argument, the node ID of the next node that will be used
+	// for interleaving of internal kernel pages allocated on behalf of the
+	// thread."
+	policy, nodemaskVal := t.NumaPolicy()
+	if nodeFlag {
+		if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
+			return 0, nil, syserror.EINVAL
+		}
+		policy = linux.MPOL_DEFAULT // maxNodes == 1; presumably doubles as node ID 0 (confirm)
+	}
+	if mode != 0 {
+		if _, err := policy.CopyOut(t, mode); err != nil {
+			return 0, nil, err
+		}
+	}
+	if nodemask != 0 {
+		if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// SetMempolicy implements the syscall set_mempolicy(2). The mode and nodemask
+// are validated by copyInMempolicyNodemask before being stored on the task.
+func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	rawMode := linux.NumaPolicy(args[0].Int())
+	maskAddr := args[1].Pointer()
+	maxnode := args[2].Uint()
+
+	policy, nodes, err := copyInMempolicyNodemask(t, rawMode, maskAddr, maxnode)
+	if err != nil {
+		return 0, nil, err
+	}
+	t.SetNumaPolicy(policy, nodes)
+	return 0, nil, nil
+}
+
+// Mbind implements the syscall mbind(2). The validated policy is passed to
+// MemoryManager.SetNumaPolicy together with addr and length.
+func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Uint64()
+	rawMode := linux.NumaPolicy(args[2].Int())
+	maskAddr := args[3].Pointer()
+	maxnode := args[4].Uint()
+	flags := args[5].Uint()
+
+	switch {
+	case flags&^linux.MPOL_MF_VALID != 0:
+		return 0, nil, syserror.EINVAL
+	case flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE):
+		// "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread
+		// must be privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
+		return 0, nil, syserror.EPERM
+	}
+
+	policy, nodes, err := copyInMempolicyNodemask(t, rawMode, maskAddr, maxnode)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Since we claim to have only a single node, all flags can be ignored
+	// (since all pages must already be on that single node).
+	return 0, nil, t.MemoryManager().SetNumaPolicy(addr, length, policy, nodes)
+}
+
+// copyInMempolicyNodemask validates modeWithFlags, copies in the nodemask at
+// nodemask if that address is non-zero, and checks that the two are compatible.
+// It returns the policy (with MPOL_LOCAL rewritten to MPOL_PREFERRED) combined
+// with its mode flags, and the value produced by copyInNodemask.
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) {
+	modeFlags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS)
+	mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS)
+	switch {
+	case modeFlags == linux.MPOL_MODE_FLAGS:
+		// Can't specify both mode flags simultaneously.
+		return 0, 0, syserror.EINVAL
+	case mode < 0 || mode >= linux.MPOL_MAX:
+		// Must specify a valid mode.
+		return 0, 0, syserror.EINVAL
+	}
+
+	var nodes uint64
+	if nodemask != 0 {
+		var err error
+		if nodes, err = copyInNodemask(t, nodemask, maxnode); err != nil {
+			return 0, 0, err
+		}
+	}
+
+	// Each mode places its own constraints on the nodemask and mode flags.
+	var invalid bool
+	switch mode {
+	case linux.MPOL_DEFAULT:
+		// "nodemask must be specified as NULL." - set_mempolicy(2). This is
+		// inaccurate; Linux allows a nodemask as long as it is empty.
+		invalid = nodes != 0
+	case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
+		// These require a non-empty nodemask.
+		invalid = nodes == 0
+	case linux.MPOL_PREFERRED:
+		// This permits an empty nodemask, as long as no flags are set.
+		invalid = nodes == 0 && modeFlags != 0
+	case linux.MPOL_LOCAL:
+		// This requires an empty nodemask and no flags set, and is
+		// implemented as MPOL_PREFERRED.
+		invalid = nodes != 0 || modeFlags != 0
+		mode = linux.MPOL_PREFERRED
+	default:
+		// Unknown mode, which we should have rejected above.
+		panic(fmt.Sprintf("unknown mode: %v", mode))
+	}
+	if invalid {
+		return 0, 0, syserror.EINVAL
+	}
+
+	return mode | modeFlags, nodes, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
new file mode 100644
index 000000000..91694d374
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -0,0 +1,332 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"bytes"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Brk implements linux syscall brk(2).
+func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// "However, the actual Linux system call returns the new program break on
+	// success. On failure, the system call returns the current break." -
+	// brk(2). The error from MemoryManager.Brk is therefore not reported.
+	newBrk, _ := t.MemoryManager().Brk(t, args[0].Pointer())
+	return uintptr(newBrk), nil, nil
+}
+
+// LINT.IfChange
+
+// Mmap implements linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()  // PROT_* bits
+	flags := args[3].Int() // MAP_* bits
+	fd := args[4].Int()    // only looked up when MAP_ANONYMOUS is not set
+	fixed := flags&linux.MAP_FIXED != 0
+	private := flags&linux.MAP_PRIVATE != 0
+	shared := flags&linux.MAP_SHARED != 0
+	anon := flags&linux.MAP_ANONYMOUS != 0
+	map32bit := flags&linux.MAP_32BIT != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if private == shared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := memmap.MMapOpts{
+		Length:   args[1].Uint64(),
+		Offset:   args[5].Uint64(),
+		Addr:     args[0].Pointer(),
+		Fixed:    fixed,
+		Unmap:    fixed, // MAP_FIXED sets both Fixed and Unmap
+		Map32Bit: map32bit,
+		Private:  private,
+		Perms: usermem.AccessType{
+			Read:    linux.PROT_READ&prot != 0,
+			Write:   linux.PROT_WRITE&prot != 0,
+			Execute: linux.PROT_EXEC&prot != 0,
+		},
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
+	defer func() { // on return, DecRef opts.MappingIdentity if one was set
+		if opts.MappingIdentity != nil {
+			opts.MappingIdentity.DecRef()
+		}
+	}()
+
+	if !anon {
+		// Convert the passed FD to a file reference.
+		file := t.GetFile(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		flags := file.Flags() // shadows the MAP_* flags within this block
+		// mmap unconditionally requires that the FD is readable.
+		if !flags.Read {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if shared && !flags.Write {
+			opts.MaxPerms.Write = false // cap the permissions rather than fail here
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	rv, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(rv), nil, err
+}
+
+// LINT.ThenChange(vfs2/mmap.go)
+
+// Munmap implements linux syscall munmap(2) on address args[0], length args[1].
+func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
+}
+
+// Mremap implements linux syscall mremap(2). The MREMAP_MAYMOVE/MREMAP_FIXED
+// flag combination is translated into an mm.MRemapMoveMode for the remap.
+func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	oldSize := args[1].Uint64()
+	newSize := args[2].Uint64()
+	flags := args[3].Uint64()
+	newAddr := args[4].Pointer()
+
+	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	// Only MREMAP_MAYMOVE and MREMAP_FIXED can be set at this point.
+	var moveMode mm.MRemapMoveMode
+	switch flags {
+	case 0:
+		moveMode = mm.MRemapNoMove
+	case linux.MREMAP_MAYMOVE:
+		moveMode = mm.MRemapMayMove
+	case linux.MREMAP_MAYMOVE | linux.MREMAP_FIXED:
+		moveMode = mm.MRemapMustMove
+	default:
+		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
+		// specified." - mremap(2)
+		return 0, nil, syserror.EINVAL
+	}
+
+	newStart, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
+		Move:    moveMode,
+		NewAddr: newAddr,
+	})
+	return uintptr(newStart), nil, err
+}
+
+// Mprotect implements linux syscall mprotect(2).
+func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr, length, prot := args[0].Pointer(), args[1].Uint64(), args[2].Int()
+	perms := usermem.AccessType{
+		Read:    prot&linux.PROT_READ != 0,
+		Write:   prot&linux.PROT_WRITE != 0,
+		Execute: prot&linux.PROT_EXEC != 0,
+	}
+	growsDown := prot&linux.PROT_GROWSDOWN != 0
+	return 0, nil, t.MemoryManager().MProtect(addr, length, perms, growsDown)
+}
+
+// Madvise implements linux syscall madvise(2). Only MADV_DONTNEED, MADV_DOFORK
+// and MADV_DONTFORK have an effect; other known advice values are either
+// accepted and ignored or rejected with a fixed error.
+func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := uint64(args[1].SizeT())
+	adv := args[2].Int()
+
+	// "The Linux implementation requires that the address addr be
+	// page-aligned, and allows length to be zero." - madvise(2)
+	if addr != addr.RoundDown() {
+		return 0, nil, syserror.EINVAL
+	}
+	if size == 0 {
+		return 0, nil, nil
+	}
+	// Not explicitly stated: length need not be page-aligned.
+	rounded, ok := usermem.Addr(size).RoundUp()
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+	size = uint64(rounded)
+
+	switch adv {
+	case linux.MADV_DONTNEED:
+		return 0, nil, t.MemoryManager().Decommit(addr, size)
+	case linux.MADV_DOFORK:
+		return 0, nil, t.MemoryManager().SetDontFork(addr, size, false)
+	case linux.MADV_DONTFORK:
+		return 0, nil, t.MemoryManager().SetDontFork(addr, size, true)
+	case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED,
+		linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE,
+		linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE,
+		linux.MADV_DONTDUMP, linux.MADV_DODUMP:
+		// Do nothing, we totally ignore these suggestions.
+		//
+		// TODO(b/72045799): Core dumping isn't implemented, so
+		// MADV_DONTDUMP and MADV_DODUMP are no-ops.
+		return 0, nil, nil
+	case linux.MADV_REMOVE:
+		// These "suggestions" have application-visible side effects, so we
+		// have to indicate that we don't support them.
+		return 0, nil, syserror.ENOSYS
+	case linux.MADV_HWPOISON:
+		// Only privileged processes are allowed to poison pages.
+		return 0, nil, syserror.EPERM
+	default:
+		// If adv is not a valid value tell the caller.
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// Mincore implements the syscall mincore(2). Every page of a fully mapped
+// range is reported as resident; a range with unmapped memory yields ENOMEM.
+func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	vec := args[2].Pointer()
+
+	if addr.RoundDown() != addr {
+		return 0, nil, syserror.EINVAL
+	}
+	// "The length argument need not be a multiple of the page size, but since
+	// residency information is returned for whole pages, length is effectively
+	// rounded up to the next multiple of the page size." - mincore(2)
+	roundedLen, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return 0, nil, syserror.ENOMEM
+	}
+	rng, ok := addr.ToRange(uint64(roundedLen))
+	if !ok {
+		return 0, nil, syserror.ENOMEM
+	}
+	// "ENOMEM: addr to addr + length contained unmapped memory."
+	mappedBytes := t.MemoryManager().VirtualMemorySizeRange(rng)
+	if mappedBytes != uint64(roundedLen) {
+		return 0, nil, syserror.ENOMEM
+	}
+	// Pretend that all mapped pages are "resident in core": one 1 per page.
+	resident := bytes.Repeat([]byte{1}, int(mappedBytes/usermem.PageSize))
+	_, err := t.CopyOut(vec, resident)
+	return 0, nil, err
+}
+
+// Msync implements Linux syscall msync(2).
+func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	flags := args[2].Int()
+
+	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
+	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
+	// permits a call to msync() that specifies neither of these flags, with
+	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
+	// msync(2)
+	switch {
+	case flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0:
+		return 0, nil, syserror.EINVAL
+	case flags&linux.MS_SYNC != 0 && flags&linux.MS_ASYNC != 0:
+		return 0, nil, syserror.EINVAL
+	}
+	opts := mm.MSyncOpts{
+		Sync:       flags&linux.MS_SYNC != 0,
+		Invalidate: flags&linux.MS_INVALIDATE != 0,
+	}
+	// MSync calls fsync, the same interrupt conversion rules apply, see
+	// mm/msync.c, fsync POSIX.1-2008.
+	err := t.MemoryManager().MSync(t, addr, uint64(length), opts)
+	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// Mlock implements linux syscall mlock(2). The range is passed to
+// MemoryManager.MLock with mode memmap.MLockEager.
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	start, size := args[0].Pointer(), uint64(args[1].SizeT())
+
+	return 0, nil, t.MemoryManager().MLock(t, start, size, memmap.MLockEager)
+}
+
+// Mlock2 implements linux syscall mlock2(2). MLOCK_ONFAULT, the only accepted
+// flag, selects memmap.MLockLazy instead of memmap.MLockEager.
+func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := uint64(args[1].SizeT())
+	flags := args[2].Int()
+
+	if flags&^(linux.MLOCK_ONFAULT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.MLOCK_ONFAULT != 0 {
+		return 0, nil, t.MemoryManager().MLock(t, addr, size, memmap.MLockLazy)
+	}
+	return 0, nil, t.MemoryManager().MLock(t, addr, size, memmap.MLockEager)
+}
+
+// Munlock implements linux syscall munlock(2). It is expressed as an MLock
+// call with mode memmap.MLockNone.
+func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	start, size := args[0].Pointer(), uint64(args[1].SizeT())
+
+	return 0, nil, t.MemoryManager().MLock(t, start, size, memmap.MLockNone)
+}
+
+// Mlockall implements linux syscall mlockall(2).
+func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// MCL_ONFAULT switches the mode from memmap.MLockEager to memmap.MLockLazy.
+	opts := mm.MLockAllOpts{
+		Current: flags&linux.MCL_CURRENT != 0,
+		Future:  flags&linux.MCL_FUTURE != 0,
+		Mode:    memmap.MLockEager,
+	}
+	if flags&linux.MCL_ONFAULT != 0 {
+		opts.Mode = memmap.MLockLazy
+	}
+	return 0, nil, t.MemoryManager().MLockAll(t, opts)
+}
+
+// Munlockall implements linux syscall munlockall(2). It is expressed as an
+// MLockAll call with mode memmap.MLockNone.
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Both Current and Future are set for the unlock request.
+	unlockAll := mm.MLockAllOpts{Current: true, Future: true, Mode: memmap.MLockNone}
+
+	return 0, nil, t.MemoryManager().MLockAll(t, unlockAll)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
new file mode 100644
index 000000000..eb5ff48f5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sourceAddr := args[0].Pointer() // user address of the source path (may be empty)
+	targetAddr := args[1].Pointer() // user address of the target path
+	typeAddr := args[2].Pointer()   // user address of the filesystem type name
+	flags := args[3].Uint64()       // MS_* mount flags
+	dataAddr := args[4].Pointer()   // user address of the data string, or 0
+
+	fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	sourcePath, _, err := copyInPath(t, sourceAddr, true /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	targetPath, _, err := copyInPath(t, targetAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	data := ""
+	if dataAddr != 0 {
+		// In Linux, a full page is always copied in regardless of null
+		// character placement, and the address is passed to each file system.
+		// Most file systems always treat this data as a string, though, and so
+		// do all of the ones we implement.
+		data, err = t.CopyInString(dataAddr, usermem.PageSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Ignore magic value that was required before Linux 2.4.
+	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+		flags = flags &^ linux.MS_MGC_MSK
+	}
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+		linux.MS_UNBINDABLE | linux.MS_MOVE
+
+	// Silently allow MS_NOSUID, since we don't implement set-id bits
+	// anyway.
+	const unsupportedFlags = linux.MS_NODEV |
+		linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+	// Linux just allows passing any flags to mount(2) - it won't fail when
+	// unknown or unsupported flags are passed. Since we don't implement
+	// everything, we fail explicitly on flags that are unimplemented.
+	if flags&(unsupportedOps|unsupportedFlags) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	rsys, ok := fs.FindFilesystem(fsType)
+	if !ok {
+		return 0, nil, syserror.ENODEV
+	}
+	if !rsys.AllowUserMount() {
+		return 0, nil, syserror.EPERM
+	}
+
+	var superFlags fs.MountSourceFlags
+	if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+		superFlags.NoAtime = true
+	}
+	if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+		superFlags.ReadOnly = true
+	}
+	if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+		superFlags.NoExec = true
+	}
+
+	rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil)
+	if err != nil {
+		return 0, nil, syserror.EINVAL // NOTE(review): the filesystem's own error is replaced by EINVAL
+	}
+
+	if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// Mount will take a reference on rootInode if successful.
+		return t.MountNamespace().Mount(t, d, rootInode)
+	}); err != nil {
+		// Something went wrong. Drop our ref on rootInode before
+		// returning the error.
+		rootInode.DecRef()
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+
+	const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	//
+	// Currently, this is always the init task's user namespace.
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+
+	resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW
+	detachOnly := flags&linux.MNT_DETACH == linux.MNT_DETACH
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return t.MountNamespace().Unmount(t, d, detachOnly)
+	})
+}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
new file mode 100644
index 000000000..43c510930
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// pipe2 implements the actual system call with flags. It creates a connected
+// pipe, installs both ends as new FDs and copies the FD pair out to addr.
+func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
+	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	readEnd, writeEnd := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
+	readEnd.SetFlags(linuxToFlags(flags).Settable())
+	defer readEnd.DecRef()
+	writeEnd.SetFlags(linuxToFlags(flags).Settable())
+	defer writeEnd.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
+	fds, err := t.NewFDs(0, []*fs.File{readEnd, writeEnd}, fdFlags)
+	if err != nil {
+		return 0, err
+	}
+
+	if _, err = t.CopyOut(addr, fds); err == nil {
+		return 0, nil
+	}
+	// CopyOut failed: remove each new FD and DecRef the file Remove returns.
+	for _, fd := range fds {
+		if file, _ := t.FDTable().Remove(fd); file != nil {
+			file.DecRef()
+		}
+	}
+	return 0, err
+}
+
+// Pipe implements linux syscall pipe(2).
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	n, err := pipe2(t, addr, 0)
+	return n, nil, err
+}
+
+// Pipe2 implements linux syscall pipe2(2). args[0] is the address that
+// receives the FD pair and args[1] holds the flags.
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fdsAddr, flags := args[0].Pointer(), uint(args[1].Uint())
+
+	ret, err := pipe2(t, fdsAddr, flags)
+	return ret, nil, err
+}
+
+// LINT.ThenChange(vfs2/pipe.go)
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
new file mode 100644
index 000000000..f0198141c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -0,0 +1,545 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the upper bound on the number of files accepted by poll & select.
+const fileCap = 1024 * 1024
+
+// Event masks backing the "readable", "writable", and "exceptional" fd sets of
+// select(2).
+const (
+	// selectReadEvents mirrors the Linux kernel's fs/select.c:POLLIN_SET:
+	// POLLIN, POLLHUP and POLLERR.
+	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+	// selectWriteEvents mirrors the Linux kernel's fs/select.c:POLLOUT_SET:
+	// POLLOUT and POLLERR.
+	selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+	// selectExceptEvents mirrors the Linux kernel's fs/select.c:POLLEX_SET:
+	// POLLPRI only.
+	selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the file reference and waiter entry associated with one PollFD.
+type pollState struct {
+	file   *fs.File     // set by initReadiness only when it registers for events; nil otherwise
+	waiter waiter.Entry // entry registered on file; unregistered by releaseState
+}
+
+// initReadiness fills in pfd.REvents with the current readiness of the file
+// named by pfd.FD, restricted to the events requested in pfd.Events.
+//
+// A negative FD yields REvents == 0 and an FD with no file yields POLLNVAL.
+// When ch is non-nil, "state" keeps the file reference and a waiter entry
+// bound to ch is registered on the file; otherwise the file reference is
+// dropped before returning and "state" is left untouched.
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+	switch {
+	case pfd.FD < 0:
+		pfd.REvents = 0
+		return
+	}
+
+	f := t.GetFile(pfd.FD)
+	if f == nil {
+		pfd.REvents = linux.POLLNVAL
+		return
+	}
+
+	wanted := waiter.EventMaskFromLinux(uint32(pfd.Events))
+	if ch != nil {
+		// The caller may block: hand the reference to state and register for
+		// notifications before sampling readiness.
+		state.file = f
+		state.waiter, _ = waiter.NewChannelEntry(ch)
+		f.EventRegister(&state.waiter, wanted)
+	} else {
+		defer f.DecRef()
+	}
+
+	pfd.REvents = int16(f.Readiness(wanted).ToLinux()) & pfd.Events
+}
+
+// releaseState unregisters the waiter and drops the file reference of every
+// entry in "state" that holds a file. Entries without a file are skipped.
+func releaseState(state []pollState) {
+	for i := range state {
+		s := &state[i]
+		if s.file == nil {
+			continue
+		}
+		s.file.EventUnregister(&s.waiter)
+		s.file.DecRef()
+	}
+}
+
+// pollBlock polls the PollFDs in "pfd", blocking for at most "timeout" when it
+// is positive, not blocking at all when it is zero, and blocking without a
+// deadline when it is negative.
+//
+// It returns the remaining timeout (always 0 on a timeout; 0 or positive if
+// interrupted by a signal), the number of entries with non-zero REvents, and
+// an error. Running out of time is reported as a nil error with a count of 0.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+	mayBlock := timeout != 0
+
+	var ch chan struct{}
+	if mayBlock {
+		ch = make(chan struct{}, 1)
+	}
+
+	// Register for notifications only while blocking is still possible: as
+	// soon as one file reports something, later files are merely sampled.
+	state := make([]pollState, len(pfd))
+	defer releaseState(state)
+	var n uintptr
+	for i := range pfd {
+		initReadiness(t, &pfd[i], &state[i], ch)
+		if pfd[i].REvents == 0 {
+			continue
+		}
+		n++
+		ch = nil
+	}
+
+	if !mayBlock {
+		return timeout, n, nil
+	}
+
+	haveDeadline := timeout >= 0
+
+	for n == 0 {
+		var err error
+		timeout, err = t.BlockWithTimeout(ch, haveDeadline, timeout)
+		switch {
+		case err == syserror.ETIMEDOUT:
+			return timeout, 0, nil
+		case err != nil:
+			return timeout, 0, err
+		}
+
+		// Woken up: recount the ready files. A count of zero means the
+		// wakeup was spurious and the loop sleeps again with what is left
+		// of the timeout.
+		for i := range state {
+			f := state[i].file
+			if f == nil {
+				continue
+			}
+
+			ready := f.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+			if got := int16(ready.ToLinux()) & pfd[i].Events; got != 0 {
+				pfd[i].REvents = got
+				n++
+			}
+		}
+	}
+
+	return timeout, n, nil
+}
+
+// CopyInPollFDs copies nfds struct pollfd entries in from addr.
+//
+// It returns EINVAL when nfds exceeds the task's NumberOfFiles limit capped at
+// fileCap. When nfds is zero nothing is read and an empty slice is returned.
+func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+	maxFDs := t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap)
+	if uint64(nfds) > maxFDs {
+		return nil, syserror.EINVAL
+	}
+
+	pfd := make([]linux.PollFD, nfds)
+	if nfds == 0 {
+		return pfd, nil
+	}
+	if _, err := t.CopyIn(addr, &pfd); err != nil {
+		return nil, err
+	}
+	return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	pfd, err := CopyInPollFDs(t, addr, nfds)
+	if err != nil {
+		return timeout, 0, err
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+	}
+	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+	err = syserror.ConvertIntr(err, syserror.EINTR)
+
+	// The poll entries are copied out regardless of whether
+	// any are set or not. This aligns with the Linux behavior.
+	if nfds > 0 && err == nil {
+		if _, err := t.CopyOut(addr, pfd); err != nil {
+			return remainingTimeout, 0, err
+		}
+	}
+
+	return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set of nBytes bytes from select(2)/pselect(2).
+//
+// A zero addr yields an all-zero set without touching user memory. When
+// nBitsInLastPartialByte is non-zero, only that many low bits of the final
+// byte are kept.
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+	if addr == 0 {
+		return set, nil
+	}
+
+	if _, err := t.CopyIn(addr, &set); err != nil {
+		return nil, err
+	}
+
+	// If we only use part of the last byte, mask out the extraneous bits.
+	//
+	// N.B. This only works on little-endian architectures.
+	if nBitsInLastPartialByte != 0 {
+		last := nBytes - 1
+		set[last] &= ^(byte(0xff) << nBitsInLastPartialByte)
+	}
+	return set, nil
+}
+
+// doSelect implements the common part of select(2) and pselect(2).
+//
+// It reads the three fd sets, turns every requested descriptor into a PollFD,
+// polls them through pollBlock and writes the sets back with only the bits
+// whose events occurred. The return value is the number of bits left set
+// across the three sets. nfds outside [0, fileCap] yields EINVAL, a requested
+// descriptor with no file yields EBADF, and an interrupted wait yields EINTR.
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || nfds > fileCap {
+		return 0, syserror.EINVAL
+	}
+
+	// One bit per fd.
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+
+	// Count the requested FDs up front so the PollFD array can be sized
+	// exactly.
+	fdCount := 0
+	for i := 0; i < nBytes; i++ {
+		for left := r[i] | w[i] | e[i]; left != 0; left &= left - 1 {
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]linux.PollFD, 0, fdCount)
+	for i := 0; i < nBytes; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		union := rV | wV | eV
+		for j := 0; j < 8; j++ {
+			bit := byte(1) << uint(j)
+			if union&bit == 0 {
+				continue
+			}
+			fd := int32(i*8 + j)
+
+			// Make sure the fd is valid and decrement the reference
+			// immediately to ensure we don't leak. Note, another thread
+			// might be about to close fd. This is racy, but that's
+			// OK. Linux is racy in the same way.
+			file := t.GetFile(fd)
+			if file == nil {
+				return 0, syserror.EBADF
+			}
+			file.DecRef()
+
+			var mask int16
+			if rV&bit != 0 {
+				mask |= selectReadEvents
+			}
+			if wV&bit != 0 {
+				mask |= selectWriteEvents
+			}
+			if eV&bit != 0 {
+				mask |= selectExceptEvents
+			}
+
+			pfd = append(pfd, linux.PollFD{
+				FD:     fd,
+				Events: mask,
+			})
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	if _, _, pollErr := pollBlock(t, pfd, timeout); pollErr != nil {
+		return 0, syserror.ConvertIntr(pollErr, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	settle := func(set []byte, byteIdx int32, bit byte, occurred bool) {
+		if set[byteIdx]&bit == 0 {
+			return
+		}
+		if occurred {
+			bitSetCount++
+			return
+		}
+		set[byteIdx] &^= bit
+	}
+	for k := range pfd {
+		got := pfd[k].REvents
+		byteIdx := pfd[k].FD / 8
+		bit := byte(1) << uint(pfd[k].FD%8)
+		settle(r, byteIdx, bit, got&selectReadEvents != 0)
+		settle(w, byteIdx, bit, got&selectWriteEvents != 0)
+		settle(e, byteIdx, bit, got&selectExceptEvents != 0)
+	}
+
+	// Copy updated vectors back, skipping sets the caller did not supply.
+	outputs := []struct {
+		addr usermem.Addr
+		set  []byte
+	}{
+		{readFDs, r},
+		{writeFDs, w},
+		{exceptFDs, e},
+	}
+	for _, out := range outputs {
+		if out.addr == 0 {
+			continue
+		}
+		if _, err := t.CopyOut(out.addr, out.set); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining reports how much of timeout is left given that it started
+// at startNs, clamped so that an elapsed timeout yields 0.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	elapsed := t.Kernel().MonotonicClock().Now().Sub(startNs)
+	if left := timeout - elapsed; left >= 0 {
+		return left
+	}
+	return 0
+}
+
+// copyOutTimespecRemaining writes the unexpired part of timeout to
+// timespecAddr as a timespec. It does nothing and returns nil when timeout is
+// zero or negative.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	left := linux.NsecToTimespec(timeoutRemaining(t, startNs, timeout).Nanoseconds())
+	return copyTimespecOut(t, timespecAddr, &left)
+}
+
+// copyOutTimevalRemaining writes the unexpired part of timeout to timevalAddr
+// as a timeval. It does nothing and returns nil when timeout is zero or
+// negative.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	left := linux.NsecToTimeval(timeoutRemaining(t, startNs, timeout).Nanoseconds())
+	return copyTimevalOut(t, timevalAddr, &left)
+}
+
+// pollRestartBlock holds the arguments needed to re-issue poll(2) via
+// restart_syscall(2); Restart passes them straight back to poll.
+//
+// +stateify savable
+type pollRestartBlock struct {
+	pfdAddr usermem.Addr  // user address of the pollfd array
+	nfds    uint          // number of entries at pfdAddr
+	timeout time.Duration // timeout left when the original call was interrupted
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart by calling poll again
+// with the saved address, count and timeout.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	n, err := poll(t, p.pfdAddr, p.nfds, p.timeout)
+	return n, err
+}
+
+// poll runs doPoll and, if it was interrupted, installs a pollRestartBlock
+// carrying the remaining timeout and returns ERESTART_RESTARTBLOCK. Any other
+// outcome is returned as-is.
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	if err != syserror.EINTR {
+		return n, err
+	}
+
+	// On an interrupt poll(2) is restarted with the remaining timeout.
+	t.SetSyscallRestartBlock(&pollRestartBlock{
+		pfdAddr: pfdAddr,
+		nfds:    nfds,
+		timeout: remainingTimeout,
+	})
+	return 0, kernel.ERESTART_RESTARTBLOCK
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timeout := time.Duration(args[2].Int()) * time.Millisecond
+	n, err := poll(t, pfdAddr, nfds, timeout)
+	return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+//
+// When a signal mask address is given, the mask is installed for the duration
+// of the call and the previous mask is recorded via SetSavedSignalMask. The
+// remaining timeout is written back to the timespec after polling.
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		pfdAddr      = args[0].Pointer()
+		nfds         = uint(args[1].Uint()) // poll(2) uses unsigned long.
+		timespecAddr = args[2].Pointer()
+		maskAddr     = args[3].Pointer()
+		maskSize     = uint(args[4].Uint())
+	)
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The start time is only needed when there is a positive timeout to
+	// report back.
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskAddr != 0 {
+		newMask, err := CopyInSigSet(t, maskAddr, maskSize)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		prevMask := t.SignalMask()
+		t.SetSignalMask(newMask)
+		t.SetSavedSignalMask(prevMask)
+	}
+
+	_, n, pollErr := doPoll(t, pfdAddr, nfds, timeout)
+
+	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
+	// if interrupted by something other than a signal handled by the
+	// application (i.e. returns ERESTARTNOHAND). However, if
+	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
+	// wrong timeout, so the error should be left as EINTR.
+	//
+	// Note that this means that if pollErr is nil but copyErr is not, copyErr
+	// is ignored. This is consistent with Linux.
+	if copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr); pollErr == syserror.EINTR && copyErr == nil {
+		return n, nil, kernel.ERESTARTNOHAND
+	}
+	return n, nil, pollErr
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timevalAddr := args[4].Pointer()
+
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timevalAddr != 0 {
+		timeval, err := copyTimevalIn(t, timevalAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if timeval.Sec < 0 || timeval.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(timeval.ToNsecCapped())
+	}
+	startNs := t.Kernel().MonotonicClock().Now()
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+//
+// The sixth argument points at a (mask address, size) pair. When both it and
+// the mask address inside it are non-null, the mask is installed for the call
+// and the previous mask is recorded via SetSavedSignalMask.
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		nfds             = int(args[0].Int()) // select(2) uses an int.
+		readFDs          = args[1].Pointer()
+		writeFDs         = args[2].Pointer()
+		exceptFDs        = args[3].Pointer()
+		timespecAddr     = args[4].Pointer()
+		maskWithSizeAddr = args[5].Pointer()
+	)
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The start time is only needed when there is a positive timeout to
+	// report back.
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		maskAddr, size, err := copyInSigSetWithSize(t, maskWithSizeAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		if maskAddr != 0 {
+			newMask, err := CopyInSigSet(t, maskAddr, size)
+			if err != nil {
+				return 0, nil, err
+			}
+			prevMask := t.SignalMask()
+			t.SetSignalMask(newMask)
+			t.SetSavedSignalMask(prevMask)
+		}
+	}
+
+	n, selErr := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+
+	// An interrupted pselect is restartable (ERESTARTNOHAND) only if the
+	// remaining timeout could be written back; otherwise a restart would use
+	// the wrong timeout, so EINTR is kept. A copy-out failure after a
+	// successful pselect is ignored.
+	if copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr); selErr == syserror.EINTR && copyErr == nil {
+		return n, nil, kernel.ERESTARTNOHAND
+	}
+	return n, nil, selErr
+}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
new file mode 100644
index 000000000..f92bf8096
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -0,0 +1,228 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Prctl implements linux syscall prctl(2).
+//
+// The first argument selects a sub-operation and the meaning of the remaining
+// arguments depends on it. Options listed here without an implementation emit
+// an "unimplemented" event and fail with EINVAL; options not listed at all
+// fail with EINVAL.
+func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	switch option := args[0].Int(); option {
+	case linux.PR_SET_PDEATHSIG:
+		sig := linux.Signal(args[1].Int())
+		// Zero is accepted alongside every valid signal number.
+		if sig == 0 || sig.IsValid() {
+			t.SetParentDeathSignal(sig)
+			return 0, nil, nil
+		}
+		return 0, nil, syserror.EINVAL
+
+	case linux.PR_GET_PDEATHSIG:
+		_, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
+		return 0, nil, err
+
+	case linux.PR_GET_DUMPABLE:
+		switch d := t.MemoryManager().Dumpability(); d {
+		case mm.NotDumpable:
+			return linux.SUID_DUMP_DISABLE, nil, nil
+		case mm.UserDumpable:
+			return linux.SUID_DUMP_USER, nil, nil
+		case mm.RootDumpable:
+			return linux.SUID_DUMP_ROOT, nil, nil
+		default:
+			panic(fmt.Sprintf("Unknown dumpability %v", d))
+		}
+
+	case linux.PR_SET_DUMPABLE:
+		var d mm.Dumpability
+		switch args[1].Int() {
+		case linux.SUID_DUMP_DISABLE:
+			d = mm.NotDumpable
+		case linux.SUID_DUMP_USER:
+			d = mm.UserDumpable
+		default:
+			// N.B. Userspace may not pass SUID_DUMP_ROOT.
+			return 0, nil, syserror.EINVAL
+		}
+		t.MemoryManager().SetDumpability(d)
+		return 0, nil, nil
+
+	case linux.PR_GET_KEEPCAPS:
+		var kept uintptr
+		if t.Credentials().KeepCaps {
+			kept = 1
+		}
+		return kept, nil, nil
+
+	case linux.PR_SET_KEEPCAPS:
+		// prctl(2): arg2 must be either 0 (permitted capabilities are cleared)
+		// or 1 (permitted capabilities are kept).
+		switch args[1].Int() {
+		case 0:
+			t.SetKeepCaps(false)
+		case 1:
+			t.SetKeepCaps(true)
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+		return 0, nil, nil
+
+	case linux.PR_SET_NAME:
+		name, err := t.CopyInString(args[1].Pointer(), linux.TASK_COMM_LEN-1)
+		// ENAMETOOLONG is tolerated: whatever CopyInString returned is
+		// still installed as the name.
+		if err != nil && err != syserror.ENAMETOOLONG {
+			return 0, nil, err
+		}
+		t.SetName(name)
+
+	case linux.PR_GET_NAME:
+		dstAddr := args[1].Pointer()
+		buf := t.CopyScratchBuffer(linux.TASK_COMM_LEN)
+		n := copy(buf, t.Name())
+		// Append a NUL terminator whenever there is room for one.
+		if n < linux.TASK_COMM_LEN {
+			buf[n] = 0
+			n++
+		}
+		if _, err := t.CopyOut(dstAddr, buf[:n]); err != nil {
+			return 0, nil, err
+		}
+
+	case linux.PR_SET_MM:
+		if !t.HasCapability(linux.CAP_SYS_RESOURCE) {
+			return 0, nil, syserror.EPERM
+		}
+
+		switch args[1].Int() {
+		case linux.PR_SET_MM_EXE_FILE:
+			exeFile := t.GetFile(args[2].Int())
+			if exeFile == nil {
+				return 0, nil, syserror.EBADF
+			}
+			defer exeFile.DecRef()
+
+			// Are they trying to set exe to a non-file?
+			if !fs.IsFile(exeFile.Dirent.Inode.StableAttr) {
+				return 0, nil, syserror.EBADF
+			}
+
+			// Set the underlying executable.
+			t.MemoryManager().SetExecutable(fsbridge.NewFSFile(exeFile))
+
+		case linux.PR_SET_MM_AUXV,
+			linux.PR_SET_MM_START_CODE,
+			linux.PR_SET_MM_END_CODE,
+			linux.PR_SET_MM_START_DATA,
+			linux.PR_SET_MM_END_DATA,
+			linux.PR_SET_MM_START_STACK,
+			linux.PR_SET_MM_START_BRK,
+			linux.PR_SET_MM_BRK,
+			linux.PR_SET_MM_ARG_START,
+			linux.PR_SET_MM_ARG_END,
+			linux.PR_SET_MM_ENV_START,
+			linux.PR_SET_MM_ENV_END:
+
+			t.Kernel().EmitUnimplementedEvent(t)
+			fallthrough
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+	case linux.PR_SET_NO_NEW_PRIVS:
+		wellFormed := args[1].Int() == 1 && args[2].Int() == 0 && args[3].Int() == 0 && args[4].Int() == 0
+		if !wellFormed {
+			return 0, nil, syserror.EINVAL
+		}
+		// PR_SET_NO_NEW_PRIVS is assumed to always be set.
+		// See kernel.Task.updateCredsForExecLocked.
+		return 0, nil, nil
+
+	case linux.PR_GET_NO_NEW_PRIVS:
+		// All of arg2..arg5 must be zero.
+		for i := 1; i <= 4; i++ {
+			if args[i].Int() != 0 {
+				return 0, nil, syserror.EINVAL
+			}
+		}
+		return 1, nil, nil
+
+	case linux.PR_SET_SECCOMP:
+		if mode := args[1].Int(); mode != linux.SECCOMP_MODE_FILTER {
+			// Unsupported mode.
+			return 0, nil, syserror.EINVAL
+		}
+
+		return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer())
+
+	case linux.PR_GET_SECCOMP:
+		return uintptr(t.SeccompMode()), nil, nil
+
+	case linux.PR_CAPBSET_READ:
+		cp := linux.Capability(args[1].Uint64())
+		if !cp.Ok() {
+			return 0, nil, syserror.EINVAL
+		}
+		// Report 1 when cp is in the bounding set, 0 otherwise.
+		if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 {
+			return 1, nil, nil
+		}
+		return 0, nil, nil
+
+	case linux.PR_CAPBSET_DROP:
+		cp := linux.Capability(args[1].Uint64())
+		if !cp.Ok() {
+			return 0, nil, syserror.EINVAL
+		}
+		return 0, nil, t.DropBoundingCapability(cp)
+
+	case linux.PR_GET_TIMING,
+		linux.PR_SET_TIMING,
+		linux.PR_GET_TSC,
+		linux.PR_SET_TSC,
+		linux.PR_TASK_PERF_EVENTS_DISABLE,
+		linux.PR_TASK_PERF_EVENTS_ENABLE,
+		linux.PR_GET_TIMERSLACK,
+		linux.PR_SET_TIMERSLACK,
+		linux.PR_MCE_KILL,
+		linux.PR_MCE_KILL_GET,
+		linux.PR_GET_TID_ADDRESS,
+		linux.PR_SET_CHILD_SUBREAPER,
+		linux.PR_GET_CHILD_SUBREAPER,
+		linux.PR_GET_THP_DISABLE,
+		linux.PR_SET_THP_DISABLE,
+		linux.PR_MPX_ENABLE_MANAGEMENT,
+		linux.PR_MPX_DISABLE_MANAGEMENT:
+
+		t.Kernel().EmitUnimplementedEvent(t)
+		fallthrough
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reached by the options above that succeed without returning a value.
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go
new file mode 100644
index 000000000..c0aa0fd60
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_random.go
@@ -0,0 +1,92 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"io"
+	"math"
+
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	_GRND_NONBLOCK = 0x1 // accepted by GetRandom's flag check, otherwise ignored
+	_GRND_RANDOM   = 0x2 // accepted by GetRandom's flag check, otherwise ignored
+)
+
+// GetRandom implements the linux syscall getrandom(2).
+//
+// In a multi-tenant/shared environment, the only valid implementation is to
+// fetch data from the urandom pool, otherwise starvation attacks become
+// possible. The urandom pool is also expected to have plenty of entropy, thus
+// the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as
+// the pool will already be initialized.
+func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	flags := args[2].Int()
+
+	// Flags are checked for validity but otherwise ignored. See above.
+	if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if length > math.MaxInt32 {
+		length = math.MaxInt32
+	}
+	ar, ok := addr.ToRange(uint64(length))
+	if !ok {
+		return 0, nil, syserror.EFAULT
+	}
+
+	// "If the urandom source has been initialized, reads of up to 256 bytes
+	// will always return as many bytes as requested and will not be
+	// interrupted by signals. No such guarantees apply for larger buffer
+	// sizes." - getrandom(2)
+	min := int(length)
+	if min > 256 {
+		min = 256
+	}
+	n, err := t.MemoryManager().CopyOutFrom(t, usermem.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if n >= int64(min) {
+		return uintptr(n), nil, nil
+	}
+	return 0, nil, err
+}
+
+// randReader is an io.Reader over rand.Reader that reads at least a minimum number of bytes per call.
+type randReader struct {
+	done int // GetRandom initializes this to -1; NOTE(review): Read never advances it — confirm intended
+	min  int // byte count that Read compares against done
+}
+
+// Read implements io.Reader.Read.
+//
+// While r.done is below r.min it reads at least r.min-r.done bytes (bounded by
+// len(dst)) from rand.Reader; otherwise it performs a plain rand.Reader read.
+func (r *randReader) Read(dst []byte) (int, error) {
+	if r.min <= r.done {
+		return rand.Reader.Read(dst)
+	}
+
+	need := r.min - r.done
+	if len(dst) < need {
+		need = len(dst)
+	}
+	return io.ReadAtLeast(rand.Reader, dst, need)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
new file mode 100644
index 000000000..071b4bacc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -0,0 +1,394 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+const (
+	// EventMaskRead is the waiter event set for reads: EventIn, EventHUp and EventErr.
+	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements linux syscall read(2).  Note that we try to get a buffer that
+// is exactly the size requested because some applications like qemu expect
+// they can do large reads all at once.  Bug for bug.  Same for other read
+// calls below.
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := readv(t, file, dst)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	size := args[2].SizeT()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is valid.
+	if int(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Return EINVAL; if the underlying file type does not support readahead,
+	// then Linux will return EINVAL to indicate as much. In the future, we
+	// may extend this function to actually support readahead hints.
+	return 0, nil, syserror.EINVAL
+}
+
+// Pread64 implements linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is reading at an offset supported?
+	if !file.Flags().Pread {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := preadv(t, file, dst, offset)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Readv implements linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Read the iovecs that specify the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := readv(t, file, dst)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+// Preadv implements linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is reading at an offset supported?
+	if !file.Flags().Pread {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Read the iovecs that specify the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := preadv(t, file, dst, offset)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the syscall is
+	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the linux internal call
+	// (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1248)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 5th argument.
+
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := int(args[5].Int())
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is reading at an offset supported?
+	if offset > -1 && !file.Flags().Pread {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check flags field.
+	// Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+	// accepted as a valid flag argument for preadv2.
+	if flags&^linux.RWF_VALID != 0 {
+		return 0, nil, syserror.EOPNOTSUPP
+	}
+
+	// Read the iovecs that specify the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// If preadv2 is called with an offset of -1, readv is called.
+	if offset == -1 {
+		n, err := readv(t, file, dst)
+		t.IOUsage().AccountReadSyscall(n)
+		return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+	}
+
+	n, err := preadv(t, file, dst, offset)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+// readv reads from f into dst. If the first attempt would block and f is in
+// blocking mode, it waits for readability (honouring a socket receive timeout).
+func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) {
+	n, err := f.Readv(t, dst)
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		if n > 0 {
+			f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+		}
+		return n, err
+	}
+
+	// Sockets may carry a receive timeout that bounds how long we wait.
+	var (
+		deadline     ktime.Time
+		haveDeadline bool
+	)
+	if sock, isSock := f.FileOperations.(socket.Socket); isSock {
+		switch timeout := sock.RecvTimeout(); {
+		case timeout < 0 && err == syserror.ErrWouldBlock:
+			return n, err
+		case timeout > 0:
+			haveDeadline = true
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(timeout) * time.Nanosecond)
+		}
+	}
+
+	// Subscribe to readability events before retrying.
+	entry, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&entry, EventMaskRead)
+
+	total := n
+	for {
+		// Skip the part of dst that the previous attempt already filled.
+		dst = dst.DropFirst64(n)
+
+		// Retry; anything other than "would block" ends the loop.
+		n, err = f.Readv(t, dst)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Sleep until notified; a timeout is reported as "would block".
+		err = t.BlockWithDeadline(ch, haveDeadline, deadline)
+		if err == syserror.ETIMEDOUT {
+			err = syserror.ErrWouldBlock
+		}
+		if err != nil {
+			break
+		}
+	}
+	f.EventUnregister(&entry)
+
+	// Queue an inotify access event if anything was read.
+	if total > 0 {
+		f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+	}
+	return total, err
+}
+
+// preadv reads from f at offset into dst, waiting for readability when the
+// file is in blocking mode and the first attempt would block.
+func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	n, err := f.Preadv(t, dst, offset)
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		// Done, failed, or not permitted to wait.
+		if n > 0 {
+			f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+		}
+		return n, err
+	}
+
+	// Subscribe to readability events before retrying.
+	entry, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&entry, EventMaskRead)
+
+	total := n
+	for {
+		// Skip the part of dst that the previous attempt already filled.
+		dst = dst.DropFirst64(n)
+
+		// Retry past the bytes read so far; anything other than "would
+		// block" ends the loop.
+		n, err = f.Preadv(t, dst, offset+total)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Sleep until notified; an error from Block ends the read.
+		if err = t.Block(ch); err != nil {
+			break
+		}
+	}
+	f.EventUnregister(&entry)
+
+	// Queue an inotify access event if anything was read.
+	if total > 0 {
+		f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+	}
+	return total, err
+}
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
new file mode 100644
index 000000000..d5d5b6959
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -0,0 +1,224 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rlimit abstracts over the in-memory layout of 'struct rlimit', which may
+// vary from system to system; newRlimit selects the implementation.
+type rlimit interface {
+	// toLimit converts the rlimit to a limits.Limit.
+	toLimit() *limits.Limit
+
+	// fromLimit overwrites the rlimit with the contents of lim.
+	fromLimit(lim limits.Limit)
+
+	// copyIn reads the rlimit from the untrusted app's memory at addr.
+	copyIn(t *kernel.Task, addr usermem.Addr) error
+
+	// copyOut writes the rlimit to the untrusted app's memory at addr.
+	copyOut(t *kernel.Task, addr usermem.Addr) error
+}
+
+// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system.
+func newRlimit(t *kernel.Task) (rlimit, error) {
+	// Only 8-byte-wide architectures are handled; on a 64-bit system struct
+	// rlimit and struct rlimit64 are identical.
+	if t.Arch().Width() == 8 {
+		return &rlimit64{}, nil
+	}
+	// No representation is defined for any other width.
+	return nil, syserror.ENOSYS
+}
+
+// rlimit64 is the rlimit implementation that newRlimit uses on 64-bit systems.
+type rlimit64 struct {
+	Cur, Max uint64
+}
+
+// toLimit converts r from its Linux encoding to a limits.Limit.
+func (r *rlimit64) toLimit() *limits.Limit {
+	return &limits.Limit{Cur: limits.FromLinux(r.Cur), Max: limits.FromLinux(r.Max)}
+}
+
+// fromLimit overwrites r with the Linux encoding of lim.
+func (r *rlimit64) fromLimit(lim limits.Limit) {
+	r.Cur = limits.ToLinux(lim.Cur)
+	r.Max = limits.ToLinux(lim.Max)
+}
+
+// copyIn fills r from the application's memory at addr.
+func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error {
+	_, err := t.CopyIn(addr, r)
+	return err
+}
+
+// copyOut writes a copy of r to the application's memory at addr.
+func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error {
+	_, err := t.CopyOut(addr, *r)
+	return err
+}
+
+// makeRlimit64 builds an rlimit64 from lim, using the same conversion as fromLimit.
+func makeRlimit64(lim limits.Limit) *rlimit64 {
+	return &rlimit64{Cur: limits.ToLinux(lim.Cur), Max: limits.ToLinux(lim.Max)}
+}
+
+// setableLimits is the set of limits that prlimit64 allows callers to change.
+var setableLimits = map[limits.LimitType]struct{}{
+	limits.NumberOfFiles: {},
+	limits.AS:            {},
+	limits.CPU:           {},
+	limits.Data:          {},
+	limits.FileSize:      {},
+	limits.MemoryLocked:  {},
+	limits.Stack:         {},
+	// These are not enforced, but we include them here to avoid returning
+	// EPERM (as prlimit64 does for absent entries); some apps expect success.
+	limits.Core:         {},
+	limits.ProcessCount: {},
+}
+
+// prlimit64 reads (newLim == nil) or replaces (newLim != nil) t's limit for
+// resource. When replacing, the previous limit is returned.
+func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) {
+	if newLim == nil {
+		return t.ThreadGroup().Limits().Get(resource), nil
+	}
+	if _, settable := setableLimits[resource]; !settable {
+		return limits.Limit{}, syserror.EPERM
+	}
+
+	// "A privileged process (under Linux: one with the CAP_SYS_RESOURCE
+	// capability in the initial user namespace) may make arbitrary changes
+	// to either limit value."
+	rootNS := t.Kernel().RootUserNamespace()
+	privileged := t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, rootNS)
+	prev, err := t.ThreadGroup().Limits().Set(resource, *newLim, privileged)
+	if err != nil {
+		return limits.Limit{}, err
+	}
+	if resource == limits.CPU {
+		t.NotifyRlimitCPUUpdated()
+	}
+	return prev, nil
+}
+
+// Getrlimit implements linux syscall getrlimit(2).
+func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	resource, known := limits.FromLinuxResource[int(args[0].Int())]
+	if !known {
+		// Unknown resource number.
+		return 0, nil, syserror.EINVAL
+	}
+
+	rlim, err := newRlimit(t)
+	if err != nil {
+		return 0, nil, err
+	}
+	cur, err := prlimit64(t, resource, nil)
+	if err != nil {
+		return 0, nil, err
+	}
+	rlim.fromLimit(cur)
+	return 0, nil, rlim.copyOut(t, args[1].Pointer())
+}
+
+// Setrlimit implements linux syscall setrlimit(2).
+func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	resource, known := limits.FromLinuxResource[int(args[0].Int())]
+	if !known {
+		// Unknown resource number.
+		return 0, nil, syserror.EINVAL
+	}
+	rlim, err := newRlimit(t)
+	if err != nil {
+		return 0, nil, err
+	}
+	// Any failure to read the new limit is reported as EFAULT.
+	if rlim.copyIn(t, args[1].Pointer()) != nil {
+		return 0, nil, syserror.EFAULT
+	}
+	_, err = prlimit64(t, resource, rlim.toLimit())
+	return 0, nil, err
+}
+
+// Prlimit64 implements linux syscall prlimit64(2).
+func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := kernel.ThreadID(args[0].Int())
+	resource, known := limits.FromLinuxResource[int(args[1].Int())]
+	if !known {
+		return 0, nil, syserror.EINVAL
+	}
+	newAddr, oldAddr := args[2].Pointer(), args[3].Pointer()
+
+	// A null new-limit pointer makes this a query only.
+	var newLim *limits.Limit
+	if newAddr != 0 {
+		var in rlimit64
+		if in.copyIn(t, newAddr) != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		newLim = in.toLimit()
+	}
+
+	// Resolve the target task: tid 0 means the caller itself.
+	target := t
+	switch {
+	case tid < 0:
+		return 0, nil, syserror.EINVAL
+	case tid > 0:
+		if target = t.PIDNamespace().TaskWithID(tid); target == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	// "To set or get the resources of a process other than itself, the caller
+	// must have the CAP_SYS_RESOURCE capability, or the real, effective, and
+	// saved set user IDs of the target process must match the real user ID of
+	// the caller and the real, effective, and saved set group IDs of the
+	// target process must match the real group ID of the caller."
+	if target != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
+		mine, theirs := t.Credentials(), target.Credentials()
+		sameUser := mine.RealKUID == theirs.RealKUID &&
+			mine.RealKUID == theirs.EffectiveKUID &&
+			mine.RealKUID == theirs.SavedKUID
+		sameGroup := mine.RealKGID == theirs.RealKGID &&
+			mine.RealKGID == theirs.EffectiveKGID &&
+			mine.RealKGID == theirs.SavedKGID
+		if !sameUser || !sameGroup {
+			return 0, nil, syserror.EPERM
+		}
+	}
+
+	prev, err := prlimit64(target, resource, newLim)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if oldAddr != 0 {
+		if makeRlimit64(prev).copyOut(t, oldAddr) != nil {
+			return 0, nil, syserror.EFAULT
+		}
+	}
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go
new file mode 100644
index 000000000..90db10ea6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rseq.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// RSeq implements syscall rseq(2).
+func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Uint()
+	flags := args[2].Int()
+	signature := args[3].Uint()
+
+	if !t.RSeqAvailable() {
+		// Event for applications that want rseq on a configuration
+		// that doesn't support them.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+
+	switch flags {
+	case 0:
+		// Register.
+		return 0, nil, t.SetRSeq(addr, length, signature)
+	case linux.RSEQ_FLAG_UNREGISTER:
+		return 0, nil, t.ClearRSeq(addr, length, signature)
+	default:
+		// Unknown flag.
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go
new file mode 100644
index 000000000..1674c7445
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rusage.go
@@ -0,0 +1,112 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// getrusage builds the linux.Rusage for t selected by which, one of the
+// linux.RUSAGE_* values. Only UTime, STime, NVCSw and MaxRSS are populated.
+func getrusage(t *kernel.Task, which int32) linux.Rusage {
+	var stats usage.CPUStats
+	tg := t.ThreadGroup()
+
+	switch which {
+	case linux.RUSAGE_SELF:
+		stats = tg.CPUStats()
+	case linux.RUSAGE_CHILDREN:
+		stats = tg.JoinedChildCPUStats()
+	case linux.RUSAGE_THREAD:
+		stats = t.CPUStats()
+	case linux.RUSAGE_BOTH:
+		stats = tg.CPUStats()
+		stats.Accumulate(tg.JoinedChildCPUStats())
+	}
+	// Any other value of which leaves the CPU statistics zeroed.
+
+	var ru linux.Rusage
+	ru.UTime = linux.NsecToTimeval(stats.UserTime.Nanoseconds())
+	ru.STime = linux.NsecToTimeval(stats.SysTime.Nanoseconds())
+	ru.NVCSw = int64(stats.VoluntarySwitches)
+	ru.MaxRSS = int64(t.MaxRSS(which) / 1024)
+	return ru
+}
+
+// Getrusage implements linux syscall getrusage(2).
+//	marked "y" are filled in by getrusage
+//	marked "*" are not used on Linux
+//	marked "p" are pending for support
+//
+//	y    struct timeval ru_utime; /* user CPU time used */
+//	y    struct timeval ru_stime; /* system CPU time used */
+//	y    long   ru_maxrss;        /* maximum resident set size */
+//	*    long   ru_ixrss;         /* integral shared memory size */
+//	*    long   ru_idrss;         /* integral unshared data size */
+//	*    long   ru_isrss;         /* integral unshared stack size */
+//	p    long   ru_minflt;        /* page reclaims (soft page faults) */
+//	p    long   ru_majflt;        /* page faults (hard page faults) */
+//	*    long   ru_nswap;         /* swaps */
+//	p    long   ru_inblock;       /* block input operations */
+//	p    long   ru_oublock;       /* block output operations */
+//	*    long   ru_msgsnd;        /* IPC messages sent */
+//	*    long   ru_msgrcv;        /* IPC messages received */
+//	*    long   ru_nsignals;      /* signals received */
+//	y    long   ru_nvcsw;         /* voluntary context switches */
+//	p    long   ru_nivcsw;        /* involuntary context switches */
+func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := args[0].Int()
+	switch which {
+	case linux.RUSAGE_SELF, linux.RUSAGE_CHILDREN, linux.RUSAGE_THREAD:
+		// Accepted selectors.
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+	ru := getrusage(t, which)
+	_, err := t.CopyOut(args[1].Pointer(), &ru)
+	return 0, nil, err
+}
+
+// Times implements linux syscall times(2).
+func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	// The return value is the current time in clock ticks. It is computed
+	// first because Linux allows a NULL addr, in which case it is the only
+	// output and no further work is needed.
+	now := ktime.NowFromContext(t)
+	ticks := uintptr(now.Nanoseconds() / linux.ClockTick.Nanoseconds())
+	if addr == 0 {
+		return ticks, nil, nil
+	}
+
+	tg := t.ThreadGroup()
+	self, children := tg.CPUStats(), tg.JoinedChildCPUStats()
+	tms := linux.Tms{
+		UTime:  linux.ClockTFromDuration(self.UserTime),
+		STime:  linux.ClockTFromDuration(self.SysTime),
+		CUTime: linux.ClockTFromDuration(children.UserTime),
+		CSTime: linux.ClockTFromDuration(children.SysTime),
+	}
+	if _, err := t.CopyOut(addr, &tms); err != nil {
+		return 0, nil, err
+	}
+	return ticks, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
new file mode 100644
index 000000000..99f6993f5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -0,0 +1,99 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	onlyScheduler = linux.SCHED_NORMAL // the one policy reported and accepted
+	onlyPriority  = 0                  // the one priority reported and accepted
+)
+
+// SchedParam replicates struct sched_param in sched.h.
+type SchedParam struct {
+	schedPriority int64 // NOTE(review): sched.h declares sched_priority as int; confirm the 8-byte width is intended
+}
+
+// SchedGetparam implements linux syscall sched_getparam(2).
+func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := args[0].Int()
+	param := args[1].Pointer()
+
+	// A null output pointer and a negative pid are both invalid.
+	if param == 0 || pid < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	// A non-zero pid must name a task in the caller's PID namespace.
+	if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+		return 0, nil, syserror.ESRCH
+	}
+	// Every task reports the single supported priority.
+	out := SchedParam{schedPriority: onlyPriority}
+	if _, err := t.CopyOut(param, out); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// SchedGetscheduler implements linux syscall sched_getscheduler(2).
+func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := args[0].Int()
+	switch {
+	case pid < 0:
+		return 0, nil, syserror.EINVAL
+	case pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil:
+		return 0, nil, syserror.ESRCH
+	}
+	return onlyScheduler, nil, nil
+}
+
+// SchedSetscheduler implements linux syscall sched_setscheduler(2).
+func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := args[0].Int()
+	policy := args[1].Int()
+	param := args[2].Pointer()
+	// Only the one supported policy may be requested, for a valid pid.
+	if pid < 0 || policy != onlyScheduler {
+		return 0, nil, syserror.EINVAL
+	}
+	if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+		return 0, nil, syserror.ESRCH
+	}
+	// The requested priority must be readable and equal to the only
+	// supported value; both failures are reported as EINVAL.
+	var want SchedParam
+	if _, err := t.CopyIn(param, &want); err != nil {
+		return 0, nil, syserror.EINVAL
+	}
+	if want.schedPriority != onlyPriority {
+		return 0, nil, syserror.EINVAL
+	}
+	return 0, nil, nil
+}
+
+// SchedGetPriorityMax implements sched_get_priority_max(2); the result is always onlyPriority.
+func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return onlyPriority, nil, nil
+}
+
+// SchedGetPriorityMin implements sched_get_priority_min(2); the result is always onlyPriority.
+func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return onlyPriority, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
new file mode 100644
index 000000000..5b7a66f4d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
+type userSockFprog struct {
+	// Len is the number of BPF instructions in the filter.
+	Len uint16
+
+	_ [6]byte // padding so that Filter starts at offset 8
+
+	// Filter is a user pointer to the struct sock_filter array that makes up
+	// the filter program. Filter is a uint64 rather than a usermem.Addr
+	// because usermem.Addr is actually uintptr, which is not a fixed-size
+	// type, and encoding/binary.Read objects to this.
+	Filter uint64
+}
+
+// seccomp applies a seccomp policy to the current task.
+//
+// Only mode SECCOMP_SET_MODE_FILTER and flag SECCOMP_FILTER_FLAG_TSYNC are
+// accepted; anything else yields EINVAL. addr points to a userSockFprog.
+func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
+	if mode != linux.SECCOMP_SET_MODE_FILTER {
+		return syserror.EINVAL
+	}
+	// The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC.
+	if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 {
+		return syserror.EINVAL
+	}
+	tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0
+
+	// Read the program header, then the instruction array it points to.
+	var header userSockFprog
+	if _, err := t.CopyIn(addr, &header); err != nil {
+		return err
+	}
+	insns := make([]linux.BPFInstruction, int(header.Len))
+	if _, err := t.CopyIn(usermem.Addr(header.Filter), &insns); err != nil {
+		return err
+	}
+
+	// A program that does not compile is rejected with EINVAL.
+	prog, err := bpf.Compile(insns)
+	if err != nil {
+		t.Debugf("Invalid seccomp-bpf filter: %v", err)
+		return syserror.EINVAL
+	}
+	return t.AppendSyscallFilter(prog, tsync)
+}
+
+// Seccomp implements linux syscall seccomp(2): args are (mode, flags, filter address).
+func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer())
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
new file mode 100644
index 000000000..5f54f2456
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -0,0 +1,241 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"math"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const opsMax = 500 // SEMOPM: most operations Semop accepts in one call
+
+// Semget handles: semget(key_t key, int nsems, int semflg)
+func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	key := args[0].Int()
+	nsems := args[1].Int()
+	flag := args[2].Int()
+
+	private := key == linux.IPC_PRIVATE
+	create := flag&linux.IPC_CREAT == linux.IPC_CREAT
+	exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL
+	mode := linux.FileMode(flag & 0777)
+
+	r := t.IPCNamespace().SemaphoreRegistry()
+	set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(set.ID), nil, nil
+}
+
+// Semop handles: semop(int semid, struct sembuf *sops, size_t nsops)
+func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Int()
+	sembufAddr := args[1].Pointer()
+	nsops := args[2].SizeT()
+
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	switch {
+	case set == nil:
+		return 0, nil, syserror.EINVAL
+	case nsops <= 0:
+		return 0, nil, syserror.EINVAL
+	case nsops > opsMax:
+		return 0, nil, syserror.E2BIG
+	}
+
+	// Copy the operation array in from the application.
+	ops := make([]linux.Sembuf, nsops)
+	if _, err := t.CopyIn(sembufAddr, ops); err != nil {
+		return 0, nil, err
+	}
+
+	creds := auth.CredentialsFromContext(t)
+	pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
+	for {
+		// A nil channel or an error means the operations are finished
+		// (either on success or a failure); otherwise wait and retry.
+		ch, num, err := set.ExecuteOps(t, ops, creds, int32(pid))
+		if ch == nil || err != nil {
+			return 0, nil, err
+		}
+		if err = t.Block(ch); err != nil {
+			set.AbortWait(num, ch)
+			return 0, nil, err
+		}
+	}
+}
+
+// Semctl handles: semctl(int semid, int semnum, int cmd, ...)
+func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Int()
+	num := args[1].Int()
+	cmd := args[2].Int()
+
+	switch cmd {
+	case linux.SETVAL:
+		val := args[3].Int()
+		if val > math.MaxInt16 {
+			return 0, nil, syserror.ERANGE
+		}
+		return 0, nil, setVal(t, id, num, int16(val))
+
+	case linux.SETALL:
+		array := args[3].Pointer()
+		return 0, nil, setValAll(t, id, array)
+
+	case linux.GETVAL:
+		v, err := getVal(t, id, num)
+		return uintptr(v), nil, err
+
+	case linux.GETALL:
+		array := args[3].Pointer()
+		return 0, nil, getValAll(t, id, array)
+
+	case linux.IPC_RMID:
+		return 0, nil, remove(t, id)
+
+	case linux.IPC_SET:
+		arg := args[3].Pointer()
+		s := linux.SemidDS{}
+		if _, err := t.CopyIn(arg, &s); err != nil {
+			return 0, nil, err
+		}
+
+		perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777))
+		return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms)
+
+	case linux.GETPID:
+		v, err := getPID(t, id, num)
+		return uintptr(v), nil, err
+
+	case linux.IPC_INFO,
+		linux.SEM_INFO,
+		linux.IPC_STAT,
+		linux.SEM_STAT,
+		linux.SEM_STAT_ANY,
+		linux.GETNCNT,
+		linux.GETZCNT:
+
+		t.Kernel().EmitUnimplementedEvent(t)
+		fallthrough
+
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// remove implements the IPC_RMID command of Semctl: it removes semaphore set
+// id from t's IPC namespace using t's credentials.
+func remove(t *kernel.Task, id int32) error {
+	return t.IPCNamespace().SemaphoreRegistry().RemoveID(id, auth.CredentialsFromContext(t))
+}
+
+// ipcSet implements the IPC_SET command of Semctl: it changes the owner and
+// permissions of semaphore set id.
+func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return syserror.EINVAL
+	}
+	// Map the IDs through the caller's user namespace; unmapped IDs fail.
+	creds := auth.CredentialsFromContext(t)
+	kuid := creds.UserNamespace.MapToKUID(uid)
+	if !kuid.Ok() {
+		return syserror.EINVAL
+	}
+	kgid := creds.UserNamespace.MapToKGID(gid)
+	if !kgid.Ok() {
+		return syserror.EINVAL
+	}
+	return set.Change(t, creds, fs.FileOwner{UID: kuid, GID: kgid}, perms)
+}
+
+// setVal implements the SETVAL command of Semctl for semaphore num of set id.
+func setVal(t *kernel.Task, id int32, num int32, val int16) error {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return syserror.EINVAL
+	}
+	// pid is the caller's thread group ID in the global init's PID namespace.
+	pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
+	return set.SetVal(t, num, val, auth.CredentialsFromContext(t), int32(pid))
+}
+
+// setValAll implements the SETALL command of Semctl: it reads set.Size()
+// values for set id from array and applies them all.
+func setValAll(t *kernel.Task, id int32, array usermem.Addr) error {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return syserror.EINVAL
+	}
+	vals := make([]uint16, set.Size())
+	if _, err := t.CopyIn(array, vals); err != nil {
+		return err
+	}
+	pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
+	return set.SetValAll(t, vals, auth.CredentialsFromContext(t), int32(pid))
+}
+
+// getVal implements the GETVAL command of Semctl: it returns the value of
+// semaphore num in set id.
+func getVal(t *kernel.Task, id int32, num int32) (int16, error) {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return 0, syserror.EINVAL
+	}
+	return set.GetVal(num, auth.CredentialsFromContext(t))
+}
+
+// getValAll implements the GETALL command of Semctl: it copies the values
+// returned by the set's GetValAll out to array.
+func getValAll(t *kernel.Task, id int32, array usermem.Addr) error {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return syserror.EINVAL
+	}
+	vals, err := set.GetValAll(auth.CredentialsFromContext(t))
+	if err != nil {
+		return err
+	}
+	_, err = t.CopyOut(array, vals)
+	return err
+}
+
+// getPID implements the GETPID command of Semctl, translating the recorded
+// init-namespace PID into the caller's PID namespace (0 if it no longer exists).
+func getPID(t *kernel.Task, id int32, num int32) (int32, error) {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return 0, syserror.EINVAL
+	}
+	gpid, err := set.GetPID(num, auth.CredentialsFromContext(t))
+	if err != nil {
+		return 0, err
+	}
+	// gpid was recorded in the init PID namespace, so it is resolved there.
+	tg := t.Kernel().GlobalInit().PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid))
+	if tg == nil {
+		return 0, nil
+	}
+	return int32(t.PIDNamespace().IDOfThreadGroup(tg)), nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
new file mode 100644
index 000000000..4a8bc24a2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Shmget implements shmget(2).
+//
+// The low nine bits of the flag argument are the mode; IPC_CREAT and IPC_EXCL are decoded separately.
+func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	key, size, flags := shm.Key(args[0].Int()), uint64(args[1].SizeT()), args[2].Int()
+	isPrivate := key == linux.IPC_PRIVATE
+	wantCreate := flags&linux.IPC_CREAT == linux.IPC_CREAT
+	wantExcl := flags&linux.IPC_EXCL == linux.IPC_EXCL
+	perms := linux.FileMode(flags & 0777)
+
+	// The caller's thread group ID is passed through to the registry.
+	callerPID := int32(t.ThreadGroup().ID())
+	registry := t.IPCNamespace().ShmRegistry()
+	seg, err := registry.FindOrCreate(t, callerPID, key, size, perms, isPrivate, wantCreate, wantExcl)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer seg.DecRef()
+	return uintptr(seg.ID), nil, nil
+}
+
+// findSegment retrieves the shm segment registered under the given id.
+//
+// On success the caller holds a reference on the returned Shm; an id with no
+// segment yields EINVAL.
+func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) {
+	if seg := t.IPCNamespace().ShmRegistry().FindByID(id); seg != nil {
+		return seg, nil
+	}
+
+	// No segment with provided id.
+	return nil, syserror.EINVAL
+}
+
+// Shmat implements shmat(2).
+func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := shm.ID(args[0].Int())
+	addr := args[1].Pointer()
+	flags := args[2].Int()
+	seg, err := findSegment(t, id)
+	if err != nil {
+		return 0, nil, syserror.EINVAL
+	}
+	defer seg.DecRef()
+	// Translate the SHM_* attach flags into the segment's attach options.
+	attach := shm.AttachOpts{
+		Execute:  flags&linux.SHM_EXEC == linux.SHM_EXEC,
+		Readonly: flags&linux.SHM_RDONLY == linux.SHM_RDONLY,
+		Remap:    flags&linux.SHM_REMAP == linux.SHM_REMAP,
+	}
+	mmapOpts, err := seg.ConfigureAttach(t, addr, attach)
+	if err != nil {
+		return 0, nil, err
+	}
+	mapped, err := t.MemoryManager().MMap(t, mmapOpts)
+	return uintptr(mapped), nil, err
+}
+
+// Shmdt implements shmdt(2).
+func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The only argument read is the address handed to DetachShm.
+	detachAt := args[0].Pointer()
+	return 0, nil, t.MemoryManager().DetachShm(t, detachAt)
+}
+
+// Shmctl implements shmctl(2). It dispatches on cmd; buf is the user buffer the command reads or writes.
+func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := shm.ID(args[0].Int())
+	cmd := args[1].Int()
+	buf := args[2].Pointer()
+
+	r := t.IPCNamespace().ShmRegistry() // only consulted by IPC_INFO and SHM_INFO below
+
+	switch cmd {
+	case linux.SHM_STAT:
+		// Technically, we should be treating id as "an index into the kernel's
+		// internal array that maintains information about all shared memory
+		// segments on the system". Since we don't track segments in an array,
+		// we'll just pretend the shmid is the index and do the same thing as
+		// IPC_STAT. Linux also uses the index as the shmid.
+		fallthrough
+	case linux.IPC_STAT:
+		segment, err := findSegment(t, id)
+		if err != nil {
+			return 0, nil, syserror.EINVAL
+		}
+		defer segment.DecRef()
+
+		stat, err := segment.IPCStat(t)
+		if err == nil {
+			_, err = t.CopyOut(buf, stat) // stat is only copied out when IPCStat succeeded
+		}
+		return 0, nil, err
+
+	case linux.IPC_INFO:
+		params := r.IPCInfo()
+		_, err := t.CopyOut(buf, params)
+		return 0, nil, err
+
+	case linux.SHM_INFO:
+		info := r.ShmInfo()
+		_, err := t.CopyOut(buf, info)
+		return 0, nil, err
+	}
+
+	// The remaining commands all operate on one specific segment, so look it up once.
+	segment, err := findSegment(t, id)
+	if err != nil {
+		return 0, nil, syserror.EINVAL
+	}
+	defer segment.DecRef()
+
+	switch cmd {
+	case linux.IPC_SET:
+		var ds linux.ShmidDS
+		_, err = t.CopyIn(buf, &ds)
+		if err != nil {
+			return 0, nil, err
+		}
+		err = segment.Set(t, &ds)
+		return 0, nil, err
+
+	case linux.IPC_RMID:
+		segment.MarkDestroyed()
+		return 0, nil, nil
+
+	case linux.SHM_LOCK, linux.SHM_UNLOCK:
+		// Memory locking is not supported anywhere at present.
+		// mlock(2)/munlock(2) are stubbed out as no-ops, so these commands
+		// report success too, after emitting an unimplemented event.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, nil
+
+	default:
+		return 0, nil, syserror.EINVAL // unknown command
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
new file mode 100644
index 000000000..d2b0012ae
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -0,0 +1,590 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/signalfd"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// mayKill reports whether t is permitted to send sig to target.
+//
+// "For a process to have permission to send a signal it must
+//   - either be privileged (CAP_KILL), or
+//   - the real or effective user ID of the sending process must be equal to
+//     the real or saved set-user-ID of the target process.
+//
+// In the case of SIGCONT it suffices when the sending and receiving processes
+// belong to the same session." - kill(2)
+//
+// Equivalent to kernel/signal.c:check_kill_permission.
+func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool {
+	// Tasks in one thread group may always signal each other. kill(2) omits
+	// this because it cannot address individual tasks, but
+	// kernel/signal.c:check_kill_permission allows it.
+	if t.ThreadGroup() == target.ThreadGroup() {
+		return true
+	}
+
+	if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) {
+		return true
+	}
+
+	senderCreds := t.Credentials()
+	targetCreds := target.Credentials()
+	switch {
+	case senderCreds.EffectiveKUID == targetCreds.SavedKUID,
+		senderCreds.EffectiveKUID == targetCreds.RealKUID,
+		senderCreds.RealKUID == targetCreds.SavedKUID,
+		senderCreds.RealKUID == targetCreds.RealKUID:
+		return true
+	}
+
+	// Without a UID match, only SIGCONT within the sender's session is allowed.
+	return sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session()
+}
+
+// Kill implements linux syscall kill(2). pid selects the target: one process (> 0), all permitted processes (-1), or a process group (0 or < -1).
+func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	sig := linux.Signal(args[1].Int())
+
+	switch {
+	case pid > 0:
+		// "If pid is positive, then signal sig is sent to the process with the
+		// ID specified by pid." - kill(2)
+		// This loops to handle races with execve where target dies between
+		// TaskWithID and SendGroupSignal. Compare Linux's
+		// kernel/signal.c:kill_pid_info().
+		for {
+			target := t.PIDNamespace().TaskWithID(pid)
+			if target == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			if !mayKill(t, target, sig) {
+				return 0, nil, syserror.EPERM
+			}
+			info := &arch.SignalInfo{
+				Signo: int32(sig),
+				Code:  arch.SignalInfoUser,
+			}
+			info.SetPid(int32(target.PIDNamespace().IDOfTask(t))) // NOTE(review): tkillSigInfo reports the sender's thread group ID instead — confirm which is intended.
+			info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow()))
+			if err := target.SendGroupSignal(info); err != syserror.ESRCH {
+				return 0, nil, err
+			}
+		}
+	case pid == -1:
+		// "If pid equals -1, then sig is sent to every process for which the
+		// calling process has permission to send signals, except for process 1
+		// (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig)
+		// send sig to all processes that the calling process may send signals
+		// to, except possibly for some implementation-defined system
+		// processes. Linux allows a process to signal itself, but on Linux the
+		// call kill(-1,sig) does not signal the calling process."
+		var (
+			lastErr   error
+			delivered int
+		)
+		for _, tg := range t.PIDNamespace().ThreadGroups() {
+			if tg == t.ThreadGroup() {
+				continue
+			}
+			if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID {
+				continue
+			}
+
+			// If pid == -1, the returned error is the last non-EPERM error
+			// from any call to group_send_sig_info, so EPERM targets are skipped.
+			if !mayKill(t, tg.Leader(), sig) {
+				continue
+			}
+			// Here and below, whether or not kill returns an error may
+			// depend on the iteration order. We at least implement the
+			// semantics documented by the man page: "On success (at least
+			// one signal was sent), zero is returned."
+			info := &arch.SignalInfo{
+				Signo: int32(sig),
+				Code:  arch.SignalInfoUser,
+			}
+			info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
+			info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
+			err := tg.SendSignal(info)
+			if err == syserror.ESRCH {
+				// ESRCH is ignored because it means the task
+				// exited while we were iterating.  This is a
+				// race which would not normally exist on
+				// Linux, so we suppress it.
+				continue
+			}
+			delivered++
+			if err != nil {
+				lastErr = err
+			}
+		}
+		if delivered > 0 {
+			return 0, nil, lastErr
+		}
+		return 0, nil, syserror.ESRCH
+	default:
+		// "If pid equals 0, then sig is sent to every process in the process
+		// group of the calling process."
+		//
+		// "If pid is less than -1, then sig is sent to every process
+		// in the process group whose ID is -pid."
+		pgid := kernel.ProcessGroupID(-pid)
+		if pgid == 0 {
+			pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
+		}
+
+		// If pid != -1 (i.e. signalling a process group), the returned error
+		// is the last error from any call to group_send_sig_info.
+		lastErr := syserror.ESRCH
+		for _, tg := range t.PIDNamespace().ThreadGroups() {
+			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
+				if !mayKill(t, tg.Leader(), sig) {
+					lastErr = syserror.EPERM
+					continue
+				}
+
+				info := &arch.SignalInfo{
+					Signo: int32(sig),
+					Code:  arch.SignalInfoUser,
+				}
+				info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
+				info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
+				// See the note above regarding the ESRCH race.
+				if err := tg.SendSignal(info); err != syserror.ESRCH {
+					lastErr = err
+				}
+			}
+		}
+
+		return 0, nil, lastErr
+	}
+}
+
+// tkillSigInfo builds the SignalInfoTkill siginfo for sig, with sender's IDs as seen from receiver's namespaces.
+func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo {
+	senderPID := receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup())
+	senderUID := sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()
+	info := &arch.SignalInfo{Signo: int32(sig), Code: arch.SignalInfoTkill}
+	info.SetPid(int32(senderPID))
+	info.SetUid(int32(senderUID))
+	return info
+}
+
+// Tkill implements linux syscall tkill(2).
+func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := kernel.ThreadID(args[0].Int())
+	sig := linux.Signal(args[1].Int())
+
+	// N.B. Linux rejects a non-positive tid with EINVAL, which the man page
+	// does not mention. Not all signal calls behave this way.
+	if tid <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	target := t.PIDNamespace().TaskWithID(tid)
+	switch {
+	case target == nil:
+		return 0, nil, syserror.ESRCH
+	case !mayKill(t, target, sig):
+		return 0, nil, syserror.EPERM
+	}
+	// The signal goes to that one task, with the siginfo built by tkillSigInfo.
+	return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig))
+}
+
+// Tgkill implements linux syscall tgkill(2).
+func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tgid := kernel.ThreadID(args[0].Int())
+	tid := kernel.ThreadID(args[1].Int())
+	sig := linux.Signal(args[2].Int())
+
+	// N.B. Linux rejects a non-positive tgid or tid with EINVAL, which the man
+	// page does not mention. Not all signal calls behave this way.
+	if tgid <= 0 || tid <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// The task must exist and must belong to the named thread group.
+	group := t.PIDNamespace().ThreadGroupWithID(tgid)
+	target := t.PIDNamespace().TaskWithID(tid)
+	if group == nil || target == nil || target.ThreadGroup() != group {
+		return 0, nil, syserror.ESRCH
+	}
+	if !mayKill(t, target, sig) {
+		return 0, nil, syserror.EPERM
+	}
+	return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig))
+}
+
+// RtSigaction implements linux syscall rt_sigaction(2).
+func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sig := linux.Signal(args[0].Int())
+	newAddr := args[1].Pointer()
+	oldAddr := args[2].Pointer()
+
+	if args[3].SizeT() != linux.SignalSetSize {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A null new-action pointer leaves newAct nil, and SetSignalAct is given that nil.
+	var newAct *arch.SignalAct
+	if newAddr != 0 {
+		act, err := t.CopyInSignalAct(newAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		newAct = &act
+	}
+	prev, err := t.ThreadGroup().SetSignalAct(sig, newAct)
+	if err != nil {
+		return 0, nil, err
+	}
+	if oldAddr != 0 {
+		if err := t.CopyOutSignalAct(oldAddr, &prev); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Sigreturn implements linux syscall sigreturn(2) by calling SignalReturn(false).
+func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	control, err := t.SignalReturn(false)
+	return 0, control, err
+}
+
+// RtSigreturn implements linux syscall rt_sigreturn(2) by calling SignalReturn(true).
+func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	control, err := t.SignalReturn(true)
+	return 0, control, err
+}
+
+// RtSigprocmask implements linux syscall rt_sigprocmask(2).
+func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	how := args[0].Int()
+	newMaskAddr := args[1].Pointer()
+	oldMaskAddr := args[2].Pointer()
+	sigsetsize := args[3].SizeT()
+	if sigsetsize != linux.SignalSetSize {
+		return 0, nil, syserror.EINVAL
+	}
+	// Snapshot the mask before any change so the old value can be reported.
+	prev := t.SignalMask()
+	if newMaskAddr != 0 {
+		req, err := CopyInSigSet(t, newMaskAddr, sigsetsize)
+		if err != nil {
+			return 0, nil, err
+		}
+		var next linux.SignalSet
+		switch how {
+		case linux.SIG_BLOCK:
+			next = prev | req
+		case linux.SIG_UNBLOCK:
+			next = prev &^ req
+		case linux.SIG_SETMASK:
+			next = req
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+		t.SetSignalMask(next)
+	}
+	if oldMaskAddr != 0 {
+		return 0, nil, copyOutSigSet(t, oldMaskAddr, prev)
+	}
+	return 0, nil, nil
+}
+
+// Sigaltstack implements linux syscall sigaltstack(2).
+func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	newStackAddr := args[0].Pointer()
+	oldStackAddr := args[1].Pointer()
+
+	// The current stack is read, and reported, before any replacement is installed.
+	current := t.SignalStack()
+	if oldStackAddr != 0 {
+		if err := t.CopyOutSignalStack(oldStackAddr, &current); err != nil {
+			return 0, nil, err
+		}
+	}
+	if newStackAddr == 0 {
+		return 0, nil, nil
+	}
+	replacement, err := t.CopyInSignalStack(newStackAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// The signal stack cannot be changed while the task is running on it.
+	// This is enforced at the lowest level because the same rule applies
+	// when a signal handler changes the stack through its ucontext.
+	if !t.SetSignalStack(replacement) {
+		return 0, nil, syserror.EPERM
+	}
+	return 0, nil, nil
+}
+
+// Pause implements linux syscall pause(2); the result of t.Block(nil) is converted via ConvertIntr with ERESTARTNOHAND.
+func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+}
+
+// RtSigpending implements linux syscall rt_sigpending(2).
+func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Only the destination pointer (first argument) is read from args.
+	pendingSet := t.PendingSignals()
+	_, err := pendingSet.CopyOut(t, args[0].Pointer())
+	return 0, nil, err
+}
+
+// RtSigtimedwait implements linux syscall rt_sigtimedwait(2).
+func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sigset := args[0].Pointer()
+	siginfo := args[1].Pointer()
+	timespec := args[2].Pointer()
+	sigsetsize := args[3].SizeT()
+
+	waitSet, err := CopyInSigSet(t, sigset, sigsetsize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Without a timespec the timeout is the largest representable duration.
+	timeout := time.Duration(math.MaxInt64)
+	if timespec != 0 {
+		ts, err := copyTimespecIn(t, timespec)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !ts.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(ts.ToNsecCapped())
+	}
+
+	info, err := t.Sigtimedwait(waitSet, timeout)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The siginfo is only written back when the caller supplied a buffer.
+	if siginfo != 0 {
+		info.FixSignalCodeForUser()
+		if _, err := info.CopyOut(t, siginfo); err != nil {
+			return 0, nil, err
+		}
+	}
+	return uintptr(info.Signo), nil, nil
+}
+
+// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2).
+func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	sig := linux.Signal(args[1].Int())
+	infoAddr := args[2].Pointer()
+
+	// Copy in the info.
+	//
+	// Signo is forced to sig (Linux overrides it in the same way) and the
+	// code must be in the allowed set. RtTgsigqueueinfo below applies the
+	// same logic; keep the two in sync.
+	var info arch.SignalInfo
+	if _, err := info.CopyIn(t, infoAddr); err != nil {
+		return 0, nil, err
+	}
+	info.Signo = int32(sig)
+
+	// si_codes used by the kernel (>= 0) and SI_TKILL are only acceptable
+	// when the sender is also the receiver.
+	selfOnly := info.Code >= 0 || info.Code == arch.SignalInfoTkill
+
+	// This must loop to handle the race with execve described in Kill.
+	for {
+		// Deliver to the given task's thread group.
+		target := t.PIDNamespace().TaskWithID(pid)
+		switch {
+		case target == nil:
+			return 0, nil, syserror.ESRCH
+		case selfOnly && target != t:
+			return 0, nil, syserror.EPERM
+		case !mayKill(t, target, sig):
+			return 0, nil, syserror.EPERM
+		}
+
+		// Anything but ESRCH is final; ESRCH sends us around the loop again.
+		if err := target.SendGroupSignal(&info); err != syserror.ESRCH {
+			return 0, nil, err
+		}
+	}
+}
+
+// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2).
+func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tgid := kernel.ThreadID(args[0].Int())
+	tid := kernel.ThreadID(args[1].Int())
+	sig := linux.Signal(args[2].Int())
+	infoAddr := args[3].Pointer()
+
+	// N.B. Linux rejects a non-positive tgid or tid with EINVAL, which the man
+	// page does not mention. Not all signal calls behave this way.
+	if tgid <= 0 || tid <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in the info. See RtSigqueueinfo above.
+	var info arch.SignalInfo
+	if _, err := info.CopyIn(t, infoAddr); err != nil {
+		return 0, nil, err
+	}
+	info.Signo = int32(sig)
+
+	// Deliver to the given task, which must belong to thread group tgid.
+	group := t.PIDNamespace().ThreadGroupWithID(tgid)
+	target := t.PIDNamespace().TaskWithID(tid)
+	if group == nil || target == nil || target.ThreadGroup() != group {
+		return 0, nil, syserror.ESRCH
+	}
+
+	// If the sender is not the receiver, it can't use si_codes used by the
+	// kernel or SI_TKILL.
+	kernelOrTkill := info.Code >= 0 || info.Code == arch.SignalInfoTkill
+	if target != t && kernelOrTkill {
+		return 0, nil, syserror.EPERM
+	}
+	if !mayKill(t, target, sig) {
+		return 0, nil, syserror.EPERM
+	}
+	return 0, nil, target.SendSignal(&info)
+}
+
+// RtSigsuspend implements linux syscall rt_sigsuspend(2).
+func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// As in rt_sigaction and rt_sigprocmask, any sigsetsize other than SignalSetSize is EINVAL.
+	if args[1].SizeT() != linux.SignalSetSize {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in the signal mask; unblockable signals are stripped from it.
+	var mask linux.SignalSet
+	if _, err := mask.CopyIn(t, args[0].Pointer()); err != nil {
+		return 0, nil, err
+	}
+	mask &^= kernel.UnblockableSignals
+	// Swap the mask, remembering the old one as the saved mask, then wait.
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask)
+	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+}
+
+// RestartSyscall implements the linux syscall restart_syscall(2).
+func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	block := t.SyscallRestartBlock()
+	if block == nil {
+		// This should not happen unless ptrace set ERESTART_RESTARTBLOCK
+		// without the current syscall having set up a restart block; if
+		// ptrace did not touch the return value, a nil block is a bug. Linux
+		// never has a null restart function: it (re)initializes it with one
+		// that translates the restart into EINTR. We emulate that behaviour.
+		t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?")
+		return 0, nil, syserror.EINTR
+	}
+	n, err := block.Restart(t)
+	return n, nil, err
+}
+
+// sharedSignalfd is the common implementation behind Signalfd and Signalfd4.
+func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	// Copy in the signal mask.
+	mask, err := CopyInSigSet(t, sigset, sigsetsize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Always check for valid flags, even if not creating.
+	if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is this a change to an existing signalfd?
+	//
+	// The spec indicates that this should adjust the mask.
+	if fd != -1 {
+		existing := t.GetFile(fd)
+		if existing == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer existing.DecRef()
+
+		// Only a signalfd can have its mask replaced; any other kind of
+		// file is rejected.
+		sfd, ok := existing.FileOperations.(*signalfd.SignalOperations)
+		if !ok {
+			return 0, nil, syserror.EINVAL
+		}
+		sfd.SetMask(mask)
+		return 0, nil, nil
+	}
+
+	// Create a new file.
+	created, err := signalfd.New(t, mask)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer created.DecRef()
+
+	// Set appropriate flags.
+	created.SetFlags(fs.SettableFileFlags{
+		NonBlocking: flags&linux.SFD_NONBLOCK != 0,
+	})
+
+	// Create a new descriptor.
+	newFD, err := t.NewFDFrom(0, created, kernel.FDFlags{
+		CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Done.
+	return uintptr(newFD), nil, nil
+}
+
+// Signalfd implements the linux syscall signalfd(2).
+func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// No flags argument is read here, so the shared implementation gets 0.
+	fd, maskAddr := args[0].Int(), args[1].Pointer()
+	maskSize := args[2].SizeT()
+	return sharedSignalfd(t, fd, maskAddr, maskSize, 0)
+}
+
+// Signalfd4 implements the linux syscall signalfd4(2).
+func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The fourth argument carries the flags checked by sharedSignalfd.
+	fd := args[0].Int()
+	maskAddr := args[1].Pointer()
+	maskSize, flags := args[2].SizeT(), args[3].Int()
+	return sharedSignalfd(t, fd, maskAddr, maskSize, flags)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
new file mode 100644
index 000000000..0760af77b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -0,0 +1,1138 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers up to INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset from the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset from the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4 // size in bytes of an int32
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+	// Name is the optional pointer to a network address buffer.
+	Name uint64
+
+	// NameLen is the length of the buffer pointed to by Name.
+	NameLen uint32
+	_       uint32 // blank field; keeps Iov at offset 16, where CopyInMessageHeader64 reads it
+
+	// Iov is a pointer to an array of io vectors that describe the memory
+	// locations involved in the io operation.
+	Iov uint64
+
+	// IovLen is the length of the array pointed to by Iov.
+	IovLen uint64
+
+	// Control is the optional pointer to ancillary control data.
+	Control uint64
+
+	// ControlLen is the length of the data pointed to by Control.
+	ControlLen uint64
+
+	// Flags on the sent/received message.
+	Flags int32
+	_     int32 // blank field after Flags; CopyInMessageHeader64 does not read it
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct
+// used in the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+	msgHdr MessageHeader64
+	msgLen uint32
+	_      int32 // blank field; presumably the trailing padding of mmsghdr — confirm
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+	// Read everything up to and including Flags; the bytes after it are not needed.
+	raw := t.CopyScratchBuffer(flagsOffset + sizeOfInt32)
+	if _, err := t.CopyInBytes(addr, raw); err != nil {
+		return err
+	}
+	order := usermem.ByteOrder
+	msg.Name = order.Uint64(raw[0:])
+	msg.NameLen = order.Uint32(raw[nameLenOffset:])
+	msg.Iov = order.Uint64(raw[16:])
+	msg.IovLen = order.Uint64(raw[24:])
+	msg.Control = order.Uint64(raw[32:])
+	msg.ControlLen = order.Uint64(raw[controlLenOffset:])
+	msg.Flags = int32(order.Uint32(raw[flagsOffset:]))
+	return nil
+}
+
+// CaptureAddress allocates a buffer of addrlen bytes and fills it with the
+// socket address found at addr in the untrusted address space.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+	// Refuse oversized lengths before allocating anything.
+	if addrlen > maxAddrLen {
+		return nil, syserror.EINVAL
+	}
+	captured := make([]byte, addrlen)
+	_, err := t.CopyInBytes(addr, captured)
+	if err != nil {
+		return nil, err
+	}
+	return captured, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the untrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+	// Get the buffer length.
+	var userLen uint32
+	if _, err := t.CopyIn(addrLenPtr, &userLen); err != nil {
+		return err
+	}
+	// A length that is negative when read as an int32 is invalid.
+	if int32(userLen) < 0 {
+		return syserror.EINVAL
+	}
+
+	// Write the length unconditionally.
+	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+		return err
+	}
+	if addr == nil {
+		return nil
+	}
+
+	// Copy as much of the address as will fit in the buffer: the smallest of
+	// the user's buffer length, addrLen and the encoded size.
+	encoded := binary.Marshal(nil, usermem.ByteOrder, addr)
+	n := userLen
+	if n > addrLen {
+		n = addrLen
+	}
+	if n > uint32(len(encoded)) {
+		n = uint32(len(encoded))
+	}
+	_, err := t.CopyOutBytes(addrPtr, encoded[:int(n)])
+	return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+
+	// Only the low four type bits plus SOCK_NONBLOCK and SOCK_CLOEXEC may be set.
+	if stype&^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the new socket.
+	sock, serr := socket.New(t, domain, linux.SockType(stype&0xf), protocol)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+	defer sock.DecRef()
+	sock.SetFlags(fs.SettableFileFlags{
+		NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
+	})
+
+	// Allocate a descriptor for the socket.
+	fdFlags := kernel.FDFlags{CloseOnExec: stype&linux.SOCK_CLOEXEC != 0}
+	fd, err := t.NewFDFrom(0, sock, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+	socks := args[3].Pointer()
+
+	// Check and initialize the flags.
+	if stype&^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the socket pair.
+	left, right, serr := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+	defer left.DecRef()
+	defer right.DecRef()
+
+	// Both ends get the same file flags.
+	fileFlags := fs.SettableFileFlags{NonBlocking: stype&linux.SOCK_NONBLOCK != 0}
+	left.SetFlags(fileFlags)
+	right.SetFlags(fileFlags)
+
+	// Create the FDs for the sockets.
+	fds, err := t.NewFDs(0, []*fs.File{left, right}, kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Copy the file descriptors out.
+	_, err = t.CopyOut(socks, fds)
+	if err == nil {
+		return 0, nil, nil
+	}
+	// The copy failed: take each new descriptor back out of the table and
+	// release the file that Remove hands back.
+	for _, fd := range fds {
+		if file, _ := t.FDTable().Remove(fd); file != nil {
+			file.DecRef()
+		}
+	}
+	return 0, nil, err
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Capture the address, then connect; blocking is set unless the file is non-blocking.
+	sockaddr, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+	blocking := !file.Flags().NonBlocking
+	connErr := sock.Connect(t, sockaddr, blocking).ToError()
+	return 0, nil, syserror.ConvertIntr(connErr, kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+//
+// flags may contain only SOCK_NONBLOCK and SOCK_CLOEXEC; anything else is
+// rejected with EINVAL. The peer address is requested from the socket, and
+// written back through addr/addrLen, only when addrLen is non-zero. On
+// success the descriptor value returned by the socket's Accept is returned.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+	// Check that no unsupported flags are passed in.
+	if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	// Call the syscall implementation for this socket, then copy the
+	// output address if one is specified.
+	blocking := !file.Flags().NonBlocking
+
+	// A zero addrLen pointer means the caller does not want the peer address.
+	peerRequested := addrLen != 0
+	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	if peerRequested {
+		// NOTE(magi): Linux does not give you an error if it can't
+		// write the data back out so neither do we.
+		//
+		// EINVAL is the one write-back failure that is reported.
+		// NOTE(review): nfd is not closed on that path -- confirm this is
+		// intended.
+		if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+			return 0, err
+		}
+	}
+	return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+	flags := int(args[3].Int())
+
+	n, err := accept(t, fd, addr, addrlen, flags)
+	return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	n, err := accept(t, fd, addr, addrlen, 0)
+	return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	backlog := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Per Linux, the backlog is silently capped to reasonable values.
+	if backlog <= 0 {
+		backlog = minListenBacklog
+	}
+	if backlog > maxListenBacklog {
+		backlog = maxListenBacklog
+	}
+
+	return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	how := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Validate how, then call syscall implementation.
+	switch how {
+	case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+//
+// The caller-supplied option length is read from optLenAddr (negative values
+// fail with EINVAL) and passed to getSockOpt. On success the size of the
+// returned value is written back to optLenAddr, followed by the value itself
+// at optValAddr when it is non-nil.
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLenAddr := args[4].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Read the length. Reject negative values.
+	optLen := int32(0)
+	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+		return 0, nil, err
+	}
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Call syscall implementation then copy both value and value len out.
+	v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+
+	// The length written back is the encoded size of v, not the length the
+	// caller passed in.
+	// NOTE(review): binary.Size is applied to v even when v is nil --
+	// confirm that is well-defined.
+	vLen := int32(binary.Size(v))
+	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+		return 0, nil, err
+	}
+
+	// The value is only copied out when the implementation returned one.
+	if v != nil {
+		if _, err := t.CopyOut(optValAddr, v); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return 0, nil, nil
+}
+
+// getSockOpt answers the SOL_SOCKET options SO_TYPE, SO_DOMAIN and
+// SO_PROTOCOL directly from s.Type(), and forwards every other option to the
+// socket implementation's GetSockOpt.
+//
+// For the three options handled here, len must be at least sizeOfInt32;
+// otherwise syserr.ErrInvalidArgument is returned.
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+	if level != linux.SOL_SOCKET {
+		return s.GetSockOpt(t, level, name, optValAddr, len)
+	}
+
+	switch name {
+	case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+		// Each of these is reported as an int32.
+		if len < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		family, skType, protocol := s.Type()
+		switch name {
+		case linux.SO_TYPE:
+			return int32(skType), nil
+		case linux.SO_DOMAIN:
+			return int32(family), nil
+		default: // linux.SO_PROTOCOL
+			return int32(protocol), nil
+		}
+	}
+
+	// Any other SOL_SOCKET option is socket-specific.
+	return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+//
+// The option length must lie in [0, maxOptLen]; otherwise EINVAL is
+// returned. The option value is copied in from optValAddr and passed to the
+// socket's SetSockOpt as a byte slice.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLen := args[4].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Bound the length before it is used to size the copy-in buffer.
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if optLen > maxOptLen {
+		return 0, nil, syserror.EINVAL
+	}
+	// Copy the option value in through a task scratch buffer sized to optLen.
+	buf := t.CopyScratchBuffer(int(optLen))
+	if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+		return 0, nil, err
+	}
+
+	// Call syscall implementation.
+	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Get the socket name and copy it to the caller.
+	v, vl, err := s.GetSockName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Get the socket peer name and copy it to the caller.
+	v, vl, err := s.GetPeerName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+//
+// Only 64-bit tasks are supported. MSG_DONTWAIT is added when the file is
+// non-blocking or when the socket's receive timeout is negative; a positive
+// receive timeout becomes a deadline on the monotonic clock.
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		fd     = args[0].Int()
+		msgPtr = args[1].Pointer()
+		flags  = args[2].Int()
+	)
+
+	// We only handle 64-bit for now.
+	if t.Arch().Width() != 8 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Resolve the descriptor; the reference is held until return.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only sockets can receive messages.
+	sock, isSock := file.FileOperations.(socket.Socket)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Translate the socket's receive timeout into a deadline or a
+	// non-blocking receive.
+	var (
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	switch timeout := sock.RecvTimeout(); {
+	case timeout > 0:
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(timeout) * time.Nanosecond)
+		haveDeadline = true
+	case timeout < 0:
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	received, err := recvSingleMsg(t, sock, msgPtr, flags, haveDeadline, deadline)
+	return received, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+//
+// Up to vlen messages are received one at a time with recvSingleMsg, and the
+// length of each is written into its multiple-message header. An explicit
+// timeout argument takes precedence over the socket's receive timeout. The
+// number of messages received is returned; an error is only reported when
+// no message was received at all.
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		fd     = args[0].Int()
+		msgPtr = args[1].Pointer()
+		vlen   = args[2].Uint()
+		flags  = args[3].Int()
+		toPtr  = args[4].Pointer()
+	)
+
+	// We only handle 64-bit for now.
+	if t.Arch().Width() != 8 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Resolve the descriptor; the reference is held until return.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only sockets can receive messages.
+	sock, isSock := file.FileOperations.(socket.Socket)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var (
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	if toPtr != 0 {
+		// An explicit timeout was supplied; it must be a valid timespec.
+		ts, err := copyTimespecIn(t, toPtr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !ts.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+		haveDeadline = true
+	} else {
+		// Otherwise fall back to the socket's own receive timeout.
+		switch timeout := sock.RecvTimeout(); {
+		case timeout > 0:
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(timeout) * time.Nanosecond)
+			haveDeadline = true
+		case timeout < 0:
+			flags |= linux.MSG_DONTWAIT
+		}
+	}
+
+	var (
+		received uint32
+		lastErr  error
+	)
+	for idx := uint64(0); idx < uint64(vlen); idx++ {
+		hdrAddr, ok := msgPtr.AddLength(idx * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		n, lastErr = recvSingleMsg(t, sock, hdrAddr, flags, haveDeadline, deadline)
+		if lastErr != nil {
+			break
+		}
+
+		// Copy the received length to the caller; it is stored right
+		// after the embedded message header.
+		lenAddr, ok := hdrAddr.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, lastErr = t.CopyOut(lenAddr, uint32(n)); lastErr != nil {
+			break
+		}
+		received++
+	}
+
+	// A failure after at least one message is reported as a short count.
+	if received == 0 {
+		return 0, nil, lastErr
+	}
+	return uintptr(received), nil, nil
+}
+
+// recvSingleMsg receives one message on s into the buffers described by the
+// 64-bit message header at msgPtr and returns the byte count reported by the
+// socket's RecvMsg.
+//
+// When the header requests neither a name nor control data, only the flags
+// field is written back, and only if it differs from the value read in.
+// Otherwise the sender address (if a name was requested), the control length,
+// the control data (if any) and the flags are copied out to the caller.
+func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+	// Capture the message header and io vectors.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	// Bound the iovec count before building the destination sequence.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// FIXME(b/63594852): Pretend we have an empty error queue.
+	if flags&linux.MSG_ERRQUEUE != 0 {
+		return 0, syserror.EAGAIN
+	}
+
+	// Fast path when no control message nor name buffers are provided.
+	if msg.ControlLen == 0 && msg.NameLen == 0 {
+		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+		if err != nil {
+			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+		}
+		// Control messages arrived but the caller supplied no buffer for
+		// them: report truncation and release them.
+		if !cms.Unix.Empty() {
+			mflags |= linux.MSG_CTRUNC
+			cms.Release()
+		}
+
+		if int(msg.Flags) != mflags {
+			// Copy out the flags to the caller.
+			if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+				return 0, err
+			}
+		}
+
+		return uintptr(n), nil
+	}
+
+	// Put an upper bound on the control buffer that is allocated below.
+	if msg.ControlLen > maxControlLen {
+		return 0, syserror.ENOBUFS
+	}
+	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	defer cms.Release()
+
+	// Pack the control messages into a buffer whose capacity is the
+	// caller-provided control length.
+	controlData := make([]byte, 0, msg.ControlLen)
+	controlData = control.PackControlMessages(t, cms, controlData)
+
+	// Credentials are only packed for sockets that have Passcred enabled.
+	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+	}
+
+	if cms.Unix.Rights != nil {
+		controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+	}
+
+	// Copy the address to the caller.
+	if msg.NameLen != 0 {
+		if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the control data to the caller.
+	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+		return 0, err
+	}
+	if len(controlData) > 0 {
+		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy out the flags to the caller.
+	if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+		return 0, err
+	}
+
+	return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+//
+// Data is received into the single user buffer at bufPtr. The sender address
+// is requested from the socket, and written back through
+// namePtr/nameLenPtr, only when nameLenPtr is non-zero. The byte count
+// reported by the socket's RecvMsg is returned.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+	// Reject lengths that do not fit in a non-negative int.
+	if int(bufLen) < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// A positive receive timeout becomes a monotonic-clock deadline; a
+	// negative one makes the receive non-blocking.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.RecvTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+	// Control messages are never delivered by this call, so they are
+	// released immediately, before the error is inspected.
+	cm.Release()
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+
+	// Copy the address to the caller.
+	if nameLenPtr != 0 {
+		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+			return 0, err
+		}
+	}
+
+	return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLenPtr := args[5].Pointer()
+
+	n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+	return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+	return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+//
+// Up to vlen messages are sent one at a time with sendSingleMsg, and the
+// number of bytes sent for each is written into its multiple-message header.
+// As on Linux, vlen is capped to UIO_MAXIOV. The number of messages sent is
+// returned; an error is only reported when no message was sent at all.
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Linux silently caps vlen to UIO_MAXIOV; without the cap a caller
+	// controls the iteration count of the loop below up to 2^32-1.
+	if vlen > linux.UIO_MAXIOV {
+		vlen = linux.UIO_MAXIOV
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+			break
+		}
+
+		// Copy the sent length to the caller; it is stored right after
+		// the embedded message header.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	// A failure after at least one message is reported as a short count.
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+// sendSingleMsg sends the message described by the 64-bit message header at
+// msgPtr on s and returns the byte count reported by the socket's SendMsg.
+//
+// Control data (at most maxControlLen bytes), the optional destination
+// address and the iovec array are all read from user memory before the send
+// is attempted. file is only passed on to handleIOError together with the
+// send result.
+func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+	// Capture the message header.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	var controlData []byte
+	if msg.ControlLen > 0 {
+		// Put an upper bound to prevent large allocations.
+		if msg.ControlLen > maxControlLen {
+			return 0, syserror.ENOBUFS
+		}
+		controlData = make([]byte, msg.ControlLen)
+		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	if msg.NameLen != 0 {
+		var err error
+		to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read data then call the sendmsg implementation.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// Parse the raw control bytes last, after every other input has been
+	// validated.
+	controlMessages, err := control.Parse(t, s, controlData)
+	if err != nil {
+		return 0, err
+	}
+
+	// A positive send timeout becomes a monotonic-clock deadline; a negative
+	// one makes the send non-blocking.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	// The parsed control messages are released here only when the send is
+	// reported as failed.
+	if err != nil {
+		controlMessages.Release()
+	}
+	return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+//
+// Data is sent from the single user buffer at bufPtr. A destination address
+// is read from user memory only when namePtr is non-zero. The byte count
+// reported by the socket's SendMsg is returned along with the error produced
+// by handleIOError.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+	// Reject lengths that do not fit in a non-negative int.
+	bl := int(bufLen)
+	if bl < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	var err error
+	if namePtr != 0 {
+		to, err = CaptureAddress(t, namePtr, nameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// A positive send timeout becomes a monotonic-clock deadline; a negative
+	// one makes the send non-blocking.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	// No control data comes from the caller here, so the control messages
+	// are built by control.New with a nil argument.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
+	return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLen := args[5].Uint()
+
+	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+	return n, nil, err
+}
+
+// LINT.ThenChange(./vfs2/socket.go)
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
new file mode 100644
index 000000000..77c78889d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -0,0 +1,337 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// doSplice implements a blocking splice operation.
+//
+// It calls fs.Splice repeatedly until opts.Length bytes have been moved, an
+// error other than ErrWouldBlock occurs, or -- when nonBlocking is set -- the
+// first ErrWouldBlock. opts.Length is clamped to kernel.MAX_RW_COUNT, and
+// negative lengths or offsets are rejected with EINVAL. The total number of
+// bytes moved is returned together with the last error.
+func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
+		return 0, syserror.EINVAL
+	}
+
+	if opts.Length > int64(kernel.MAX_RW_COUNT) {
+		opts.Length = int64(kernel.MAX_RW_COUNT)
+	}
+
+	var (
+		total int64
+		n     int64
+		err   error
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for opts.Length > 0 {
+		n, err = fs.Splice(t, outFile, inFile, opts)
+		opts.Length -= n
+		total += n
+
+		// opts is handed to fs.Splice by value, so explicit offsets have to
+		// be advanced here. Otherwise a retry after a partial transfer
+		// would start again from the original position.
+		if opts.SrcOffset {
+			opts.SrcStart += n
+		}
+		if opts.DstOffset {
+			opts.DstStart += n
+		}
+
+		// Only a would-block result on a blocking call is retried.
+		if err != syserror.ErrWouldBlock || nonBlocking {
+			break
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the splice operation.
+		if inFile.Readiness(EventMaskRead) == 0 {
+			if inCh == nil {
+				// Register once; the deferred unregister runs when doSplice
+				// returns.
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, EventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(inCh); err != nil {
+				break
+			}
+		}
+		if outFile.Readiness(EventMaskWrite) == 0 {
+			if outCh == nil {
+				// Register once; the deferred unregister runs when doSplice
+				// returns.
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, EventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(outCh); err != nil {
+				break
+			}
+		}
+	}
+
+	if total > 0 {
+		// On Linux, inotify behavior is not very consistent with splice(2). We try
+		// our best to emulate Linux for very basic calls to splice, where for some
+		// reason, events are generated for output files, but not input files.
+		outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
+	return total, err
+}
+
+// Sendfile implements linux system call sendfile(2).
+//
+// The input file must be readable and regular, and the output file must be
+// writable and not opened for append. When an offset pointer is supplied the
+// transfer starts at that offset and the updated offset is written back;
+// otherwise no explicit source offset is passed to doSplice. The number of
+// bytes transferred is returned.
+func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	outFD := args[0].Int()
+	inFD := args[1].Int()
+	offsetAddr := args[2].Pointer()
+	count := int64(args[3].SizeT())
+
+	// Get files.
+	inFile := t.GetFile(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+
+	if !inFile.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	outFile := t.GetFile(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	if !outFile.Flags().Write {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Verify that the outfile Append flag is not set.
+	if outFile.Flags().Append {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Verify that we have a regular infile. This is a requirement; the
+	// same check appears in Linux (fs/splice.c:splice_direct_to_actor).
+	if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var (
+		n   int64
+		err error
+	)
+	if offsetAddr != 0 {
+		// Verify that when offset address is not null, infile must be
+		// seekable. The fs.Splice routine itself validates basic read.
+		if !inFile.Flags().Pread {
+			return 0, nil, syserror.ESPIPE
+		}
+
+		// Copy in the offset.
+		var offset int64
+		if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+			return 0, nil, err
+		}
+
+		// Do the splice.
+		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
+			Length:    count,
+			SrcOffset: true,
+			SrcStart:  offset,
+		}, outFile.Flags().NonBlocking)
+
+		// Copy out the new offset. This happens whether or not doSplice
+		// reported an error; a failed copy-out overrides that error.
+		if _, err := t.CopyOut(offsetAddr, n+offset); err != nil {
+			return 0, nil, err
+		}
+	} else {
+		// Send data using splice.
+		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
+			Length: count,
+		}, outFile.Flags().NonBlocking)
+	}
+
+	// Sendfile can't lose any data because inFD is always a regular file.
+	if n != 0 {
+		err = nil
+	}
+
+	// We can only pass a single file to handleIOError, so pick inFile
+	// arbitrarily. This is used only for debugging purposes.
+	return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile)
+}
+
+// Splice implements splice(2).
+//
+// At least one of the two files must be a pipe. An offset pointer is only
+// accepted for the non-pipe side (ESPIPE otherwise) and requires that file
+// to support positional I/O (EINVAL otherwise). Splicing a pipe to itself
+// is rejected with EINVAL. The number of bytes moved is returned.
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	inOffset := args[1].Pointer()
+	outFD := args[2].Int()
+	outOffset := args[3].Pointer()
+	count := int64(args[4].SizeT())
+	flags := args[5].Int()
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get files.
+	outFile := t.GetFile(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	inFile := t.GetFile(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// Construct our options.
+	//
+	// Note that at least one of the underlying buffers must be a pipe. We
+	// don't actually have this constraint internally, but we enforce it
+	// for the semantics of the call.
+	opts := fs.SpliceOpts{
+		Length: count,
+	}
+	inFileAttr := inFile.Dirent.Inode.StableAttr
+	outFileAttr := outFile.Dirent.Inode.StableAttr
+	switch {
+	case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr):
+		// Pipe -> non-pipe: only the destination may carry an offset.
+		if inOffset != 0 {
+			return 0, nil, syserror.ESPIPE
+		}
+		if outOffset != 0 {
+			if !outFile.Flags().Pwrite {
+				return 0, nil, syserror.EINVAL
+			}
+
+			var offset int64
+			if _, err := t.CopyIn(outOffset, &offset); err != nil {
+				return 0, nil, err
+			}
+
+			// Use the destination offset.
+			opts.DstOffset = true
+			opts.DstStart = offset
+		}
+	case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
+		// Non-pipe -> pipe: only the source may carry an offset.
+		if outOffset != 0 {
+			return 0, nil, syserror.ESPIPE
+		}
+		if inOffset != 0 {
+			if !inFile.Flags().Pread {
+				return 0, nil, syserror.EINVAL
+			}
+
+			var offset int64
+			if _, err := t.CopyIn(inOffset, &offset); err != nil {
+				return 0, nil, err
+			}
+
+			// Use the source offset.
+			opts.SrcOffset = true
+			opts.SrcStart = offset
+		}
+	case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
+		// Pipe -> pipe: offsets are never valid.
+		if inOffset != 0 || outOffset != 0 {
+			return 0, nil, syserror.ESPIPE
+		}
+
+		// We may not refer to the same pipe; otherwise it's a continuous loop.
+		if inFileAttr.InodeID == outFileAttr.InodeID {
+			return 0, nil, syserror.EINVAL
+		}
+	default:
+		// Neither side is a pipe.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Splice data.
+	n, err := doSplice(t, outFile, inFile, opts, nonBlock)
+
+	// Special files can have additional requirements for granularity. For
+	// example, a read from an eventfd returns EINVAL if the size is less
+	// than 8 bytes. Inotify is another example: read will return EINVAL if
+	// the buffer is too small to return the next event, but the size of an
+	// event isn't fixed; it is sizeof(struct inotify_event) + {NAME_LEN} + 1.
+	if n != 0 && err != nil && (fs.IsAnonymous(inFileAttr) || fs.IsAnonymous(outFileAttr)) {
+		err = nil
+	}
+
+	// As in Sendfile, only one file can be passed to handleIOError; inFile
+	// is chosen arbitrarily here.
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
+}
+
+// Tee imlements tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	outFD := args[1].Int()
+	count := int64(args[2].SizeT())
+	flags := args[3].Int()
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get files.
+	outFile := t.GetFile(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	inFile := t.GetFile(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+
+	// All files must be pipes.
+	if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// We may not refer to the same pipe; see above.
+	if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// Splice data.
+	n, err := doSplice(t, outFile, inFile, fs.SpliceOpts{
+		Length: count,
+		Dup:    true,
+	}, nonBlock)
+
+	// Tee doesn't change a state of inFD, so it can't lose any data.
+	if n != 0 {
+		err = nil
+	}
+
+	// See above; inFile is chosen arbitrarily here.
+	return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
new file mode 100644
index 000000000..46ebf27a2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -0,0 +1,290 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// Stat implements linux syscall stat(2).
+//
+// The path is looked up relative to AT_FDCWD and the final component is
+// always resolved.
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, statAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Callback run by fileOpOn on the Dirent that the path names.
+	statDirent := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return stat(t, d, dirPath, statAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, statDirent)
+}
+
+// Fstatat implements linux syscall newfstatat, i.e. fstatat(2).
+func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	path, dirPath, err := copyInPath(t, addr, flags&linux.AT_EMPTY_PATH != 0)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if path == "" {
+		// Annoying. What's wrong with fstat?
+		file := t.GetFile(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		return 0, nil, fstat(t, file, statAddr)
+	}
+
+	// If the path ends in a slash (i.e. dirPath is true) or if AT_SYMLINK_NOFOLLOW is unset,
+	// then we must resolve the final component.
+	resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0
+
+	return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return stat(t, d, dirPath, statAddr)
+	})
+}
+
+// Lstat implements linux syscall lstat(2).
+//
+// The path is looked up relative to AT_FDCWD.
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, statAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The final component is resolved only when the path ends in a slash,
+	// which is exactly what dirPath reports.
+	statDirent := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return stat(t, d, dirPath, statAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, dirPath /* resolve */, statDirent)
+}
+
+// Fstat implements linux syscall fstat(2).
+//
+// EBADF is returned when the descriptor does not name an open file.
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFile(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	statAddr := args[1].Pointer()
+	return 0, nil, fstat(t, file, statAddr)
+}
+
+// stat implements stat from the given *fs.Dirent.
+//
+// When dirPath is set the Dirent must be a directory, otherwise ENOTDIR is
+// returned. The resulting structure is copied out to statAddr.
+func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) error {
+	if dirPath {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+	}
+
+	uattr, err := d.Inode.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+
+	out := statFromAttrs(t, d.Inode.StableAttr, uattr)
+	_, err = out.CopyOut(t, statAddr)
+	return err
+}
+
+// fstat implements fstat for the given *fs.File.
+//
+// Unstable attributes are read through the file, stable attributes through
+// its Dirent's Inode; the result is copied out to statAddr.
+func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
+	uattr, err := f.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+
+	out := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
+	_, err = out.CopyOut(t, statAddr)
+	return err
+}
+
+// Statx implements linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+	mask := args[3].Uint()
+	statxAddr := args[4].Pointer()
+
+	if mask&linux.STATX__RESERVED != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, dirPath, err := copyInPath(t, pathAddr, flags&linux.AT_EMPTY_PATH != 0)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if path == "" {
+		file := t.GetFile(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+		uattr, err := file.UnstableAttr(t)
+		if err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, statx(t, file.Dirent.Inode.StableAttr, uattr, statxAddr)
+	}
+
+	resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0
+
+	return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		uattr, err := d.Inode.UnstableAttr(t)
+		if err != nil {
+			return err
+		}
+		return statx(t, d.Inode.StableAttr, uattr, statxAddr)
+	})
+}
+
+// statx builds a linux.Statx from the given stable and unstable attributes
+// and copies it out to statxAddr.
+func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr usermem.Addr) error {
+	// "[T]he kernel may return fields that weren't requested and may fail to
+	// return fields that were requested, depending on what the backing
+	// filesystem supports.
+	// [...]
+	// A filesystem may also fill in fields that the caller didn't ask for
+	// if it has values for them available and the information is available
+	// at no extra cost. If this happens, the corresponding bits will be
+	// set in stx_mask." -- statx(2)
+	//
+	// Every value we have is filled in regardless of what the caller asked
+	// for. btime is not available yet (see b/135608823), which is why the
+	// mask is STATX_BASIC_STATS: all fields present except btime.
+
+	major, minor := linux.DecodeDeviceID(uint32(sattr.DeviceID))
+	out := linux.Statx{
+		// TODO(b/135608823): Support btime, and then change this to
+		// STATX_ALL to indicate presence of btime.
+		Mask: linux.STATX_BASIC_STATS,
+
+		// No attributes are reported, and none are supported.
+		Attributes:     0,
+		AttributesMask: 0,
+
+		// Identity and type.
+		Ino:       sattr.InodeID,
+		Mode:      uint16(sattr.Type.LinuxType()) | uint16(uattr.Perms.LinuxMode()),
+		Nlink:     uint32(uattr.Links),
+		DevMajor:  uint32(major),
+		DevMinor:  minor,
+		RdevMajor: uint32(sattr.DeviceFileMajor),
+		RdevMinor: sattr.DeviceFileMinor,
+
+		// Ownership, as seen from the task's user namespace.
+		UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+
+		// Sizes; Blocks is Usage expressed in units of 512.
+		Size:    uint64(uattr.Size),
+		Blksize: uint32(sattr.BlockSize),
+		Blocks:  uint64(uattr.Usage) / 512,
+
+		// Timestamps.
+		Atime: uattr.AccessTime.StatxTimestamp(),
+		Mtime: uattr.ModificationTime.StatxTimestamp(),
+		Ctime: uattr.StatusChangeTime.StatxTimestamp(),
+	}
+
+	_, err := t.CopyOut(statxAddr, &out)
+	return err
+}
+
+// Statfs implements linux syscall statfs(2).
+//
+// The path is looked up relative to AT_FDCWD with the final component
+// resolved.
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, statfsAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, _, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Callback run by fileOpOn on the Dirent that the path names.
+	statfsDirent := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return statfsImpl(t, d, statfsAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, statfsDirent)
+}
+
+// Fstatfs implements linux syscall fstatfs(2).
+//
+// EBADF is returned when the descriptor does not name an open file.
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFile(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	statfsAddr := args[1].Pointer()
+	return 0, nil, statfsImpl(t, file.Dirent, statfsAddr)
+}
+
+// statfsImpl backs both statfs and fstatfs: it queries d's Inode for
+// filesystem information, builds a linux.Statfs from it and copies that out
+// to addr. Any error from the query or from the copy-out is returned.
+func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
+	info, err := d.Inode.StatFS(t)
+	if err != nil {
+		return err
+	}
+
+	blockSize := d.Inode.StableAttr.BlockSize
+	out := linux.Statfs{
+		Type: info.Type,
+		// Block size and fragment size are reported as the same value,
+		// since most consumers of this structure expect one or the
+		// other to be filled in.
+		BlockSize:    blockSize,
+		FragmentSize: blockSize,
+		Blocks:       info.TotalBlocks,
+		// There is no concept of reserved blocks here, so free and
+		// available blocks are identical. This is normal for
+		// filesystems; see udf, hugetlbfs, tmpfs, among others.
+		BlocksFree:      info.FreeBlocks,
+		BlocksAvailable: info.FreeBlocks,
+		Files:           info.TotalFiles,
+		FilesFree:       info.FreeFiles,
+		// Same as Linux for simple_statfs, see fs/libfs.c.
+		NameLength: linux.NAME_MAX,
+		// All remaining fields stay 0, like simple_statfs does.
+	}
+
+	_, err = t.CopyOut(addr, &out)
+	return err
+}
+
+// LINT.ThenChange(vfs2/stat.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..0a04a6113
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_amd64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..5a3b1bfad
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uint32(uattr.Links),
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: int32(sattr.BlockSize),
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_arm64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
new file mode 100644
index 000000000..5ad465ae3
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -0,0 +1,141 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// LINT.IfChange
+
+// Sync implements linux system call sync(2).
+//
+// It invokes SyncAll on the task's mount namespace and always reports
+// success.
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mns := t.MountNamespace()
+	mns.SyncAll(t)
+
+	// sync(2) has no failure mode.
+	return 0, nil, nil
+}
+
+// Syncfs implements linux system call syncfs(2).
+//
+// The descriptor is only validated (EBADF if it is not open); the actual
+// work is delegated to Sync.
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFile(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// "Sync-the-world" is used for now; fd is guaranteed to be at least
+	// on the root filesystem.
+	return Sync(t, args)
+}
+
+// Fsync implements linux syscall fsync(2).
+//
+// The whole file (offset 0 through fs.FileMaxOffset) is synced with
+// fs.SyncAll. The resulting error is passed through syserror.ConvertIntr
+// with kernel.ERESTARTSYS.
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFile(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	syncErr := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll)
+	return 0, nil, syserror.ConvertIntr(syncErr, kernel.ERESTARTSYS)
+}
+
+// Fdatasync implements linux syscall fdatasync(2).
+//
+// At the moment this syncs the whole file (offset 0 through
+// fs.FileMaxOffset) with fs.SyncData, which is a big hammer, but correct.
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFile(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	syncErr := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData)
+	return 0, nil, syserror.ConvertIntr(syncErr, kernel.ERESTARTSYS)
+}
+
+// SyncFileRange implements linux syscall sync_file_range(2).
+//
+// Only SYNC_FILE_RANGE_WAIT_AFTER causes any work to be done here.
+// SYNC_FILE_RANGE_WAIT_BEFORE without WAIT_AFTER is reported as
+// unimplemented (ENOSYS), and SYNC_FILE_RANGE_WRITE on its own is accepted
+// but ignored.
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		fd     = args[0].Int()
+		offset = args[1].Int64()
+		nbytes = args[2].Int64()
+		uflags = args[3].Uint()
+	)
+
+	// Reject a negative offset, and a range whose end lies below its
+	// start (negative nbytes, or offset+nbytes wrapping around).
+	if offset < 0 || offset+nbytes < offset {
+		return 0, nil, syserror.EINVAL
+	}
+
+	const knownFlags = linux.SYNC_FILE_RANGE_WAIT_BEFORE |
+		linux.SYNC_FILE_RANGE_WRITE |
+		linux.SYNC_FILE_RANGE_WAIT_AFTER
+	if uflags&^knownFlags != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A zero length is widened to fs.FileMaxOffset.
+	//
+	// NOTE(review): nbytes is not read again below; the Fsync call always
+	// uses fs.FileMaxOffset as its end.
+	if nbytes == 0 {
+		nbytes = fs.FileMaxOffset
+	}
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	waitBefore := uflags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0
+	waitAfter := uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0
+
+	// SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the
+	// specified range that have already been submitted to the device
+	// driver for write-out before performing any write. That is not
+	// supported unless WAIT_AFTER is requested as well.
+	if waitBefore && !waitAfter {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+
+	// SYNC_FILE_RANGE_WRITE initiates write-out of all dirty pages in the
+	// specified range which are not presently submitted for write-out.
+	//
+	// It looks impossible to implement this functionality without a
+	// massive rework of the vfs subsystem. file.Fsync() takes a file lock
+	// for the entire operation, so even if it is running in a goroutine,
+	// it blocks other file operations instead of flushing data in the
+	// background.
+	//
+	// It should be safe to skip this flag while nobody uses
+	// SYNC_FILE_RANGE_WAIT_BEFORE.
+
+	// SYNC_FILE_RANGE_WAIT_AFTER waits upon write-out of all pages in the
+	// range after performing any write.
+	//
+	// In Linux, sync_file_range() doesn't write out the file's meta-data,
+	// but fdatasync() does if the file size has changed.
+	var err error
+	if waitAfter {
+		err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData)
+	}
+
+	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
new file mode 100644
index 000000000..297de052a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -0,0 +1,48 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+)
+
+// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo.
+//
+// Only Procs, Uptime, TotalRAM, FreeRAM and Unit are filled in; the
+// structure is copied out to the address given in the first argument.
+func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	k := t.Kernel()
+
+	// Refresh memory accounting before sampling it.
+	mf := k.MemoryFile()
+	mf.UpdateUsage()
+	_, used := usage.MemoryAccounting.Copy()
+	total := usage.TotalMemory(mf.TotalSize(), used)
+
+	free := total - used
+	if free > total {
+		// Underflow.
+		free = 0
+	}
+
+	// Only a subset of the fields in sysinfo_t make sense to return.
+	si := linux.Sysinfo{
+		Procs:    uint16(len(t.PIDNamespace().Tasks())),
+		Uptime:   k.MonotonicClock().Now().Seconds(),
+		TotalRAM: total,
+		FreeRAM:  free,
+		Unit:     1,
+	}
+	_, err := t.CopyOut(addr, si)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go
new file mode 100644
index 000000000..40c8bb061
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_syslog.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Syslog command values handled by Syslog; any other command yields ENOSYS.
+const (
+	// READ_ALL copies the log out to the caller's buffer; SIZE_BUFFER
+	// reports logBufLen.
+	_SYSLOG_ACTION_READ_ALL    = 3
+	_SYSLOG_ACTION_SIZE_BUFFER = 10
+)
+
+// logBufLen is the default syslog buffer size on Linux. It is both the value
+// reported for _SYSLOG_ACTION_SIZE_BUFFER and the upper bound applied to the
+// size requested with _SYSLOG_ACTION_READ_ALL.
+const logBufLen = 1 << 17
+
+// Syslog implements part of Linux syscall syslog.
+//
+// Only the unpriviledged commands are implemented, allowing applications to
+// read a fun dmesg.
+func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	command := args[0].Int()
+	buf := args[1].Pointer()
+	size := int(args[2].Int())
+
+	switch command {
+	case _SYSLOG_ACTION_READ_ALL:
+		if size < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if size > logBufLen {
+			size = logBufLen
+		}
+
+		log := t.Kernel().Syslog().Log()
+		if len(log) > size {
+			log = log[:size]
+		}
+
+		n, err := t.CopyOutBytes(buf, log)
+		return uintptr(n), nil, err
+	case _SYSLOG_ACTION_SIZE_BUFFER:
+		return logBufLen, nil, nil
+	default:
+		return 0, nil, syserror.ENOSYS
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
new file mode 100644
index 000000000..00915fdde
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -0,0 +1,769 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"path"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	// ExecMaxTotalSize is the maximum length of all argv and envv entries.
+	// execveat passes it to CopyInVector separately for argv and for envv.
+	//
+	// N.B. The behavior here is different than Linux. Linux provides a limit on
+	// individual arguments of 32 pages, and an aggregate limit of at least 32 pages
+	// but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
+	// any behavior based on the stack size, and instead provide a fixed hard-limit of
+	// 2 MB (which should work well given that 8 MB stack limits are common).
+	ExecMaxTotalSize = 2 * 1024 * 1024
+
+	// ExecMaxElemSize is the maximum length of a single argv or envv entry
+	// (32 pages).
+	ExecMaxElemSize = 32 * usermem.PageSize
+
+	// exitSignalMask selects the bits of the clone flags that carry the
+	// termination signal (see clone). Same as CSIGNAL in linux.
+	exitSignalMask = 0xff
+)
+
+// Getppid implements linux syscall getppid(2).
+//
+// A task without a parent reports 0. Otherwise the parent's thread group ID
+// is reported as seen from the caller's PID namespace.
+func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	if parent := t.Parent(); parent != nil {
+		ppid := t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())
+		return uintptr(ppid), nil, nil
+	}
+	return 0, nil, nil
+}
+
+// Getpid implements linux syscall getpid(2).
+//
+// The value returned is the ID of the caller's thread group.
+func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tgid := t.ThreadGroup().ID()
+	return uintptr(tgid), nil, nil
+}
+
+// Gettid implements linux syscall gettid(2).
+//
+// The value returned is the caller's thread ID.
+func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := t.ThreadID()
+	return uintptr(tid), nil, nil
+}
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	filenameAddr := args[0].Pointer()
+	argvAddr := args[1].Pointer()
+	envvAddr := args[2].Pointer()
+
+	return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+
+	return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+// execveat is the shared implementation of execve(2) and execveat(2).
+//
+// pathnameAddr, argvAddr and envvAddr are user addresses; a zero argvAddr or
+// envvAddr yields an empty vector. Only AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW
+// are accepted in flags (EINVAL otherwise), and an empty pathname without
+// AT_EMPTY_PATH is ENOENT.
+//
+// When dirFD is AT_FDCWD or the pathname is absolute, the task's working
+// directory is used for lookups. Otherwise dirFD must be open (EBADF) and is
+// either the executable itself (AT_EMPTY_PATH with an empty pathname) or the
+// directory that lookups start from (ENOTDIR if it is not a directory).
+//
+// On success the returned SyscallControl is the one produced by t.Execve.
+func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Copy in the argument and environment vectors; a null pointer leaves
+	// the corresponding vector empty.
+	var argv, envv []string
+	if argvAddr != 0 {
+		argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+	if envvAddr != 0 {
+		envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
+	if !atEmptyPath && len(pathname) == 0 {
+		return 0, nil, syserror.ENOENT
+	}
+	resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
+
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+
+	var wd *fs.Dirent
+	var executable fsbridge.File
+	var closeOnExec bool
+	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
+		// Even if the pathname is absolute, we may still need the wd
+		// for interpreter scripts if the path of the interpreter is
+		// relative.
+		wd = t.FSContext().WorkingDirectory()
+	} else {
+		// Need to extract the given FD.
+		f, fdFlags := t.FDTable().Get(dirFD)
+		if f == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer f.DecRef()
+		closeOnExec = fdFlags.CloseOnExec
+
+		if atEmptyPath && len(pathname) == 0 {
+			// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+			// not read. However, our backing filesystems may prevent us from reading
+			// the file without read permission. Additionally, a task with a
+			// non-readable executable has additional constraints on access via
+			// ptrace and procfs.
+			if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
+				return 0, nil, err
+			}
+			executable = fsbridge.NewFSFile(f)
+		} else {
+			// Validate before taking a reference: the deferred DecRef
+			// for wd is only registered further down, so returning
+			// ENOTDIR after IncRef would leak that reference.
+			if !fs.IsDir(f.Dirent.Inode.StableAttr) {
+				return 0, nil, syserror.ENOTDIR
+			}
+			wd = f.Dirent
+			wd.IncRef()
+		}
+	}
+	if wd != nil {
+		defer wd.DecRef()
+	}
+
+	// Load the new TaskContext.
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Opener:              fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        resolveFinal,
+		Filename:            pathname,
+		File:                executable,
+		CloseOnExec:         closeOnExec,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+	if se != nil {
+		return 0, nil, se.ToError()
+	}
+
+	ctrl, err := t.Execve(tc)
+	return 0, ctrl, err
+}
+
+// Exit implements linux syscall exit(2).
+//
+// The first argument becomes the exit code of the calling task, which is
+// then told to exit via kernel.CtrlDoExit.
+func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	es := kernel.ExitStatus{Code: int(args[0].Int())}
+	t.PrepareExit(es)
+	return 0, kernel.CtrlDoExit, nil
+}
+
+// ExitGroup implements linux syscall exit_group(2).
+//
+// The first argument becomes the exit code passed to PrepareGroupExit; the
+// caller is then told to exit via kernel.CtrlDoExit.
+func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	es := kernel.ExitStatus{Code: int(args[0].Int())}
+	t.PrepareGroupExit(es)
+	return 0, kernel.CtrlDoExit, nil
+}
+
+// clone is used by Clone, Fork, and VFork.
+//
+// It translates the bits of flags into a kernel.CloneOptions, invokes
+// t.Clone with it, and returns the ID that t.Clone reports together with its
+// SyscallControl and error. The low bits of flags selected by exitSignalMask
+// become the termination signal.
+func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) {
+	// set reports whether every bit of f is present in flags; unset
+	// reports whether none of them is.
+	set := func(f int) bool { return flags&f == f }
+	unset := func(f int) bool { return flags&f == 0 }
+
+	// Resources are shared when the corresponding CLONE_* bit is given, so
+	// a "New*" option for them is the absence of that bit. Namespaces are
+	// the other way round: CLONE_NEW* asks for a fresh one.
+	sharing := kernel.SharingOptions{
+		NewAddressSpace:     unset(linux.CLONE_VM),
+		NewSignalHandlers:   unset(linux.CLONE_SIGHAND),
+		NewThreadGroup:      unset(linux.CLONE_THREAD),
+		TerminationSignal:   linux.Signal(flags & exitSignalMask),
+		NewPIDNamespace:     set(linux.CLONE_NEWPID),
+		NewUserNamespace:    set(linux.CLONE_NEWUSER),
+		NewNetworkNamespace: set(linux.CLONE_NEWNET),
+		NewFiles:            unset(linux.CLONE_FILES),
+		NewFSContext:        unset(linux.CLONE_FS),
+		NewUTSNamespace:     set(linux.CLONE_NEWUTS),
+		NewIPCNamespace:     set(linux.CLONE_NEWIPC),
+	}
+
+	opts := kernel.CloneOptions{
+		SharingOptions: sharing,
+		Stack:          stack,
+		SetTLS:         set(linux.CLONE_SETTLS),
+		TLS:            tls,
+		ChildClearTID:  set(linux.CLONE_CHILD_CLEARTID),
+		ChildSetTID:    set(linux.CLONE_CHILD_SETTID),
+		ChildTID:       childTID,
+		ParentSetTID:   set(linux.CLONE_PARENT_SETTID),
+		ParentTID:      parentTID,
+		Vfork:          set(linux.CLONE_VFORK),
+		Untraced:       set(linux.CLONE_UNTRACED),
+		InheritTracer:  set(linux.CLONE_PTRACE),
+	}
+
+	ntid, ctrl, err := t.Clone(&opts)
+	return uintptr(ntid), ctrl, err
+}
+
+// Fork implements Linux syscall fork(2).
+//
+// No stack, TID or TLS addresses are supplied to clone.
+func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// "A call to fork() is equivalent to a call to clone(2) specifying flags
+	// as just SIGCHLD." - fork(2)
+	flags := int(linux.SIGCHLD)
+	return clone(t, flags, 0, 0, 0, 0)
+}
+
+// Vfork implements Linux syscall vfork(2).
+//
+// No stack, TID or TLS addresses are supplied to clone.
+func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// """
+	// A call to vfork() is equivalent to calling clone(2) with flags specified as:
+	//
+	//     CLONE_VM | CLONE_VFORK | SIGCHLD
+	// """ - vfork(2)
+	flags := linux.CLONE_VM | linux.CLONE_VFORK | int(linux.SIGCHLD)
+	return clone(t, flags, 0, 0, 0, 0)
+}
+
+// parseCommonWaitOptions applies the options common to wait4 and waitid to
+// wopts.
+//
+// WCLONE and WALL select which kinds of tasks are waited for; specifying both
+// is EINVAL, in which case only that error is produced and the remaining
+// options are not applied.
+func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
+	switch sel := options & (linux.WCLONE | linux.WALL); sel {
+	case 0:
+		wopts.NonCloneTasks = true
+	case linux.WCLONE:
+		wopts.CloneTasks = true
+	case linux.WALL:
+		wopts.NonCloneTasks = true
+		wopts.CloneTasks = true
+	default:
+		return syserror.EINVAL
+	}
+
+	// WCONTINUED adds continue events to the set being waited for.
+	if options&linux.WCONTINUED != 0 {
+		wopts.Events |= kernel.EventGroupContinue
+	}
+	// Without WNOHANG the wait blocks, and an interrupted wait is
+	// reported as ERESTARTSYS.
+	if options&linux.WNOHANG == 0 {
+		wopts.BlockInterruptErr = kernel.ERESTARTSYS
+	}
+	// Without WNOTHREAD, sibling children are included.
+	if options&linux.WNOTHREAD == 0 {
+		wopts.SiblingChildren = true
+	}
+	return nil
+}
+
+// wait4 waits for the given child process to exit.
+func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) {
+	if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
+		return 0, syserror.EINVAL
+	}
+	wopts := kernel.WaitOptions{
+		Events:       kernel.EventExit | kernel.EventTraceeStop,
+		ConsumeEvent: true,
+	}
+	// There are four cases to consider:
+	//
+	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
+	// pid == -1   any child process
+	// pid == 0    any child process whose process group ID is equal to that of the calling process
+	// pid > 0     the child whose process ID is equal to the value of pid
+	switch {
+	case pid < -1:
+		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
+	case pid == -1:
+		// Any process is the default.
+	case pid == 0:
+		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
+	default:
+		wopts.SpecificTID = kernel.ThreadID(pid)
+	}
+
+	if err := parseCommonWaitOptions(&wopts, options); err != nil {
+		return 0, err
+	}
+	if options&linux.WUNTRACED != 0 {
+		wopts.Events |= kernel.EventChildGroupStop
+	}
+
+	wr, err := t.Wait(&wopts)
+	if err != nil {
+		if err == kernel.ErrNoWaitableEvent {
+			return 0, nil
+		}
+		return 0, err
+	}
+	if statusAddr != 0 {
+		if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
+			return 0, err
+		}
+	}
+	if rusageAddr != 0 {
+		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
+		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+			return 0, err
+		}
+	}
+	return uintptr(wr.TID), nil
+}
+
+// Wait4 implements linux syscall wait4(2).
+//
+// The arguments are, in order: pid, status address, options, rusage address.
+func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid, options := int(args[0].Int()), int(args[2].Uint())
+	statusAddr, rusageAddr := args[1].Pointer(), args[3].Pointer()
+
+	tid, err := wait4(t, pid, statusAddr, options, rusageAddr)
+	return tid, nil, err
+}
+
+// WaitPid implements linux syscall waitpid(2).
+//
+// It has no rusage argument, so wait4 is called with a null rusage address.
+func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid, options := int(args[0].Int()), int(args[2].Uint())
+
+	tid, err := wait4(t, pid, args[1].Pointer(), options, 0)
+	return tid, nil, err
+}
+
+// Waitid implements linux syscall waitid(2).
+func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	idtype := args[0].Int()
+	id := args[1].Int()
+	infop := args[2].Pointer()
+	options := int(args[3].Uint())
+	rusageAddr := args[4].Pointer()
+	// Reject unknown option bits; at least one event type must be requested.
+	if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	wopts := kernel.WaitOptions{
+		Events:       kernel.EventTraceeStop,
+		ConsumeEvent: options&linux.WNOWAIT == 0,
+	}
+	switch idtype {
+	case linux.P_ALL:
+	case linux.P_PID:
+		wopts.SpecificTID = kernel.ThreadID(id)
+	case linux.P_PGID:
+		wopts.SpecificPGID = kernel.ProcessGroupID(id)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+	if err := parseCommonWaitOptions(&wopts, options); err != nil {
+		return 0, nil, err
+	}
+	if options&linux.WEXITED != 0 {
+		wopts.Events |= kernel.EventExit
+	}
+	if options&linux.WSTOPPED != 0 {
+		wopts.Events |= kernel.EventChildGroupStop
+	}
+
+	wr, err := t.Wait(&wopts)
+	if err != nil {
+		if err == kernel.ErrNoWaitableEvent {
+			err = nil
+			// "If WNOHANG was specified in options and there were no children
+			// in a waitable state, then waitid() returns 0 immediately and the
+			// state of the siginfo_t structure pointed to by infop is
+			// unspecified." - waitid(2). But Linux's waitid actually zeroes
+			// out the fields it would set for a successful waitid in this case
+			// as well.
+			if infop != 0 {
+				var si arch.SignalInfo
+				_, err = t.CopyOut(infop, &si)
+			}
+		}
+		return 0, nil, err
+	}
+	if rusageAddr != 0 {
+		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
+		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+			return 0, nil, err
+		}
+	}
+	if infop == 0 {
+		return 0, nil, nil
+	}
+	si := arch.SignalInfo{
+		Signo: int32(linux.SIGCHLD),
+	}
+	si.SetPid(int32(wr.TID))
+	si.SetUid(int32(wr.UID))
+	// TODO(b/73541790): convert kernel.ExitStatus to functions and make
+	// WaitResult.Status a linux.WaitStatus.
+	s := syscall.WaitStatus(wr.Status)
+	switch {
+	case s.Exited():
+		si.Code = arch.CLD_EXITED
+		si.SetStatus(int32(s.ExitStatus()))
+	case s.CoreDump():
+		// Checked before Signaled(): CoreDump() implies Signaled().
+		si.Code = arch.CLD_DUMPED
+		si.SetStatus(int32(s.Signal()))
+	case s.Signaled():
+		si.Code = arch.CLD_KILLED
+		si.SetStatus(int32(s.Signal()))
+	case s.Stopped():
+		if wr.Event == kernel.EventTraceeStop {
+			si.Code = arch.CLD_TRAPPED
+			si.SetStatus(int32(s.TrapCause()))
+		} else {
+			si.Code = arch.CLD_STOPPED
+			si.SetStatus(int32(s.StopSignal()))
+		}
+	case s.Continued():
+		si.Code = arch.CLD_CONTINUED
+		si.SetStatus(int32(linux.SIGCONT))
+	default:
+		t.Warningf("waitid got incomprehensible wait status %d", s)
+	}
+	_, err = t.CopyOut(infop, &si)
+	return 0, nil, err
+}
+
+// SetTidAddress implements linux syscall set_tid_address(2).
+//
+// The call cannot fail; its result is always the calling task's thread ID.
+func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Hand the user-supplied address to the task via SetClearTID.
+	t.SetClearTID(args[0].Pointer())
+	return uintptr(t.ThreadID()), nil, nil
+}
+
+// Unshare implements linux syscall unshare(2).
+func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	opts := kernel.SharingOptions{
+		NewAddressSpace:     flags&linux.CLONE_VM == linux.CLONE_VM,
+		NewSignalHandlers:   flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND,
+		NewThreadGroup:      flags&linux.CLONE_THREAD == linux.CLONE_THREAD,
+		NewPIDNamespace:     flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
+		NewUserNamespace:    flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
+		NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
+		NewFiles:            flags&linux.CLONE_FILES == linux.CLONE_FILES,
+		NewFSContext:        flags&linux.CLONE_FS == linux.CLONE_FS,
+		NewUTSNamespace:     flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
+		NewIPCNamespace:     flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
+	}
+	// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
+	if opts.NewPIDNamespace {
+		opts.NewThreadGroup = true
+	}
+	// "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
+	// Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
+	if opts.NewUserNamespace {
+		opts.NewThreadGroup = true
+		opts.NewFSContext = true
+	}
+	return 0, nil, t.Unshare(&opts)
+}
+
+// SchedYield implements linux syscall sched_yield(2); it takes no arguments and always returns 0.
+func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	t.Yield() // The yielding itself is delegated to the task.
+	return 0, nil, nil
+}
+
+// SchedSetaffinity implements linux syscall sched_setaffinity(2).
+func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := args[0].Int()
+	size := args[1].SizeT()
+	maskAddr := args[2].Pointer()
+
+	var task *kernel.Task
+	if tid == 0 {
+		task = t
+	} else {
+		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
+		if task == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
+	if size > mask.Size() {
+		size = mask.Size()
+	}
+	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, task.SetCPUMask(mask)
+}
+
+// SchedGetaffinity implements linux syscall sched_getaffinity(2).
+func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := args[0].Int()
+	size := args[1].SizeT()
+	maskAddr := args[2].Pointer()
+
+	// This limitation is because linux stores the cpumask
+	// in an array of "unsigned long" so the buffer needs to
+	// be a multiple of the word size.
+	if size&(t.Arch().Width()-1) > 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var task *kernel.Task
+	if tid == 0 {
+		task = t
+	} else {
+		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
+		if task == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	mask := task.CPUMask()
+	// The buffer needs to be big enough to hold a cpumask with
+	// all possible cpus.
+	if size < mask.Size() {
+		return 0, nil, syserror.EINVAL
+	}
+	_, err := t.CopyOutBytes(maskAddr, mask)
+
+	// NOTE: The syscall interface is slightly different than the glibc
+	// interface. The raw sched_getaffinity syscall returns the number of
+	// bytes used to represent a cpu mask.
+	return uintptr(mask.Size()), nil, err
+}
+
+// Getcpu implements linux syscall getcpu(2).
+func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	cpuAddr := args[0].Pointer()
+	nodeAddr := args[1].Pointer()
+	// The third argument to this system call is nowadays unused.
+
+	// Both output pointers are optional; a null pointer is skipped.
+	if cpuAddr != 0 {
+		scratch := t.CopyScratchBuffer(4)
+		usermem.ByteOrder.PutUint32(scratch, uint32(t.CPU()))
+		if _, err := t.CopyOutBytes(cpuAddr, scratch); err != nil {
+			return 0, nil, err
+		}
+	}
+	if nodeAddr != 0 {
+		// We always return node 0, written as four zero bytes.
+		zeroOpts := usermem.IOOpts{AddressSpaceActive: true}
+		if _, err := t.MemoryManager().ZeroOut(t, nodeAddr, 4, zeroOpts); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Setpgid implements the linux syscall setpgid(2).
+func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Note that throughout this function, pgid is interpreted with respect
+	// to t's namespace, not with respect to the selected ThreadGroup's
+	// namespace (which may be different).
+	pid := kernel.ThreadID(args[0].Int())
+	pgid := kernel.ProcessGroupID(args[1].Int())
+
+	// "If pid is zero, then the process ID of the calling process is used."
+	tg := t.ThreadGroup()
+	if pid != 0 {
+		ot := t.PIDNamespace().TaskWithID(pid)
+		if ot == nil {
+			return 0, nil, syserror.ESRCH
+		}
+		tg = ot.ThreadGroup()
+		if tg.Leader() != ot { // pid must name a thread group leader.
+			return 0, nil, syserror.EINVAL
+		}
+		// Setpgid only operates on the caller's own thread group or on
+		// child threadgroups (those whose leader's parent is in it).
+		if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	// "If pgid is zero, then the PGID of the process specified by pid is made
+	// the same as its process ID."
+	defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
+	if pgid == 0 {
+		pgid = defaultPGID
+	} else if pgid < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// If the pgid is the same as the group, then create a new one. Otherwise,
+	// we attempt to join an existing process group.
+	if pgid == defaultPGID {
+		// For convenience, errors line up with Linux syscall API.
+		if err := tg.CreateProcessGroup(); err != nil {
+			// Is the process group already as expected? If so,
+			// just return success. This is the same behavior as
+			// Linux.
+			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
+				return 0, nil, nil
+			}
+			return 0, nil, err
+		}
+	} else {
+		// As with CreateProcessGroup above, errors line up with Linux.
+		if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
+			// Already in the requested group: success, as above.
+			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
+				return 0, nil, nil
+			}
+			return 0, nil, err
+		}
+	}
+
+	// Success.
+	return 0, nil, nil
+}
+
+// Getpgrp implements the linux syscall getpgrp(2), returning t's process group ID in t's PID namespace.
+func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
+}
+
+// Getpgid implements the linux syscall getpgid(2).
+//
+// A zero ID refers to the caller and is handled by Getpgrp.
+func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := kernel.ThreadID(args[0].Int())
+	if tid == 0 {
+		return Getpgrp(t, args)
+	}
+	pidns := t.PIDNamespace()
+	if target := pidns.TaskWithID(tid); target != nil {
+		return uintptr(pidns.IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
+	}
+	return 0, nil, syserror.ESRCH
+}
+
+// Setsid implements the linux syscall setsid(2) by calling CreateSession on t's thread group.
+func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.ThreadGroup().CreateSession() // NOTE(review): setsid(2) documents the new session ID as the result; this returns 0 - confirm.
+}
+
+// Getsid implements the linux syscall getsid(2).
+func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pidns := t.PIDNamespace()
+
+	// A zero ID selects the calling task; any other ID is looked up in the
+	// caller's PID namespace.
+	target := t
+	if tid := kernel.ThreadID(args[0].Int()); tid != 0 {
+		if target = pidns.TaskWithID(tid); target == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+	return uintptr(pidns.IDOfSession(target.ThreadGroup().Session())), nil, nil
+}
+
+// Getpriority pretends to implement the linux syscall getpriority(2).
+//
+// This is a stub; real priorities require a full scheduler.
+func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := args[0].Int()
+	who := kernel.ThreadID(args[1].Int())
+
+	switch which {
+	case linux.PRIO_PROCESS:
+		// Look for who, return ESRCH if not found.
+		var task *kernel.Task
+		if who == 0 {
+			task = t
+		} else {
+			task = t.PIDNamespace().TaskWithID(who)
+		}
+
+		if task == nil {
+			return 0, nil, syserror.ESRCH
+		}
+
+		// From kernel/sys.c:getpriority:
+		// "To avoid negative return values, 'getpriority()'
+		// will not return the normal nice-value, but a negated
+		// value that has been offset by 20"
+		return uintptr(20 - task.Niceness()), nil, nil
+	case linux.PRIO_USER:
+		fallthrough
+	case linux.PRIO_PGRP:
+		// PRIO_USER and PRIO_PGRP have no further implementation yet.
+		return 0, nil, nil
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// Setpriority pretends to implement the linux syscall setpriority(2).
+//
+// This is a stub; real priorities require a full scheduler.
+func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	const (
+		minNice = -20
+		maxNice = 19
+	)
+	which := args[0].Int()
+	who := kernel.ThreadID(args[1].Int())
+
+	// In the kernel's implementation, values outside the range of
+	// [-20, 19] are truncated to these minimum and maximum values.
+	niceval := int(args[2].Int())
+	switch {
+	case niceval < minNice:
+		niceval = minNice
+	case niceval > maxNice:
+		niceval = maxNice
+	}
+
+	switch which {
+	case linux.PRIO_USER, linux.PRIO_PGRP:
+		// PRIO_USER and PRIO_PGRP have no further implementation yet.
+		return 0, nil, nil
+	case linux.PRIO_PROCESS:
+		// Handled below.
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Look for who, return ESRCH if not found. A who of zero names the
+	// calling task.
+	task := t
+	if who != 0 {
+		if task = t.PIDNamespace().TaskWithID(who); task == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	task.SetNiceness(niceval)
+	return 0, nil, nil
+}
+
+// Ptrace implements linux system call ptrace(2).
+//
+// The request, target thread ID, addr and data arguments are decoded and
+// forwarded to t.Ptrace; the syscall's return value is always 0.
+func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	request, target := args[0].Int64(), kernel.ThreadID(args[1].Int())
+	err := t.Ptrace(request, target, args[2].Pointer(), args[3].Pointer())
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
new file mode 100644
index 000000000..2d2aa0819
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -0,0 +1,342 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// pidOfClockID decodes c's most significant 29 bits, which hold either a pid or a file descriptor.
+func pidOfClockID(c int32) kernel.ThreadID {
+	return kernel.ThreadID(^(c >> 3)) // Drop the low 3 bits, then invert.
+}
+
+// whichCPUClock returns one of CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or
+// CLOCK_FD, i.e. the bits of c selected by CPUCLOCK_CLOCK_MASK.
+func whichCPUClock(c int32) int32 {
+	return c & linux.CPUCLOCK_CLOCK_MASK
+}
+
+// isCPUClockPerThread reports whether the CPUCLOCK_PERTHREAD bit is set in
+// the clock id c.
+func isCPUClockPerThread(c int32) bool {
+	return c&linux.CPUCLOCK_PERTHREAD_MASK != 0
+}
+
+// isValidCPUClock reports whether c is a well-formed cpu clock id.
+//
+// An id is rejected when bits 0, 1 and 2 are all set, or when the clock type
+// selected by whichCPUClock is not below linux.CPUCLOCK_MAX.
+func isValidCPUClock(c int32) bool {
+	switch {
+	case c&7 == 7, whichCPUClock(c) >= linux.CPUCLOCK_MAX:
+		return false
+	}
+	return true
+}
+
+// targetTask resolves the task that clock id c refers to: t itself when the
+// encoded pid is zero, otherwise the lookup result in t's PID namespace.
+func targetTask(t *kernel.Task, c int32) *kernel.Task {
+	if pid := pidOfClockID(c); pid != 0 {
+		return t.PIDNamespace().TaskWithID(pid)
+	}
+	return t
+}
+
+// ClockGetres implements linux syscall clock_getres(2).
+//
+// Every clock that getClock accepts reports a resolution of one nanosecond.
+func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := int32(args[0].Int())
+	addr := args[1].Pointer()
+
+	// Any failure to resolve the clock is reported as EINVAL.
+	if _, err := getClock(t, clockID); err != nil {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A null result pointer is allowed; there is nothing to copy out.
+	if addr == 0 {
+		return 0, nil, nil
+	}
+
+	res := linux.Timespec{Nsec: 1}
+	return 0, nil, copyTimespecOut(t, addr, &res)
+}
+
+type cpuClocker interface { // The pair of CPU clocks that getClock chooses between.
+	UserCPUClock() ktime.Clock // Returned for CPUCLOCK_VIRT.
+	CPUClock() ktime.Clock     // Returned for CPUCLOCK_PROF and CPUCLOCK_SCHED.
+}
+
+// getClock returns the clock that clockID selects for task t, or EINVAL if
+// the id is malformed, unknown, or names a task that cannot be found.
+func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) {
+	if clockID >= 0 {
+		switch clockID {
+		case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE:
+			return t.Kernel().RealtimeClock(), nil
+		case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE,
+			linux.CLOCK_MONOTONIC_RAW, linux.CLOCK_BOOTTIME:
+			// CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW. It also stands
+			// in for CLOCK_BOOTTIME (CLOCK_MONOTONIC plus suspend time): gVisor
+			// has no concept of suspend/resume, and CLOCK_MONOTONIC already
+			// includes save/restore time, which is the closest to suspend time.
+			return t.Kernel().MonotonicClock(), nil
+		case linux.CLOCK_PROCESS_CPUTIME_ID:
+			return t.ThreadGroup().CPUClock(), nil
+		case linux.CLOCK_THREAD_CPUTIME_ID:
+			return t.CPUClock(), nil
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+
+	// Negative ids are CPU clock ids that encode a target task.
+	if !isValidCPUClock(clockID) {
+		return nil, syserror.EINVAL
+	}
+	task := targetTask(t, clockID)
+	if task == nil {
+		return nil, syserror.EINVAL
+	}
+
+	// Per-thread ids use the task's own clocks, others its thread group's.
+	var clocks cpuClocker
+	if isCPUClockPerThread(clockID) {
+		clocks = task
+	} else {
+		clocks = task.ThreadGroup()
+	}
+
+	switch whichCPUClock(clockID) {
+	case linux.CPUCLOCK_VIRT:
+		return clocks.UserCPUClock(), nil
+	case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED:
+		// CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF.
+		return clocks.CPUClock(), nil
+	default:
+		return nil, syserror.EINVAL
+	}
+}
+
+// ClockGettime implements linux syscall clock_gettime(2).
+//
+// It samples the clock named by the first argument and copies the reading
+// out as a timespec to the address in the second.
+func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clock, err := getClock(t, int32(args[0].Int()))
+	if err != nil {
+		return 0, nil, err
+	}
+	now := clock.Now().Timespec()
+	return 0, nil, copyTimespecOut(t, args[1].Pointer(), &now)
+}
+
+// ClockSettime implements linux syscall clock_settime(2); it unconditionally fails with EPERM.
+func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.EPERM
+}
+
+// Time implements linux syscall time(2).
+//
+// The current realtime clock reading is returned and, when the argument is
+// a non-null pointer, also copied out to that address.
+func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	now := t.Kernel().RealtimeClock().Now().TimeT()
+
+	if addr := args[0].Pointer(); addr != usermem.Addr(0) {
+		if _, err := t.CopyOut(addr, now); err != nil {
+			return 0, nil, err
+		}
+	}
+	return uintptr(now), nil, nil
+}
+
+// clockNanosleepRestartBlock encapsulates the state required to restart
+// clock_nanosleep(2) via restart_syscall(2).
+//
+// +stateify savable
+type clockNanosleepRestartBlock struct {
+	c        ktime.Clock   // clock handed back to clockNanosleepFor on restart
+	duration time.Duration // duration still to be slept on restart
+	rem      usermem.Addr  // user address for the remaining time; may be 0
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart by calling clockNanosleepFor with the saved state.
+func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return 0, clockNanosleepFor(t, n.c, n.duration, n.rem)
+}
+
+// clockNanosleepUntil blocks until a specified time.
+//
+// If blocking is interrupted, the syscall is restarted with the original
+// arguments.
+func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error {
+	notifier, tchan := ktime.NewChannelNotifier()
+	timer := ktime.NewTimer(c, notifier)
+
+	// Enable the timer with a zero period and the absolute time ts as its
+	// next expiration.
+	timer.Swap(ktime.Setting{
+		Enabled: true,
+		Next:    ktime.FromTimespec(ts),
+		Period:  0,
+	})
+
+	blockErr := t.BlockWithTimer(nil, tchan)
+	timer.Destroy()
+
+	// ETIMEDOUT means we blocked until the timeout happened, which is the
+	// successful outcome here; anything else goes through ConvertIntr.
+	if blockErr != syserror.ETIMEDOUT {
+		return syserror.ConvertIntr(blockErr, kernel.ERESTARTNOHAND)
+	}
+	return nil
+}
+
+// clockNanosleepFor blocks for a specified duration.
+//
+// If blocking is interrupted, the syscall is restarted with the remaining
+// duration timeout.
+func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem usermem.Addr) error {
+	timer, start, tchan := ktime.After(c, dur)
+
+	err := t.BlockWithTimer(nil, tchan)
+	// Read the clock before the timer is torn down.
+	after := c.Now()
+
+	timer.Destroy()
+
+	switch err {
+	case syserror.ETIMEDOUT:
+		// Slept for entire timeout.
+		return nil
+	case syserror.ErrInterrupted:
+		// Interrupted: work out how much of dur is left, never negative.
+		remaining := dur - after.Sub(start)
+		if remaining < 0 {
+			remaining = time.Duration(0)
+		}
+
+		// Copy out remaining time, if the caller gave an address for it.
+		if rem != 0 {
+			timeleft := linux.NsecToTimespec(remaining.Nanoseconds())
+			if err := copyTimespecOut(t, rem, &timeleft); err != nil {
+				return err
+			}
+		}
+
+		// Arrange for a restart with the remaining duration.
+		t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{
+			c:        c,
+			duration: remaining,
+			rem:      rem,
+		})
+		return kernel.ERESTART_RESTARTBLOCK
+	default:
+		panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err))
+	}
+}
+
+// Nanosleep implements linux syscall nanosleep(2).
+func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	reqAddr := args[0].Pointer()
+	remAddr := args[1].Pointer()
+
+	req, err := copyTimespecIn(t, reqAddr)
+	switch {
+	case err != nil:
+		return 0, nil, err
+	case !req.Valid():
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Just like linux, we cap the timeout with the max number that int64 can
+	// represent which is roughly 292 years.
+	sleepFor := time.Duration(req.ToNsecCapped()) * time.Nanosecond
+	// The sleep is measured against the kernel's monotonic clock.
+	return 0, nil, clockNanosleepFor(t, t.Kernel().MonotonicClock(), sleepFor, remAddr)
+}
+
+// ClockNanosleep implements linux syscall clock_nanosleep(2).
+func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := int32(args[0].Int())
+	flags := args[1].Int()
+	addr := args[2].Pointer()
+	rem := args[3].Pointer()
+
+	req, err := copyTimespecIn(t, addr)
+	switch {
+	case err != nil:
+		return 0, nil, err
+	case !req.Valid():
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Only allow clock constants also allowed by Linux.
+	if clockID > 0 {
+		switch clockID {
+		case linux.CLOCK_REALTIME, linux.CLOCK_MONOTONIC, linux.CLOCK_PROCESS_CPUTIME_ID:
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	clock, err := getClock(t, clockID)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// With TIMER_ABSTIME the request is an absolute time on the clock;
+	// otherwise it is a duration, capped by ToNsecCapped.
+	if flags&linux.TIMER_ABSTIME == 0 {
+		sleepFor := time.Duration(req.ToNsecCapped()) * time.Nanosecond
+		return 0, nil, clockNanosleepFor(t, clock, sleepFor, rem)
+	}
+	return 0, nil, clockNanosleepUntil(t, clock, req)
+}
+
+// Gettimeofday implements linux syscall gettimeofday(2).
+func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tvAddr := args[0].Pointer()
+	tzAddr := args[1].Pointer()
+
+	if tvAddr != usermem.Addr(0) {
+		now := t.Kernel().RealtimeClock().Now().Timeval()
+		if err := copyTimevalOut(t, tvAddr, &now); err != nil {
+			return 0, nil, err
+		}
+	}
+	if tzAddr == usermem.Addr(0) {
+		return 0, nil, nil
+	}
+
+	// Ask the time package for the timezone offset and lay it out as an
+	// int32 pair that mimics linux's struct timezone.
+	_, offsetSecs := time.Now().Zone()
+	tz := [2]int32{-int32(offsetSecs) / 60, 0}
+	_, err := t.CopyOut(tzAddr, tz)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
new file mode 100644
index 000000000..a4c400f87
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -0,0 +1,203 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const nsecPerSec = int64(time.Second) // Nanoseconds per second, as an int64.
+
+// copyItimerValIn copies an ItimerVal from the untrusted app range to the
+// kernel. Only the native 8-byte-wide layout is handled; any other
+// architecture width yields ENOSYS.
+//
+// A NULL address is allowed because Linux allows
+// setitimer(which, NULL, &old_value) which disables the timer.
+// There is a KERN_WARN message saying this misfeature will be removed.
+// However, that hasn't happened as of 3.19, so we continue to support it.
+func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) {
+	var itv linux.ItimerVal
+	if addr == usermem.Addr(0) {
+		// NULL: hand back the zero value without touching user memory.
+		return itv, nil
+	}
+	if t.Arch().Width() != 8 {
+		return itv, syserror.ENOSYS
+	}
+
+	// Native size, just copy directly.
+	if _, err := t.CopyIn(addr, &itv); err != nil {
+		return linux.ItimerVal{}, err
+	}
+	return itv, nil
+}
+
+// copyItimerValOut copies an ItimerVal to the untrusted app range.
+//
+// A NULL address is allowed, in which case no copy takes place. Only the
+// native 8-byte-wide layout is handled; any other architecture width yields
+// ENOSYS.
+func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error {
+	switch {
+	case addr == usermem.Addr(0):
+		return nil
+	case t.Arch().Width() != 8:
+		return syserror.ENOSYS
+	}
+
+	// Native size, just copy directly.
+	_, err := t.CopyOut(addr, itv)
+	return err
+}
+
+// Getitimer implements linux syscall getitimer(2).
+//
+// The first argument selects the timer and the second is the user address
+// that receives its current value.
+func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	current, err := t.Getitimer(args[0].Int())
+	if err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, copyItimerValOut(t, args[1].Pointer(), &current)
+}
+
+// Setitimer implements linux syscall setitimer(2).
+//
+// Arguments are the timer selector, the address of the new value and the
+// address that receives the previous value.
+func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := args[0].Int()
+	next, err := copyItimerValIn(t, args[1].Pointer())
+	if err != nil {
+		return 0, nil, err
+	}
+	prev, err := t.Setitimer(which, next)
+	if err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, copyItimerValOut(t, args[2].Pointer(), &prev)
+}
+
+// Alarm implements linux syscall alarm(2).
+func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	duration := time.Duration(args[0].Uint()) * time.Second
+
+	olditv, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{
+		Value: linux.DurationToTimeval(duration),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	olddur := olditv.Value.ToDuration()
+	secs := olddur.Round(time.Second).Nanoseconds() / nsecPerSec
+	if secs == 0 && olddur != 0 {
+		// We can't return 0 if an alarm was previously scheduled.
+		secs = 1
+	}
+	return uintptr(secs), nil, nil
+}
+
+// TimerCreate implements linux syscall timer_create(2).
+func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sevAddr, idAddr := args[1].Pointer(), args[2].Pointer()
+
+	clock, err := getClock(t, args[0].Int())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The sigevent is optional; a null pointer leaves sev nil.
+	var sev *linux.Sigevent
+	if sevAddr != 0 {
+		sev = new(linux.Sigevent)
+		if _, err := t.CopyIn(sevAddr, sev); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	id, err := t.IntervalTimerCreate(clock, sev)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// If the new ID cannot be written back to the caller, delete the timer
+	// again before reporting the copy error.
+	if _, err := t.CopyOut(idAddr, &id); err != nil {
+		t.IntervalTimerDelete(id)
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// TimerSettime implements linux syscall timer_settime(2).
+func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := linux.TimerID(args[0].Value)
+	flags := args[1].Int()
+	nextAddr := args[2].Pointer()
+	prevAddr := args[3].Pointer()
+	var next linux.Itimerspec
+	if _, err := t.CopyIn(nextAddr, &next); err != nil {
+		return 0, nil, err
+	}
+	absolute := flags&linux.TIMER_ABSTIME != 0
+	prev, err := t.IntervalTimerSettime(id, next, absolute)
+	if err != nil {
+		return 0, nil, err
+	}
+	// The previous setting is only copied out when an address was given.
+	if prevAddr == 0 {
+		return 0, nil, nil
+	}
+	_, err = t.CopyOut(prevAddr, &prev)
+	return 0, nil, err
+}
+
+// TimerGettime implements linux syscall timer_gettime(2).
+//
+// The timer's current setting is copied out to the address given as the
+// second argument.
+func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := linux.TimerID(args[0].Value)
+	setting, err := t.IntervalTimerGettime(id)
+	if err == nil {
+		_, err = t.CopyOut(args[1].Pointer(), &setting)
+	}
+	return 0, nil, err
+}
+
+// TimerGetoverrun implements linux syscall timer_getoverrun(2).
+//
+// On success the syscall's return value is the timer's overrun count.
+func TimerGetoverrun(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	overruns, err := t.IntervalTimerGetoverrun(linux.TimerID(args[0].Value))
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(overruns), nil, nil
+}
+
+// TimerDelete implements linux syscall timer_delete(2).
+func TimerDelete(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The timer ID is the raw value of the first argument.
+	return 0, nil, t.IntervalTimerDelete(linux.TimerID(args[0].Value))
+}
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
new file mode 100644
index 000000000..cf49b43db
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -0,0 +1,121 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements timerfd_create(2). Only TFD_CLOEXEC and
+// TFD_NONBLOCK are accepted as flags; CLOCK_MONOTONIC and CLOCK_BOOTTIME
+// both select the kernel's monotonic clock.
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID, flags := args[0].Int(), args[1].Int()
+
+	if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	nonBlocking := flags&linux.TFD_NONBLOCK != 0
+	closeOnExec := flags&linux.TFD_CLOEXEC != 0
+
+	var clock ktime.Clock
+	switch clockID {
+	case linux.CLOCK_REALTIME:
+		clock = t.Kernel().RealtimeClock()
+	case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+		clock = t.Kernel().MonotonicClock()
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Release this function's reference on the new file when returning.
+	file := timerfd.NewFile(t, clock)
+	defer file.DecRef()
+	file.SetFlags(fs.SettableFileFlags{NonBlocking: nonBlocking})
+
+	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{CloseOnExec: closeOnExec})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// TimerfdSettime implements timerfd_settime(2). TFD_TIMER_ABSTIME is the only
+// flag accepted; fd must refer to a file backed by *timerfd.TimerOperations.
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, flags := args[0].Int(), args[1].Int()
+	newAddr, oldAddr := args[2].Pointer(), args[3].Pointer()
+
+	if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	absolute := flags&linux.TFD_TIMER_ABSTIME != 0
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	timer, isTimer := file.FileOperations.(*timerfd.TimerOperations)
+	if !isTimer {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var spec linux.Itimerspec
+	if _, err := t.CopyIn(newAddr, &spec); err != nil {
+		return 0, nil, err
+	}
+	setting, err := ktime.SettingFromItimerspec(spec, absolute, timer.Clock())
+	if err != nil {
+		return 0, nil, err
+	}
+	tm, prev := timer.SetTime(setting)
+	if oldAddr == 0 {
+		return 0, nil, nil
+	}
+	// Convert the setting returned by SetTime and copy it out to the caller.
+	prevSpec := ktime.ItimerspecFromSetting(tm, prev)
+	_, err = t.CopyOut(oldAddr, &prevSpec)
+	return 0, nil, err
+}
+
+// TimerfdGettime implements timerfd_gettime(2): the timer's current setting is
+// converted to an itimerspec and copied to the user address in args[1].
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, dst := args[0].Int(), args[1].Pointer()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	timer, isTimer := file.FileOperations.(*timerfd.TimerOperations)
+	if !isTimer {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// GetTime's two results feed ItimerspecFromSetting directly.
+	spec := ktime.ItimerspecFromSetting(timer.GetTime())
+	_, err := t.CopyOut(dst, &spec)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
new file mode 100644
index 000000000..b3eb96a1c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
@@ -0,0 +1,52 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ArchPrctl implements arch_prctl(2) on amd64. ARCH_GET_FS and ARCH_SET_FS
+// read and write the value managed by t.Arch().TLS/SetTLS; the GS operations
+// emit an unimplemented event and, like unknown codes, fail with EINVAL.
+func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	switch code := args[0].Int(); code {
+	case linux.ARCH_GET_FS:
+		// Copy the current value out to args[1] as a 64-bit integer.
+		fsbase := t.Arch().TLS()
+		_, err := t.CopyOut(args[1].Pointer(), uint64(fsbase))
+		return 0, nil, err
+
+	case linux.ARCH_SET_FS:
+		// A false result from SetTLS is reported as EPERM.
+		if ok := t.Arch().SetTLS(uintptr(args[1].Uint64())); !ok {
+			return 0, nil, syserror.EPERM
+		}
+		return 0, nil, nil
+
+	case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
+		// Not implemented: emit the event, then fail like an unknown code.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.EINVAL
+
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go
new file mode 100644
index 000000000..fb08a356e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ArchPrctl is the arm64 stand-in for arch_prctl(2); it always fails with ENOSYS.
+func ArchPrctl(_ *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ENOSYS
+}
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
new file mode 100644
index 000000000..e9d702e8e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -0,0 +1,95 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Uname implements uname(2). It assembles a linux.UtsName from the syscall
+// table's version strings and the task's UTS namespace, then copies it to the
+// user buffer in args[0].
+func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Pick the machine name matching the syscall table's architecture.
+	machine := "unknown"
+	switch t.SyscallTable().Arch {
+	case arch.AMD64:
+		machine = "x86_64"
+	case arch.ARM64:
+		machine = "aarch64"
+	}
+
+	ver := t.SyscallTable().Version
+	ns := t.UTSNamespace()
+
+	var out linux.UtsName
+	copy(out.Sysname[:], ver.Sysname)
+	copy(out.Nodename[:], ns.HostName())
+	copy(out.Release[:], ver.Release)
+	copy(out.Version[:], ver.Version)
+	copy(out.Machine[:], machine)
+	copy(out.Domainname[:], ns.DomainName())
+
+	// Copy the populated structure out to the caller.
+	_, err := t.CopyOut(args[0].Pointer(), out)
+	return 0, nil, err
+}
+
+// Setdomainname implements setdomainname(2). It requires CAP_SYS_ADMIN in the
+// UTS namespace's user namespace and rejects sizes outside [0, UTSLen].
+func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nameAddr, size := args[0].Pointer(), args[1].Int()
+	utsns := t.UTSNamespace()
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+	if size < 0 || size > linux.UTSLen {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// setdomainname(2) does not require a NUL terminator, so read exactly
+	// size bytes (as Sethostname does) rather than a NUL-terminated string.
+	name := make([]byte, size)
+	if _, err := t.CopyInBytes(nameAddr, name); err != nil {
+		return 0, nil, err
+	}
+	utsns.SetDomainName(string(name))
+	return 0, nil, nil
+}
+
+// Sethostname implements sethostname(2). The caller needs CAP_SYS_ADMIN in the
+// UTS namespace's user namespace; exactly args[1] bytes are read as the name.
+func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr, length := args[0].Pointer(), args[1].Int()
+
+	ns := t.UTSNamespace()
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+	if length < 0 || length > linux.UTSLen {
+		return 0, nil, syserror.EINVAL
+	}
+
+	hostname := make([]byte, length)
+	_, err := t.CopyInBytes(addr, hostname)
+	if err != nil {
+		return 0, nil, err
+	}
+	ns.SetHostName(string(hostname))
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
new file mode 100644
index 000000000..6ec0de96e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -0,0 +1,364 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+const (
+	// EventMaskWrite is the event mask the blocking write paths wait on.
+	//
+	// EventHUp is not going to happen for pipes, but implementations of
+	// poll on some sockets may raise it; see net/core/datagram.c in Linux.
+	EventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Write implements write(2): it writes up to args[2] bytes from the user
+// buffer at args[1] to the file referenced by args[0], using writev.
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, buf, count := args[0].Int(), args[1].Pointer(), args[2].SizeT()
+
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// A file whose flags lack Write is rejected with EBADF.
+	if !f.Flags().Write {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Sizes that do not fit in a non-negative int are invalid.
+	length := int(count)
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Wrap the user buffer in an IOSequence via SingleIOSequence.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.SingleIOSequence(buf, length, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Account the bytes written before translating the error.
+	written, err := writev(t, f, src)
+	t.IOUsage().AccountWriteSyscall(written)
+	err = handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "write", f)
+	return uintptr(written), nil, err
+}
+
+// Pwrite64 implements pwrite64(2): a positional write of args[2] bytes from
+// the user buffer at args[1] to the file in args[0], starting at the offset
+// in args[3].
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	buf := args[1].Pointer()
+	count := args[2].SizeT()
+	offset := args[3].Int64()
+
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	length := int(count)
+
+	// The first failing check, in this order, decides the errno.
+	switch {
+	case offset < 0 || offset+int64(count) < 0:
+		// Negative offset, or offset+size wraps around to a negative value.
+		return 0, nil, syserror.EINVAL
+	case !f.Flags().Pwrite:
+		// The file does not support writing at an offset.
+		return 0, nil, syserror.ESPIPE
+	case !f.Flags().Write:
+		// The file is not writable.
+		return 0, nil, syserror.EBADF
+	case length < 0:
+		// The size does not fit in a non-negative int.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Wrap the user buffer in an IOSequence via SingleIOSequence.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.SingleIOSequence(buf, length, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Account the bytes written before translating the error.
+	written, err := pwritev(t, f, src, offset)
+	t.IOUsage().AccountWriteSyscall(written)
+	err = handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "pwrite64", f)
+	return uintptr(written), nil, err
+}
+
+// Writev implements writev(2): it reads args[2] iovecs from the user array at
+// args[1] and writes the data they describe to the file in args[0].
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, iovAddr := args[0].Int(), args[1].Pointer()
+	iovCount := int(args[2].Int())
+
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// A file whose flags lack Write is rejected with EBADF.
+	if !f.Flags().Write {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Build the source IOSequence from the user-supplied iovec array.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.IovecsIOSequence(iovAddr, iovCount, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	written, err := writev(t, f, src)
+	t.IOUsage().AccountWriteSyscall(written)
+	err = handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "writev", f)
+	return uintptr(written), nil, err
+}
+
+// Pwritev implements pwritev(2): a positional writev of the args[2] iovecs at
+// args[1] to the file in args[0], starting at the offset in args[3].
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+	offset := args[3].Int64()
+
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// The first failing check, in this order, decides the errno.
+	switch {
+	case offset < 0:
+		// Negative offsets are invalid.
+		return 0, nil, syserror.EINVAL
+	case !f.Flags().Pwrite:
+		// The file does not support writing at an offset.
+		return 0, nil, syserror.ESPIPE
+	case !f.Flags().Write:
+		// The file is not writable.
+		return 0, nil, syserror.EBADF
+	}
+
+	// Build the source IOSequence from the user-supplied iovec array.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.IovecsIOSequence(iovAddr, iovCount, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Account the bytes written before translating the error.
+	written, err := pwritev(t, f, src, offset)
+	t.IOUsage().AccountWriteSyscall(written)
+	err = handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "pwritev", f)
+	return uintptr(written), nil, err
+}
+
+// Pwritev2 implements linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the syscall is
+	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the linux internal call
+	// (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1354)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 5th argument.
+
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := int(args[5].Int())
+
+	if int(args[4].Int())&0x4 == 1 {
+		return 0, nil, syserror.EACCES
+	}
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is writing at an offset supported?
+	if offset > -1 && !file.Flags().Pwrite {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	// Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+	// accepted as a valid flag argument for pwritev2.
+	if flags&^linux.RWF_VALID != 0 {
+		return uintptr(flags), nil, syserror.EOPNOTSUPP
+	}
+
+	// Check that the file is writeable.
+	if !file.Flags().Write {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Read the iovecs that specify the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// If pwritev2 is called with an offset of -1, writev is called.
+	if offset == -1 {
+		n, err := writev(t, file, src)
+		t.IOUsage().AccountWriteSyscall(n)
+		return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+	}
+
+	n, err := pwritev(t, file, src, offset)
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+// writev writes src to f with f.Writev, blocking and retrying on ErrWouldBlock unless f is non-blocking.
+func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) {
+	n, err := f.Writev(t, src)
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		if n > 0 {
+			// Queue an IN_MODIFY inotify event if we wrote anything.
+			f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+		}
+		return n, err
+	}
+
+	// Sockets support write timeouts: negative returns now, positive sets a deadline.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if s, ok := f.FileOperations.(socket.Socket); ok {
+		dl := s.SendTimeout()
+		if dl < 0 && err == syserror.ErrWouldBlock {
+			return n, err
+		}
+		if dl > 0 {
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+			haveDeadline = true
+		}
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&w, EventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst64(n)
+
+		// Retry the write; anything other than "would block" ends the loop.
+		n, err = f.Writev(t, src)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a retry notification; ETIMEDOUT is reported as "would block".
+		if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+
+	f.EventUnregister(&w)
+
+	if total > 0 {
+		// Queue an IN_MODIFY inotify event if we wrote anything.
+		f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
+
+	return total, err
+}
+
+// pwritev writes src to f at offset with f.Pwritev, blocking and retrying on ErrWouldBlock unless f is non-blocking.
+func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	n, err := f.Pwritev(t, src, offset)
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		if n > 0 {
+			// Queue an IN_MODIFY inotify event if we wrote anything.
+			f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&w, EventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst64(n)
+
+		// Retry past the bytes already written; anything but "would block" ends the loop.
+		n, err = f.Pwritev(t, src, offset+total)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		if err = t.Block(ch); err != nil {
+			break
+		}
+	}
+
+	f.EventUnregister(&w)
+
+	if total > 0 {
+		// Queue an IN_MODIFY inotify event if we wrote anything.
+		f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
+
+	return total, err
+}
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
new file mode 100644
index 000000000..c24946160
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -0,0 +1,432 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChange
+
+// GetXattr implements getxattr(2); symlink resolution is enabled for the path lookup.
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, true /* resolveSymlink */)
+}
+
+// LGetXattr implements lgetxattr(2); symlink resolution is disabled for the path lookup.
+func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, false /* resolveSymlink */)
+}
+
+// FGetXattr implements fgetxattr(2): getxattr on the file referenced by the
+// descriptor in args[0] rather than on a path.
+func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr, valueAddr := args[1].Pointer(), args[2].Pointer()
+	bufSize := uint64(args[3].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// On failure the syscall returns 0 alongside the error.
+	length, err := getXattr(t, file.Dirent, nameAddr, valueAddr, bufSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(length), nil, nil
+}
+
+func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		n, err = getXattr(t, d, nameAddr, valueAddr, size)
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+// getXattr implements getxattr(2) for the given *fs.Dirent. It returns the
+// length of the attribute value; with size 0 nothing is copied out and only
+// the length is reported.
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64) (int, error) {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, err
+	}
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+		return 0, err
+	}
+
+	// TODO(b/148380782): Support xattrs in namespaces other than "user".
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// A size of 0 is a length query, so the whole value must be fetched to
+	// report its true length. Oversized requests are clamped to the maximum.
+	sizeQuery := size == 0
+	limit := size
+	if sizeQuery || size > linux.XATTR_SIZE_MAX {
+		limit = linux.XATTR_SIZE_MAX
+	}
+
+	value, err := d.Inode.GetXattr(t, name, limit)
+	if err != nil {
+		return 0, err
+	}
+	valueLen := len(value)
+	switch {
+	case uint64(valueLen) > limit:
+		return 0, syserror.ERANGE
+	case sizeQuery:
+		// Length query: report the size without touching valueAddr.
+		return valueLen, nil
+	}
+
+	if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+		return 0, err
+	}
+	return valueLen, nil
+}
+
+// SetXattr implements setxattr(2); symlink resolution is enabled for the path lookup.
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, true /* resolveSymlink */)
+}
+
+// LSetXattr implements lsetxattr(2); symlink resolution is disabled for the path lookup.
+func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, false /* resolveSymlink */)
+}
+
+// FSetXattr implements fsetxattr(2): setxattr on the file referenced by the
+// descriptor in args[0] rather than on a path.
+func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr, valueAddr := args[1].Pointer(), args[2].Pointer()
+	valueSize := uint64(args[3].SizeT())
+	flags := args[4].Uint()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	err := setXattr(t, file.Dirent, nameAddr, valueAddr, valueSize, flags)
+	return 0, nil, err
+}
+
+// setXattrFromPath is the shared body of setxattr(2) and lsetxattr(2); the
+// path in args[0] is looked up relative to AT_FDCWD.
+func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr, valueAddr := args[1].Pointer(), args[2].Pointer()
+	valueSize := uint64(args[3].SizeT())
+	flags := args[4].Uint()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+	apply := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		return setXattr(t, d, nameAddr, valueAddr, valueSize, flags)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, apply)
+}
+
+// setXattr implements setxattr(2) for the given *fs.Dirent. Only XATTR_CREATE
+// and XATTR_REPLACE are accepted; success queues an IN_ATTRIB event on d.
+func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
+	}
+	attrName, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	// Reject oversized values before allocating the copy-in buffer.
+	if size > linux.XATTR_SIZE_MAX {
+		return syserror.E2BIG
+	}
+	raw := make([]byte, size)
+	if _, err := t.CopyInBytes(valueAddr, raw); err != nil {
+		return err
+	}
+
+	// Names outside the XATTR_USER_PREFIX namespace are not supported.
+	if !strings.HasPrefix(attrName, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	if err := d.Inode.SetXattr(t, d, attrName, string(raw), flags); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
+}
+
+// copyInXattrName copies in the xattr name; ENAMETOOLONG and empty names map to ERANGE.
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	switch {
+	case err == syserror.ENAMETOOLONG:
+		return "", syserror.ERANGE
+	case err != nil:
+		return "", err
+	case len(name) == 0:
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+// xattrFileTypeOk reports whether i is a regular file or a directory, the
+// only file types on which xattrs are currently allowed.
+// TODO(b/148380782): In Linux, this restriction technically only applies to
+// xattrs in the "user.*" namespace. Make file type checks specific to the
+// namespace once we allow other xattr prefixes.
+func xattrFileTypeOk(i *fs.Inode) bool {
+	return fs.IsDir(i.StableAttr) || fs.IsRegular(i.StableAttr)
+}
+
+// checkXattrPermissions defers to i.CheckPermission for supported file types;
+// for any other type it fails with EPERM on writes and ENODATA otherwise.
+func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
+	if xattrFileTypeOk(i) {
+		return i.CheckPermission(t, perms)
+	}
+	if perms.Write {
+		return syserror.EPERM
+	}
+	return syserror.ENODATA
+}
+
+// ListXattr implements listxattr(2); symlink resolution is enabled for the path lookup.
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, true /* resolveSymlink */)
+}
+
+// LListXattr implements llistxattr(2); symlink resolution is disabled for the path lookup.
+func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, false /* resolveSymlink */)
+}
+
+// FListXattr implements flistxattr(2): listxattr on the file referenced by the
+// descriptor in args[0] rather than on a path.
+func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, listAddr := args[0].Int(), args[1].Pointer()
+	bufSize := uint64(args[2].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// On failure the syscall returns 0 alongside the error.
+	length, err := listXattr(t, file.Dirent, listAddr, bufSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(length), nil, nil
+}
+
+func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		n, err = listXattr(t, d, listAddr, size)
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+// listXattr implements listxattr(2) for the given *fs.Dirent. It returns the
+// size in bytes of the NUL-terminated name list; with size 0 nothing is
+// copied out. File types rejected by xattrFileTypeOk yield an empty list.
+func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) {
+	if !xattrFileTypeOk(d.Inode) {
+		return 0, nil
+	}
+
+	// A size of 0 is a length query, so the whole list must be fetched to
+	// compute the buffer size needed. Oversized requests are clamped.
+	sizeQuery := size == 0
+	limit := size
+	if sizeQuery || size > linux.XATTR_SIZE_MAX {
+		limit = linux.XATTR_SIZE_MAX
+	}
+	names, err := d.Inode.ListXattr(t, limit)
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(b/148380782): support namespaces other than "user".
+	for name := range names {
+		if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+			delete(names, name)
+		}
+	}
+
+	total := xattrListSize(names)
+	switch {
+	case total > linux.XATTR_SIZE_MAX:
+		return 0, syserror.E2BIG
+	case uint64(total) > limit:
+		return 0, syserror.ERANGE
+	case sizeQuery:
+		// Length query: report the size without touching addr.
+		return total, nil
+	}
+
+	// Emit each remaining name followed by its NUL terminator.
+	out := make([]byte, 0, total)
+	for name := range names {
+		out = append(out, name...)
+		out = append(out, 0)
+	}
+	if _, err := t.CopyOutBytes(addr, out); err != nil {
+		return 0, err
+	}
+	return len(out), nil
+}
+
+func xattrListSize(xattrs map[string]struct{}) int {
+	total := len(xattrs) // one NUL terminator per name
+	for name := range xattrs {
+		total += len(name) // plus the name's own bytes
+	}
+	return total
+}
+
+// RemoveXattr implements removexattr(2); symlink resolution is enabled for the path lookup.
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, true /* resolveSymlink */)
+}
+
+// LRemoveXattr implements lremovexattr(2); symlink resolution is disabled for the path lookup.
+func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, false /* resolveSymlink */)
+}
+
+// FRemoveXattr implements fremovexattr(2): removexattr on the file referenced
+// by the descriptor in args[0] rather than on a path.
+func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, nameAddr := args[0].Int(), args[1].Pointer()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, removeXattr(t, file.Dirent, nameAddr)
+}
+
+// removeXattrFromPath is the shared body of removexattr(2) and
+// lremovexattr(2); the path in args[0] is looked up relative to AT_FDCWD.
+func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, nameAddr := args[0].Pointer(), args[1].Pointer()
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	remove := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		return removeXattr(t, d, nameAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, remove)
+}
+
+// removeXattr implements removexattr(2) for the given *fs.Dirent; a successful
+// removal queues an IN_ATTRIB inotify event on d.
+func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
+	attrName, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	// Names outside the XATTR_USER_PREFIX namespace are not supported.
+	if !strings.HasPrefix(attrName, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if err := d.Inode.RemoveXattr(t, d, attrName); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
+}
+
+// LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go
new file mode 100644
index 000000000..ddc3ee26e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/timespec.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// copyTimespecIn reads a 16-byte Timespec (two 64-bit fields: Sec, then Nsec)
+// from addr in application memory. Only an architecture width of 8 is
+// handled; any other width yields ENOSYS.
+func copyTimespecIn(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) {
+	if t.Arch().Width() != 8 {
+		return linux.Timespec{}, syserror.ENOSYS
+	}
+
+	buf := t.CopyScratchBuffer(16)
+	if _, err := t.CopyInBytes(addr, buf); err != nil {
+		return linux.Timespec{}, err
+	}
+	return linux.Timespec{
+		Sec:  int64(usermem.ByteOrder.Uint64(buf[0:])),
+		Nsec: int64(usermem.ByteOrder.Uint64(buf[8:])),
+	}, nil
+}
+
+// copyTimespecOut writes ts to addr in application memory as two 64-bit
+// fields (Sec, then Nsec). Architecture widths other than 8 yield ENOSYS.
+func copyTimespecOut(t *kernel.Task, addr usermem.Addr, ts *linux.Timespec) error {
+	if t.Arch().Width() != 8 {
+		return syserror.ENOSYS
+	}
+
+	buf := t.CopyScratchBuffer(16)
+	usermem.ByteOrder.PutUint64(buf[0:], uint64(ts.Sec))
+	usermem.ByteOrder.PutUint64(buf[8:], uint64(ts.Nsec))
+	_, err := t.CopyOutBytes(addr, buf)
+	return err
+}
+
+// copyTimevalIn reads a 16-byte Timeval (two 64-bit fields: Sec, then Usec)
+// from addr in application memory. Only an architecture width of 8 is
+// handled; any other width yields ENOSYS.
+func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) {
+	if t.Arch().Width() != 8 {
+		return linux.Timeval{}, syserror.ENOSYS
+	}
+
+	buf := t.CopyScratchBuffer(16)
+	if _, err := t.CopyInBytes(addr, buf); err != nil {
+		return linux.Timeval{}, err
+	}
+	return linux.Timeval{
+		Sec:  int64(usermem.ByteOrder.Uint64(buf[0:])),
+		Usec: int64(usermem.ByteOrder.Uint64(buf[8:])),
+	}, nil
+}
+
+// copyTimevalOut writes tv to addr in application memory as two 64-bit
+// fields (Sec, then Usec). Architecture widths other than 8 yield ENOSYS.
+func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error {
+	if t.Arch().Width() != 8 {
+		return syserror.ENOSYS
+	}
+
+	buf := t.CopyScratchBuffer(16)
+	usermem.ByteOrder.PutUint64(buf[0:], uint64(tv.Sec))
+	usermem.ByteOrder.PutUint64(buf[8:], uint64(tv.Usec))
+	_, err := t.CopyOutBytes(addr, buf)
+	return err
+}
+
+// copyTimespecInToDuration reads a Timespec from application memory at
+// timespecAddr, validates it, and returns it as a Duration.
+//
+// A NULL timespecAddr yields a negative Duration, meaning "no timeout". An
+// invalid Timespec yields EINVAL. A Timespec too large to be represented in a
+// Duration yields the maximum that Duration will allow.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	if timespecAddr == 0 {
+		// No timespec supplied: signal "no timeout" with a negative value.
+		return time.Duration(-1), nil
+	}
+
+	ts, err := copyTimespecIn(t, timespecAddr)
+	if err != nil {
+		return 0, err
+	}
+	if !ts.Valid() {
+		return 0, syserror.EINVAL
+	}
+
+	return time.Duration(ts.ToNsecCapped()), nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
new file mode 100644
index 000000000..0c740335b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -0,0 +1,76 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "vfs2",  # Target name; same as the enclosing package directory.
+    srcs = [
+        "aio.go",
+        "epoll.go",
+        "eventfd.go",
+        "execve.go",
+        "fd.go",
+        "filesystem.go",
+        "fscontext.go",
+        "getdents.go",
+        "inotify.go",
+        "ioctl.go",
+        "lock.go",
+        "memfd.go",
+        "mmap.go",
+        "mount.go",
+        "path.go",
+        "pipe.go",
+        "poll.go",
+        "read_write.go",
+        "setstat.go",
+        "signal.go",
+        "socket.go",
+        "splice.go",
+        "stat.go",
+        "stat_amd64.go",
+        "stat_arm64.go",
+        "sync.go",
+        "timerfd.go",
+        "vfs2.go",
+        "xattr.go",
+    ],
+    marshal = True,  # NOTE(review): presumably enables go-marshal code generation for this target; confirm in //tools:defs.bzl.
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/bits",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/gohacks",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/eventfd",
+        "//pkg/sentry/fsimpl/pipefs",
+        "//pkg/sentry/fsimpl/signalfd",
+        "//pkg/sentry/fsimpl/timerfd",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/fasync",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/syscalls",
+        "//pkg/sentry/syscalls/linux",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserr",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
new file mode 100644
index 000000000..e5cdefc50
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -0,0 +1,216 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSubmit implements linux syscall io_submit(2).
+//
+// args[0] is the AIO context id, args[1] the number of iocb pointers, and
+// args[2] the address of the pointer array. Callbacks are submitted in array
+// order. On the first failure the number of callbacks already submitted is
+// returned; if none were submitted the error itself is returned. A negative
+// count yields EINVAL.
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	nrEvents := args[1].Int()
+	addr := args[2].Pointer()
+
+	if nrEvents < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// stopAt builds the result for a failure after done callbacks were
+	// submitted: a partial count if done > 0, otherwise the error.
+	stopAt := func(done int32, err error) (uintptr, *kernel.SyscallControl, error) {
+		if done > 0 {
+			return uintptr(done), nil, nil
+		}
+		return 0, nil, err
+	}
+
+	for i := int32(0); i < nrEvents; i++ {
+		// Read the next iocb pointer from the user array.
+		cbAddrNative := t.Arch().Native(0)
+		if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
+			return stopAt(i, err)
+		}
+
+		// Read the iocb that the pointer refers to.
+		var cb linux.IOCallback
+		cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
+		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+			return stopAt(i, err)
+		}
+
+		// Validate and queue it.
+		if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+			return stopAt(i, err)
+		}
+
+		// Advance to the next pointer in the array.
+		addr += usermem.Addr(t.Arch().Width())
+	}
+
+	return uintptr(nrEvents), nil, nil
+}
+
+// submitCallback validates one iocb and queues it on the AIO context id.
+//
+// cbAddr is the user address cb was read from. On success the queued
+// callback holds its own references on the target file and the event file.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+	if cb.Reserved2 != 0 {
+		return syserror.EINVAL
+	}
+
+	target := t.GetFileVFS2(cb.FD)
+	if target == nil {
+		return syserror.EBADF
+	}
+	defer target.DecRef()
+
+	// With IOCB_FLAG_RESFD set, cb.ResFD must name an open file.
+	var notify *vfs.FileDescription
+	if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+		if notify = t.GetFileVFS2(cb.ResFD); notify == nil {
+			return syserror.EBADF
+		}
+		defer notify.DecRef()
+
+		// Check that it is an eventfd.
+		if _, isEventFD := notify.Impl().(*eventfd.EventFileDescription); !isEventFD {
+			return syserror.EINVAL
+		}
+	}
+
+	seq, err := memoryFor(t, cb)
+	if err != nil {
+		return err
+	}
+
+	// Positional reads and writes need a non-negative offset.
+	switch cb.OpCode {
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+		if cb.Offset < 0 {
+			return syserror.EINVAL
+		}
+	}
+
+	// Find the context and prepare it for one more request.
+	actx, found := t.MemoryManager().LookupAIOContext(t, id)
+	if !found {
+		return syserror.EINVAL
+	}
+	if !actx.Prepare() {
+		// Context is busy.
+		return syserror.EAGAIN
+	}
+
+	// The queued callback executes on completion, after the deferred DecRefs
+	// above have run, so it is handed references of its own to release.
+	if notify != nil {
+		notify.IncRef()
+	}
+
+	// Perform the request asynchronously.
+	target.IncRef()
+	t.QueueAIO(getAIOCallback(t, target, notify, cbAddr, cb, seq, actx))
+	return nil
+}
+
+// getAIOCallback returns the closure that runs one queued AIO request. The
+// closure drops the references held on fd and eventFD on every exit path.
+func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
+	return func(ctx context.Context) {
+		// Deferred so the dead-context early return does not leak the files.
+		defer fd.DecRef()
+		if eventFD != nil {
+			defer eventFD.DecRef()
+		}
+		if aioCtx.Dead() {
+			aioCtx.CancelPendingRequest()
+			return
+		}
+		ev := &linux.IOEvent{
+			Data: cb.Data,
+			Obj:  uint64(cbAddr),
+		}
+		var err error
+		switch cb.OpCode {
+		case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+			ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
+		case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+			ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
+		case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
+			err = fd.Sync(ctx)
+		}
+
+		// On failure, report the negated errno as the event result.
+		if err != nil {
+			err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
+			ev.Result = -int64(kernel.ExtractErrno(err, 0))
+		}
+
+		// Queue the result for delivery, then notify the event file if one
+		// was specified; signalling first would race with the woken thread.
+		aioCtx.FinishRequest(ev)
+		if eventFD != nil {
+			eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
+		}
+	}
+}
+
+// memoryFor builds the IOSequence describing the user memory that cb reads
+// from or writes to. Commands that transfer no data get an empty sequence.
+// A byte count that is negative as an int, or an unsupported command, yields
+// EINVAL.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+	length := int(cb.Bytes)
+	if length < 0 {
+		// Linux also requires that this field fit in ssize_t.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+
+	// The I/O runs asynchronously with respect to t's task goroutine, so
+	// there is no guarantee that t's AddressSpace is active while it happens.
+	opts := usermem.IOOpts{AddressSpaceActive: false}
+	bufAddr := usermem.Addr(cb.Buf)
+
+	switch cb.OpCode {
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+		return t.SingleIOSequence(bufAddr, length, opts)
+
+	case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+		return t.IovecsIOSequence(bufAddr, length, opts)
+
+	case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+		return usermem.IOSequence{}, nil
+
+	default:
+		// Not a supported command.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
new file mode 100644
index 000000000..34c90ae3e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -0,0 +1,228 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() // size of one linux.EpollEvent as reported by its SizeBytes method
+
+// EpollCreate1 implements Linux syscall epoll_create1(2). EPOLL_CLOEXEC is
+// the only flag accepted; any other bit yields EINVAL.
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if unknown := flags &^ linux.EPOLL_CLOEXEC; unknown != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0}
+
+	epfile, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer epfile.DecRef()
+
+	newFD, err := t.NewFDFromVFS2(0, epfile, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// EpollCreate implements Linux syscall epoll_create(2).
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+
+	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
+	// than zero" - epoll_create(2)
+	if size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements Linux syscall epoll_ctl(2), adding, modifying or
+// deleting the interest in fd (args[2]) on the epoll instance epfd (args[0]).
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	op := args[1].Int()
+	fd := args[2].Int()
+	eventAddr := args[3].Pointer()
+
+	epollFile := t.GetFileVFS2(epfd)
+	if epollFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epollFile.DecRef()
+	instance, isEpoll := epollFile.Impl().(*vfs.EpollInstance)
+	if !isEpoll {
+		return 0, nil, syserror.EINVAL
+	}
+	target := t.GetFileVFS2(fd)
+	if target == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer target.DecRef()
+	if target == epollFile {
+		return 0, nil, syserror.EINVAL
+	}
+
+	switch op {
+	case linux.EPOLL_CTL_DEL:
+		return 0, nil, instance.DeleteInterest(target, fd)
+	case linux.EPOLL_CTL_ADD, linux.EPOLL_CTL_MOD:
+		// Both operations take an event structure from user memory.
+		var event linux.EpollEvent
+		if _, err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		if op == linux.EPOLL_CTL_ADD {
+			return 0, nil, instance.AddInterest(target, fd, event)
+		}
+		return 0, nil, instance.ModifyInterest(target, fd, event)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+//
+// args are (epfd, events address, maxEvents, timeout in milliseconds). A zero
+// timeout never blocks, a negative timeout blocks without a deadline, and the
+// return value is the number of events copied out to the application.
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
+		return 0, nil, syserror.EINVAL
+	}
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
+	// maxEvents), so that the buffer can be allocated on the stack.
+	var (
+		events       [16]linux.EpollEvent
+		total        int
+		ch           chan struct{}
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	for {
+		batchEvents := len(events)
+		if batchEvents > maxEvents {
+			batchEvents = maxEvents
+		}
+		n := ep.ReadEvents(events[:batchEvents])
+		maxEvents -= n
+		if n != 0 {
+			// Copy out what we read; a partial copy still counts whole events.
+			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+			copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
+			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
+			total += copiedEvents
+			if err != nil {
+				if total != 0 {
+					return uintptr(total), nil, nil
+				}
+				return 0, nil, err
+			}
+			// If we've filled the application's event buffer, we're done.
+			if maxEvents == 0 {
+				return uintptr(total), nil, nil
+			}
+			// Loop if we read a full batch, under the expectation that there
+			// may be more events to read.
+			if n == batchEvents {
+				continue
+			}
+		}
+		// We get here if n != batchEvents. If any events were read (now or in
+		// an earlier iteration), or if timeout is 0 (non-blocking epoll_wait),
+		// return the events read so far to the application.
+		if total != 0 || timeout == 0 {
+			return uintptr(total), nil, nil
+		}
+		// First time here: register for readability events, then loop again so
+		// that ReadEvents() is retried before blocking. Afterwards: block until
+		// events are available, the timeout expires, or an interrupt arrives.
+		if ch == nil {
+			var w waiter.Entry
+			w, ch = waiter.NewChannelEntry(nil)
+			epfile.EventRegister(&w, waiter.EventIn)
+			defer epfile.EventUnregister(&w)
+		} else {
+			// Set up the timer once, if a positive timeout was specified.
+			if timeout > 0 && !haveDeadline {
+				timeoutDur := time.Duration(timeout) * time.Millisecond
+				deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+				haveDeadline = true
+			}
+			if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+				if err == syserror.ETIMEDOUT {
+					err = nil
+				}
+				// total must be 0 here; otherwise we would have returned above.
+				return 0, nil, err
+			}
+		}
+	}
+}
+
+// EpollPwait implements Linux syscall epoll_pwait(2). It passes the mask
+// address (args[4]) and mask size (args[5]) to setTempSignalSet and then
+// proceeds exactly as EpollWait with the same arguments.
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sigmaskAddr, sigmaskSize := args[4].Pointer(), uint(args[5].Uint())
+	if err := setTempSignalSet(t, sigmaskAddr, sigmaskSize); err != nil {
+		return 0, nil, err
+	}
+
+	return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
new file mode 100644
index 000000000..aff1a2070
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Eventfd2 implements linux syscall eventfd2(2). args[0] is the initial value
+// and args[1] the flags: EFD_SEMAPHORE, EFD_NONBLOCK and/or EFD_CLOEXEC.
+func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	initVal := uint64(args[0].Uint())
+	flags := uint(args[1].Uint())
+
+	supported := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)
+	if flags&^supported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// The file is created with O_RDWR; EFD_NONBLOCK adds O_NONBLOCK.
+	statusFlags := uint32(linux.O_RDWR)
+	if flags&linux.EFD_NONBLOCK != 0 {
+		statusFlags |= linux.O_NONBLOCK
+	}
+	semaphore := flags&linux.EFD_SEMAPHORE != 0
+	efile, err := eventfd.New(t.Kernel().VFS(), initVal, semaphore, statusFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer efile.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.EFD_CLOEXEC != 0}
+	newFD, err := t.NewFDFromVFS2(0, efile, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newFD), nil, nil
+}
+
+// Eventfd implements linux syscall eventfd(2) as Eventfd2 with zero flags.
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[1].Value = 0 // Clear the slot that Eventfd2 reads its flags from.
+	return Eventfd2(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
new file mode 100644
index 000000000..aef0078a8
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathnameAddr := args[0].Pointer()
+	argvAddr := args[1].Pointer()
+	envvAddr := args[2].Pointer()
+	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+// execveat is the shared implementation of Execve and Execveat. Only the
+// AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW bits are accepted in flags.
+func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+	if err != nil {
+		return 0, nil, err
+	}
+	var argv, envv []string
+	if argvAddr != 0 {
+		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+	if envvAddr != 0 {
+		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	var execFile fsbridge.File
+	var closeOnExec bool
+	if parsed := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !parsed.Absolute {
+		// The executable has to be opened here: dirfd is the starting point
+		// for resolving the path, whereas interpreters are resolved from the
+		// task working directory (Linux: fs/binfmt_script.c:load_script() =>
+		// fs/exec.c:open_exec() => do_open_execat(fd=AT_FDCWD)), and the
+		// loader package is currently incapable of handling this
+		// correctly.
+		if !parsed.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		dir, dirFlags := t.FDTable().GetVFS2(dirfd)
+		if dir == nil {
+			return 0, nil, syserror.EBADF
+		}
+		startDentry := dir.VirtualDentry()
+		startDentry.IncRef()
+		dir.DecRef()
+		closeOnExec = dirFlags.CloseOnExec
+		opened, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
+			Root:               root,
+			Start:              startDentry,
+			Path:               parsed,
+			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		}, &vfs.OpenOptions{
+			Flags:    linux.O_RDONLY,
+			FileExec: true,
+		})
+		startDentry.DecRef()
+		if err != nil {
+			return 0, nil, err
+		}
+		defer opened.DecRef()
+		execFile = fsbridge.NewVFSFile(opened)
+	}
+
+	// Load the new TaskContext.
+	mountNS := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
+	defer mountNS.DecRef()
+	cwd := t.FSContext().WorkingDirectoryVFS2()
+	defer cwd.DecRef()
+	traversalsLeft := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Opener:              fsbridge.NewVFSLookup(mountNS, root, cwd),
+		RemainingTraversals: &traversalsLeft,
+		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		Filename:            pathname,
+		File:                execFile,
+		CloseOnExec:         closeOnExec,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	image, loadErr := t.Kernel().LoadTaskImage(t, loadArgs)
+	if loadErr != nil {
+		return 0, nil, loadErr.ToError()
+	}
+
+	ctrl, err := t.Execve(image)
+	return 0, ctrl, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
new file mode 100644
index 000000000..517394ba9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -0,0 +1,355 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Close implements Linux syscall close(2). EBADF is returned if fd is not
+// open; otherwise the file's OnClose error is passed to HandleIOErrorVFS2.
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Remove hands back a reference on the file, which keeps it usable for
+	// the OnClose call below. The file stays active until that reference is
+	// dropped on return (and other reference-holding operations complete).
+	_, closing := t.FDTable().Remove(args[0].Int())
+	if closing == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer closing.DecRef()
+
+	closeErr := closing.OnClose(t)
+	err := slinux.HandleIOErrorVFS2(t, false /* partial */, closeErr, syserror.EINTR, "close", closing)
+	return 0, nil, err
+}
+
+// Dup implements Linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, syserror.EMFILE
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements Linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd, newfd := args[0].Int(), args[1].Int()
+	if oldfd != newfd {
+		return dup3(t, oldfd, newfd, 0)
+	}
+
+	// Same descriptor on both sides: dup2() does nothing and returns newfd,
+	// provided that oldfd is valid. The lookup exists only to check that, so
+	// the reference it returns is dropped immediately.
+	existing := t.GetFileVFS2(oldfd)
+	if existing == nil {
+		return 0, nil, syserror.EBADF
+	}
+	existing.DecRef()
+	return uintptr(newfd), nil, nil
+}
+
+// Dup3 implements Linux syscall dup3(2). Unlike Dup2, passing the same
+// descriptor as both oldfd and newfd is rejected with EINVAL; args[2] carries
+// the flags handed on to dup3.
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd, newfd := args[0].Int(), args[1].Int()
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	flags := args[2].Uint()
+	return dup3(t, oldfd, newfd, flags)
+}
+
+// dup3 installs the file open at oldfd as newfd. O_CLOEXEC is the only flag
+// accepted; it sets CloseOnExec on the new descriptor.
+func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^linux.O_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
+
+	src := t.GetFileVFS2(oldfd)
+	if src == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer src.DecRef()
+
+	if err := t.NewFDAtVFS2(newfd, src, fdFlags); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2). Commands not handled below fail
+// with EINVAL; commands that produce no value return zero on success.
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	cmd := args[1].Int()
+
+	file, flags := t.FDTable().GetVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		minfd := args[2].Int()
+		fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{
+			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+		})
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+		return 0, nil, err
+	case linux.F_GETFL:
+		return uintptr(file.StatusFlags()), nil, nil
+	case linux.F_SETFL:
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	case linux.F_SETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(n), nil, nil
+	case linux.F_GETOWN:
+		owner, hasOwner := getAsyncOwner(t, file)
+		if !hasOwner {
+			return 0, nil, nil
+		}
+		if owner.Type == linux.F_OWNER_PGRP {
+			return uintptr(-owner.PID), nil, nil
+		}
+		return uintptr(owner.PID), nil, nil
+	case linux.F_SETOWN:
+		who := args[2].Int()
+		ownerType := int32(linux.F_OWNER_PID)
+		if who < 0 {
+			// Check for overflow before flipping the sign.
+			if who-1 > who {
+				return 0, nil, syserror.EINVAL
+			}
+			ownerType = linux.F_OWNER_PGRP
+			who = -who
+		}
+		return 0, nil, setAsyncOwner(t, file, ownerType, who)
+	case linux.F_GETOWN_EX:
+		owner, hasOwner := getAsyncOwner(t, file)
+		if !hasOwner {
+			return 0, nil, nil
+		}
+		_, err := t.CopyOut(args[2].Pointer(), &owner)
+		return 0, nil, err
+	case linux.F_SETOWN_EX:
+		var owner linux.FOwnerEx
+		if _, err := t.CopyIn(args[2].Pointer(), &owner); err != nil {
+			return 0, nil, err
+		}
+		// Success is reported as zero, not as the number of bytes copied in.
+		return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
+	case linux.F_GETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		return uintptr(pipefile.PipeSize()), nil, nil
+	case linux.F_GET_SEALS:
+		val, err := tmpfs.GetSeals(file)
+		return uintptr(val), nil, err
+	case linux.F_ADD_SEALS:
+		if !file.IsWritable() {
+			return 0, nil, syserror.EPERM
+		}
+		return 0, nil, tmpfs.AddSeals(file, args[2].Uint())
+	case linux.F_SETLK, linux.F_SETLKW:
+		return 0, nil, posixLock(t, args, file, cmd)
+	default:
+		// TODO(gvisor.dev/issue/2920): Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// getAsyncOwner looks up the owner recorded on fd's async handler and
+// describes it using IDs taken from t's PID namespace.
+//
+// hasOwner is false only when fd has no async handler. When a handler exists
+// but Owner() reports no task, thread group, or process group, the zero
+// FOwnerEx is returned together with hasOwner == true.
+func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
+	handler := fd.AsyncHandler()
+	if handler == nil {
+		return ownerEx, false
+	}
+
+	// The first non-nil owner wins, checked in task, thread group, process
+	// group order. If none is set, ownerEx keeps its zero value.
+	task, tg, pg := handler.(*fasync.FileAsync).Owner()
+	switch {
+	case task != nil:
+		ownerEx.Type = linux.F_OWNER_TID
+		ownerEx.PID = int32(t.PIDNamespace().IDOfTask(task))
+	case tg != nil:
+		ownerEx.Type = linux.F_OWNER_PID
+		ownerEx.PID = int32(t.PIDNamespace().IDOfThreadGroup(tg))
+	case pg != nil:
+		ownerEx.Type = linux.F_OWNER_PGRP
+		ownerEx.PID = int32(t.PIDNamespace().IDOfProcessGroup(pg))
+	}
+	return ownerEx, true
+}
+
+// setAsyncOwner records the task, thread group, or process group identified
+// by (ownerType, pid) in t's PID namespace as the owner of the async handler
+// obtained from fd.SetAsyncHandler(fasync.NewVFS2).
+//
+// It returns EINVAL for an unrecognized ownerType and ESRCH when pid does not
+// resolve in t's PID namespace. A pid of 0 clears the current owner.
+func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error {
+	// Reject unknown owner types before touching fd.
+	if ownerType != linux.F_OWNER_TID && ownerType != linux.F_OWNER_PID && ownerType != linux.F_OWNER_PGRP {
+		return syserror.EINVAL
+	}
+
+	async := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync)
+	if pid == 0 {
+		async.ClearOwner()
+		return nil
+	}
+
+	pidns := t.PIDNamespace()
+	switch ownerType {
+	case linux.F_OWNER_TID:
+		if task := pidns.TaskWithID(kernel.ThreadID(pid)); task != nil {
+			async.SetOwnerTask(t, task)
+			return nil
+		}
+	case linux.F_OWNER_PID:
+		if tg := pidns.ThreadGroupWithID(kernel.ThreadID(pid)); tg != nil {
+			async.SetOwnerThreadGroup(t, tg)
+			return nil
+		}
+	case linux.F_OWNER_PGRP:
+		if pg := pidns.ProcessGroupWithID(kernel.ProcessGroupID(pid)); pg != nil {
+			async.SetOwnerProcessGroup(t, pg)
+			return nil
+		}
+	default:
+		return syserror.EINVAL
+	}
+
+	// The lookup for the requested ID came back empty.
+	return syserror.ESRCH
+}
+
+// posixLock services the F_SETLK and F_SETLKW fcntl commands. It copies a
+// struct flock in from the address in args[2] and applies the requested read
+// lock, write lock, or unlock to file on behalf of t's FD table.
+//
+// A read lock requires file to be readable and a write lock requires it to be
+// writable; otherwise EBADF is returned. Any other lock type yields EINVAL.
+func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error {
+	var req linux.Flock
+	if _, err := t.CopyIn(args[2].Pointer(), &req); err != nil {
+		return err
+	}
+
+	// A blocker is only supplied for F_SETLKW; F_SETLK passes nil.
+	var blocker lock.Blocker
+	if cmd == linux.F_SETLKW {
+		blocker = t
+	}
+
+	start, length := uint64(req.Start), uint64(req.Len)
+	switch req.Type {
+	case linux.F_UNLCK:
+		return file.UnlockPOSIX(t, t.FDTable(), start, length, req.Whence)
+
+	case linux.F_RDLCK:
+		if file.IsReadable() {
+			return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, start, length, req.Whence, blocker)
+		}
+		return syserror.EBADF
+
+	case linux.F_WRLCK:
+		if file.IsWritable() {
+			return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, start, length, req.Whence, blocker)
+		}
+		return syserror.EBADF
+	}
+	return syserror.EINVAL
+}
+
+// Fadvise64 implements fadvise64(2).
+//
+// The advice is validated but otherwise ignored. Errors are checked in this
+// order: EINVAL for a negative length, EBADF for an invalid fd, ESPIPE when
+// the fd is a pipe, and EINVAL for an unknown advice value. The offset
+// argument is not read at all, so negative offsets are accepted.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	if length := args[2].Int64(); length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	advice := args[3].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Pipes and FIFOs cannot take advice.
+	if _, ok := file.Impl().(*pipe.VFSPipeFD); ok {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	switch advice {
+	case linux.POSIX_FADV_NORMAL,
+		linux.POSIX_FADV_RANDOM,
+		linux.POSIX_FADV_SEQUENTIAL,
+		linux.POSIX_FADV_WILLNEED,
+		linux.POSIX_FADV_DONTNEED,
+		linux.POSIX_FADV_NOREUSE:
+		// Recognized advice; nothing is done with it.
+		return 0, nil, nil
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
new file mode 100644
index 000000000..6b14c2bef
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -0,0 +1,384 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Link implements Linux syscall link(2).
+//
+// It is linkat with both paths resolved relative to AT_FDCWD and no flags.
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := linkat(t, linux.AT_FDCWD, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer(), 0 /* flags */)
+	return 0, nil, err
+}
+
+// Linkat implements Linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+		return syserror.ENOENT
+	}
+
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
+}
+
+// Mkdir implements Linux syscall mkdir(2).
+//
+// It is mkdirat with the path resolved relative to AT_FDCWD.
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := mkdirat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].ModeT())
+	return 0, nil, err
+}
+
+// Mkdirat implements Linux syscall mkdirat(2).
+//
+// Arguments, in order: dirfd, path address, mode.
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := mkdirat(t, args[0].Int(), args[1].Pointer(), args[2].ModeT())
+	return 0, nil, err
+}
+
+// mkdirat is the shared implementation of mkdir(2) and mkdirat(2). It creates
+// a directory at the path read from addr, resolved relative to dirfd without
+// following a final symlink.
+//
+// Only the permission bits (0777) and the sticky bit of mode are kept, and
+// the task's umask is then cleared from the result.
+func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
+	dirPath, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, dirPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	perms := mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()
+	opts := vfs.MkdirOptions{Mode: linux.FileMode(perms)}
+	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &opts)
+}
+
+// Mknod implements Linux syscall mknod(2).
+//
+// It is mknodat with the path resolved relative to AT_FDCWD.
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mode := linux.FileMode(args[1].ModeT())
+	err := mknodat(t, linux.AT_FDCWD, args[0].Pointer(), mode, args[2].Uint())
+	return 0, nil, err
+}
+
+// Mknodat implements Linux syscall mknodat(2).
+//
+// Arguments, in order: dirfd, path address, mode, device ID.
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mode := linux.FileMode(args[2].ModeT())
+	err := mknodat(t, args[0].Int(), args[1].Pointer(), mode, args[3].Uint())
+	return 0, nil, err
+}
+
+// mknodat is the shared implementation of mknod(2) and mknodat(2). It creates
+// a filesystem node at the path read from addr, resolved relative to dirfd
+// without following a final symlink.
+//
+// A mode with no file type is treated as a regular file, and the task's umask
+// is cleared from mode. dev is split into major and minor numbers with
+// linux.DecodeDeviceID.
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error {
+	nodePath, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, nodePath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+	if mode.FileType() == 0 {
+		mode |= linux.ModeRegular
+	}
+	major, minor := linux.DecodeDeviceID(dev)
+	opts := vfs.MknodOptions{
+		Mode:     mode &^ linux.FileMode(t.FSContext().Umask()),
+		DevMajor: uint32(major),
+		DevMinor: minor,
+	}
+	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &opts)
+}
+
+// Open implements Linux syscall open(2).
+//
+// It is openat with the path resolved relative to AT_FDCWD.
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return openat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Uint(), args[2].ModeT())
+}
+
+// Openat implements Linux syscall openat(2).
+//
+// Arguments, in order: dirfd, path address, flags, mode.
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return openat(t, args[0].Int(), args[1].Pointer(), args[2].Uint(), args[3].ModeT())
+}
+
+// Creat implements Linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
+}
+
+func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
+		Flags: flags | linux.O_LARGEFILE,
+		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	return uintptr(fd), nil, err
+}
+
+// Rename implements Linux syscall rename(2).
+//
+// It is renameat with both paths resolved relative to AT_FDCWD and no flags.
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := renameat(t, linux.AT_FDCWD, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer(), 0 /* flags */)
+	return 0, nil, err
+}
+
+// Renameat implements Linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
+}
+
+// Renameat2 implements Linux syscall renameat2(2).
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Uint()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+// renameat is the shared implementation of rename(2), renameat(2) and
+// renameat2(2). Both paths are resolved without following a final symlink,
+// and flags is forwarded unchanged to the VFS in vfs.RenameOptions.
+func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
+	src, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
+	srcOp, err := getTaskPathOperation(t, olddirfd, src, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer srcOp.Release()
+
+	dst, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	dstOp, err := getTaskPathOperation(t, newdirfd, dst, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer dstOp.Release()
+
+	opts := vfs.RenameOptions{Flags: flags}
+	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &srcOp.pop, &dstOp.pop, &opts)
+}
+
+// Fallocate implements linux system call fallocate(2).
+//
+// Only mode 0 is supported. Errors are checked in this order: EBADF for an
+// invalid or non-writable fd, ENOTSUP for a non-zero mode, EINVAL for a
+// negative offset or non-positive length, and EFBIG when offset+length
+// overflows or reaches the task's file size limit. In the limit case SIGXFSZ
+// is also sent to the task.
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].Uint64()
+	offset := args[2].Int64()
+	length := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch {
+	case !file.IsWritable():
+		return 0, nil, syserror.EBADF
+	case mode != 0:
+		return 0, nil, syserror.ENOTSUP
+	case offset < 0 || length <= 0:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A negative sum means offset+length overflowed int64.
+	end := offset + length
+	if end < 0 {
+		return 0, nil, syserror.EFBIG
+	}
+
+	if fsizeLimit := limits.FromContext(t).Get(limits.FileSize).Cur; uint64(end) >= fsizeLimit {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(linux.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	// File length modified, generate notification.
+	// TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported.
+	// file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length))
+}
+
+// Rmdir implements Linux syscall rmdir(2).
+//
+// It is rmdirat with the path resolved relative to AT_FDCWD.
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := rmdirat(t, linux.AT_FDCWD, args[0].Pointer())
+	return 0, nil, err
+}
+
+// rmdirat removes the directory at the path read from pathAddr, resolved
+// relative to dirfd without following a final symlink. It backs rmdir(2) and
+// unlinkat(2) with AT_REMOVEDIR.
+func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	dirPath, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	tpop, err := getTaskPathOperation(t, dirfd, dirPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	vfsObj := t.Kernel().VFS()
+	return vfsObj.RmdirAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlink implements Linux syscall unlink(2).
+//
+// It is unlinkat with the path resolved relative to AT_FDCWD.
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := unlinkat(t, linux.AT_FDCWD, args[0].Pointer())
+	return 0, nil, err
+}
+
+// unlinkat removes the non-directory file at the path read from pathAddr,
+// resolved relative to dirfd without following a final symlink. It backs
+// unlink(2) and unlinkat(2) without AT_REMOVEDIR.
+func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	filePath, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	tpop, err := getTaskPathOperation(t, dirfd, filePath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	vfsObj := t.Kernel().VFS()
+	return vfsObj.UnlinkAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlinkat implements Linux syscall unlinkat(2).
+//
+// AT_REMOVEDIR is the only accepted flag; any other bit yields EINVAL. With
+// AT_REMOVEDIR the call removes a directory via rmdirat, otherwise it removes
+// a file via unlinkat.
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	remove := unlinkat
+	if flags&linux.AT_REMOVEDIR != 0 {
+		remove = rmdirat
+	}
+	return 0, nil, remove(t, dirfd, pathAddr)
+}
+
+// Symlink implements Linux syscall symlink(2).
+//
+// It is symlinkat with the link path resolved relative to AT_FDCWD.
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := symlinkat(t, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer())
+	return 0, nil, err
+}
+
+// Symlinkat implements Linux syscall symlinkat(2).
+//
+// Arguments, in order: target address, newdirfd, link path address.
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := symlinkat(t, args[0].Pointer(), args[1].Int(), args[2].Pointer())
+	return 0, nil, err
+}
+
+// symlinkat is the shared implementation of symlink(2) and symlinkat(2). It
+// creates a symbolic link at the path read from linkpathAddr (resolved
+// relative to newdirfd) whose contents are the string read from targetAddr.
+//
+// The target is read with a PATH_MAX limit; an empty target yields ENOENT.
+func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
+	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
+	switch {
+	case err != nil:
+		return err
+	case target == "":
+		return syserror.ENOENT
+	}
+
+	linkPath, err := copyInPath(t, linkpathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, newdirfd, linkPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	vfsObj := t.Kernel().VFS()
+	return vfsObj.SymlinkAt(t, t.Credentials(), &tpop.pop, target)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
new file mode 100644
index 000000000..317409a18
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getcwd implements Linux syscall getcwd(2).
+//
+// It writes the NUL-terminated working directory pathname to the user buffer
+// at args[0] and returns the number of bytes copied out. ERANGE is returned
+// when the buffer size in args[1] cannot hold the pathname plus terminator.
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+
+	// Hold references on root and wd only for the duration of the lookup.
+	root := t.FSContext().RootDirectoryVFS2()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	pathname, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
+	root.DecRef()
+	wd.DecRef()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The buffer must be strictly larger than the pathname so that the
+	// terminator fits.
+	if size <= uint(len(pathname)) {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Stage the pathname followed by a NUL byte in scratch space.
+	out := t.CopyScratchBuffer(len(pathname) + 1)
+	copy(out, pathname)
+	out[len(out)-1] = 0
+
+	written, err := t.CopyOutBytes(addr, out)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(written), nil, nil
+}
+
+// Chdir implements Linux syscall chdir(2).
+//
+// The path is resolved relative to AT_FDCWD, following a final symlink, and
+// the dentry lookup is made with CheckSearchable set. On success the
+// resulting dentry becomes the task's working directory.
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirPath, err := copyInPath(t, args[0].Pointer())
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, dirPath, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	opts := vfs.GetDentryOptions{CheckSearchable: true}
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	// Release the lookup's reference once the FS context has been updated.
+	defer vd.DecRef()
+
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	return 0, nil, nil
+}
+
+// Fchdir implements Linux syscall fchdir(2).
+//
+// The path operation is built from fd with an empty path and allowEmptyPath,
+// and the dentry lookup is made with CheckSearchable set. On success the
+// resulting dentry becomes the task's working directory.
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tpop, err := getTaskPathOperation(t, args[0].Int(), fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	opts := vfs.GetDentryOptions{CheckSearchable: true}
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	// Release the lookup's reference once the FS context has been updated.
+	defer vd.DecRef()
+
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	return 0, nil, nil
+}
+
+// Chroot implements Linux syscall chroot(2).
+//
+// The caller must hold CAP_SYS_CHROOT, otherwise EPERM is returned before the
+// path is read. The path is resolved relative to AT_FDCWD, following a final
+// symlink, and the dentry lookup is made with CheckSearchable set. On success
+// the resulting dentry becomes the task's root directory.
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if allowed := t.HasCapability(linux.CAP_SYS_CHROOT); !allowed {
+		return 0, nil, syserror.EPERM
+	}
+
+	rootPath, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, rootPath, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	opts := vfs.GetDentryOptions{CheckSearchable: true}
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	// Release the lookup's reference once the FS context has been updated.
+	defer vd.DecRef()
+
+	t.FSContext().SetRootDirectoryVFS2(vd)
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
new file mode 100644
index 000000000..c7c7bf7ce
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -0,0 +1,161 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Getdents implements Linux syscall getdents(2).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, false /* isGetdents64 */)
+}
+
+// Getdents64 implements Linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, true /* isGetdents64 */)
+}
+
+// getdents is the shared implementation of getdents(2) and getdents64(2). It
+// iterates the directory entries of the file at args[0] and serializes them
+// into the user buffer at args[1], whose size in bytes is args[2].
+//
+// It returns the number of bytes written. An error from IterDirents is only
+// reported when no bytes were written; otherwise the partial result is
+// returned as a success.
+func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	cb := getGetdentsCallback(t, addr, size, isGetdents64)
+	iterErr := file.IterDirents(t, cb)
+	// Read the byte count before the callback goes back to the pool.
+	written := size - cb.remaining
+	putGetdentsCallback(cb)
+
+	if written != 0 {
+		return uintptr(written), nil, nil
+	}
+	return 0, nil, iterErr
+}
+
+// getdentsCallback carries the per-call state of getdents while a directory
+// is iterated; its Handle method serializes each entry to user memory.
+// Instances are recycled through getdentsCallbackPool.
+//
+// Fields:
+//   - t is the calling task, used for scratch buffers and copy-out. It is
+//     cleared before the callback is returned to the pool.
+//   - addr is the user address at which the next entry will be written.
+//   - remaining is the number of bytes still available at addr.
+//   - isGetdents64 selects the linux_dirent64 layout over linux_dirent.
+type getdentsCallback struct {
+	t            *kernel.Task
+	addr         usermem.Addr
+	remaining    int
+	isGetdents64 bool
+}
+
+// getdentsCallbackPool recycles getdentsCallback objects across getdents
+// calls. Use getGetdentsCallback and putGetdentsCallback rather than
+// accessing the pool directly.
+var getdentsCallbackPool = sync.Pool{
+	New: func() interface{} { return new(getdentsCallback) },
+}
+
+// getGetdentsCallback takes a callback from getdentsCallbackPool and
+// overwrites every field with the given values, so no state from a previous
+// use survives. size is the number of bytes available at addr.
+func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
+	fresh := getdentsCallback{
+		t:            t,
+		addr:         addr,
+		remaining:    size,
+		isGetdents64: isGetdents64,
+	}
+	cb := getdentsCallbackPool.Get().(*getdentsCallback)
+	*cb = fresh
+	return cb
+}
+
+// putGetdentsCallback returns cb to getdentsCallbackPool. The caller must not
+// use cb afterwards.
+func putGetdentsCallback(cb *getdentsCallback) {
+	// Clear the task pointer so the pooled object does not keep it reachable.
+	cb.t = nil
+	getdentsCallbackPool.Put(cb)
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+//
+// It serializes dirent in the linux_dirent64 or linux_dirent layout (selected
+// by cb.isGetdents64), copies the record to user memory at cb.addr, and then
+// advances cb.addr and reduces cb.remaining by the number of bytes written.
+//
+// EINVAL is returned when the record does not fit in cb.remaining. A copy-out
+// error is returned as is, and cb is left unchanged in that case.
+func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
+	var buf []byte
+	if cb.isGetdents64 {
+		// struct linux_dirent64 {
+		//     ino64_t        d_ino;    /* 64-bit inode number */
+		//     off64_t        d_off;    /* 64-bit offset to next structure */
+		//     unsigned short d_reclen; /* Size of this dirent */
+		//     unsigned char  d_type;   /* File type */
+		//     char           d_name[]; /* Filename (null-terminated) */
+		// };
+		// d_ino (8) + d_off (8) + d_reclen (2) + d_type (1) + NUL (1) + name.
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		size = (size + 7) &^ 7 // round up to multiple of 8
+		if size > cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		buf[18] = dirent.Type
+		copy(buf[19:], dirent.Name)
+		// Zero out all remaining bytes in buf, including the NUL terminator
+		// after dirent.Name.
+		bufTail := buf[19+len(dirent.Name):]
+		for i := range bufTail {
+			bufTail[i] = 0
+		}
+	} else {
+		// struct linux_dirent {
+		//     unsigned long  d_ino;     /* Inode number */
+		//     unsigned long  d_off;     /* Offset to next linux_dirent */
+		//     unsigned short d_reclen;  /* Length of this linux_dirent */
+		//     char           d_name[];  /* Filename (null-terminated) */
+		//                       /* length is actually (d_reclen - 2 -
+		//                          offsetof(struct linux_dirent, d_name)) */
+		//     /*
+		//     char           pad;       // Zero padding byte
+		//     char           d_type;    // File type (only since Linux
+		//                               // 2.6.4); offset is (d_reclen - 1)
+		//     */
+		// };
+		// The layout below hard-codes 8-byte unsigned long fields.
+		if cb.t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
+		}
+		// d_ino (8) + d_off (8) + d_reclen (2) + NUL (1) + d_type (1) + name.
+		// NOTE(review): no byte is reserved for the pad shown in the struct
+		// comment; extra zero bytes before d_type only exist when rounding
+		// leaves slack.
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		size = (size + 7) &^ 7 // round up to multiple of sizeof(long)
+		if size > cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		copy(buf[18:], dirent.Name)
+		// Zero out all remaining bytes in buf, including the NUL terminator
+		// after dirent.Name and the zero padding byte between the name and
+		// dirent type.
+		bufTail := buf[18+len(dirent.Name) : size-1]
+		for i := range bufTail {
+			bufTail[i] = 0
+		}
+		// d_type occupies the last byte of the record.
+		buf[size-1] = dirent.Type
+	}
+	n, err := cb.t.CopyOutBytes(cb.addr, buf)
+	if err != nil {
+		// Don't report partially-written dirents by advancing cb.addr or
+		// cb.remaining.
+		return err
+	}
+	cb.addr += usermem.Addr(n)
+	cb.remaining -= n
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..5d98134a5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// allFlags is the set of flag bits that InotifyInit1 accepts; any other bit
+// causes EINVAL.
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+//
+// Flags outside allFlags yield EINVAL. On success a new inotify file
+// description is installed in t's FD table and its descriptor is returned;
+// IN_CLOEXEC sets the close-on-exec flag on that descriptor.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if unknown := flags &^ allFlags; unknown != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+	if err != nil {
+		return 0, nil, err
+	}
+	// Drop the creation reference once the FD is installed (or installation
+	// has failed).
+	defer ino.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.IN_CLOEXEC != 0}
+	fd, err := t.NewFDFromVFS2(0, ino, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+//
+// It behaves like InotifyInit1 with a flags argument of zero.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// inotify_init() takes no flags; force the flags slot read by
+	// InotifyInit1 to zero regardless of what the register held.
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. On success the returned
+// file description carries an extra reference that the caller must release.
+// EBADF is returned when fd is not open and EINVAL when it is not an inotify
+// fd; no reference is held in either error case.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// Invalid fd.
+		return nil, nil, syserror.EBADF
+	}
+
+	if ino, ok := file.Impl().(*vfs.Inotify); ok {
+		return ino, file, nil
+	}
+
+	// Not an inotify fd.
+	file.DecRef()
+	return nil, nil, syserror.EINVAL
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+//
+// Arguments, in order: inotify fd, pathname address, event mask. It returns
+// the watch descriptor produced by Inotify.AddWatch for the dentry the
+// pathname resolves to.
+//
+// Errors: EINVAL if mask contains no valid event bits or fd is not an inotify
+// fd, EBADF if fd is not open, plus any error from reading or resolving the
+// pathname.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	mask := args[2].Uint()
+
+	// "EINVAL: The given event mask contains no valid events."
+	// -- inotify_add_watch(2)
+	if mask&linux.ALL_INOTIFY_BITS == 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+	//  -- inotify(7)
+	//
+	// A final symlink is followed by default and left unresolved only when
+	// the caller sets IN_DONT_FOLLOW.
+	follow := followFinalSymlink
+	if mask&linux.IN_DONT_FOLLOW != 0 {
+		follow = nofollowFinalSymlink
+	}
+
+	ino, f, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer f.DecRef()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// IN_ONLYDIR: mark the path as one that must name a directory.
+	if mask&linux.IN_ONLYDIR != 0 {
+		path.Dir = true
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+	d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer d.DecRef()
+
+	wd, err := ino.AddWatch(d.Dentry(), mask)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(wd), nil, nil
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+//
+// Arguments, in order: inotify fd, watch descriptor. The result of
+// Inotify.RmWatch is returned as the syscall error.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ino, file, err := fdToInotify(t, args[0].Int())
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	return 0, nil, ino.RmWatch(args[1].Int())
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
new file mode 100644
index 000000000..fd6ab94b2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -0,0 +1,107 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Ioctl implements Linux syscall ioctl(2).
+//
+// args[0] is the file descriptor, args[1] is the request and args[2] is the
+// request-specific argument. Requests that apply to every file descriptor
+// (FIONCLEX, FIOCLEX, FIONBIO, FIOASYNC and the async-owner get/set requests)
+// are handled here; every other request is forwarded to the file's own Ioctl
+// implementation. EBADF is returned if args[0] is not an open descriptor.
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Handle ioctls that apply to all FDs.
+	switch args[1].Int() {
+	case linux.FIONCLEX:
+		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+			CloseOnExec: false,
+		})
+		return 0, nil, nil
+
+	case linux.FIOCLEX:
+		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+			CloseOnExec: true,
+		})
+		return 0, nil, nil
+
+	case linux.FIONBIO:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.StatusFlags()
+		if set != 0 {
+			flags |= linux.O_NONBLOCK
+		} else {
+			flags &^= linux.O_NONBLOCK
+		}
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
+
+	case linux.FIOASYNC:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.StatusFlags()
+		if set != 0 {
+			flags |= linux.O_ASYNC
+		} else {
+			flags &^= linux.O_ASYNC
+		}
+		// Report a failure to update the status flags to the caller, exactly
+		// as FIONBIO does, instead of silently claiming success.
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
+
+	case linux.FIOGETOWN, linux.SIOCGPGRP:
+		// who stays 0 when the file has no async owner. Process-group owners
+		// are reported as a negated ID.
+		var who int32
+		owner, hasOwner := getAsyncOwner(t, file)
+		if hasOwner {
+			if owner.Type == linux.F_OWNER_PGRP {
+				who = -owner.PID
+			} else {
+				who = owner.PID
+			}
+		}
+		_, err := t.CopyOut(args[2].Pointer(), &who)
+		return 0, nil, err
+
+	case linux.FIOSETOWN, linux.SIOCSPGRP:
+		var who int32
+		if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil {
+			return 0, nil, err
+		}
+		// A negative value names a process group by its absolute value.
+		ownerType := int32(linux.F_OWNER_PID)
+		if who < 0 {
+			// Check for overflow before flipping the sign: who-1 only wraps
+			// around to a larger value for the minimum int32, whose negation
+			// is not representable.
+			if who-1 > who {
+				return 0, nil, syserror.EINVAL
+			}
+			ownerType = linux.F_OWNER_PGRP
+			who = -who
+		}
+		return 0, nil, setAsyncOwner(t, file, ownerType, who)
+	}
+
+	// Everything else is specific to the file implementation.
+	ret, err := file.Ioctl(t, t.MemoryManager(), args)
+	return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go
new file mode 100644
index 000000000..bf19028c4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/lock.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Flock implements linux syscall flock(2). args[0] is the file descriptor and
+// args[1] is the lock operation, optionally OR'ed with LOCK_NB.
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, operation := args[0].Int(), args[1].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// flock(2): EBADF fd is not an open file descriptor.
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// A nil blocker is passed when LOCK_NB is set; otherwise the task itself
+	// is handed to the lock implementation as the blocker.
+	var blocker lock.Blocker
+	if operation&linux.LOCK_NB == 0 {
+		blocker = t
+	}
+
+	var err error
+	switch operation &^ linux.LOCK_NB {
+	case linux.LOCK_EX:
+		err = file.LockBSD(t, lock.WriteLock, blocker)
+	case linux.LOCK_SH:
+		err = file.LockBSD(t, lock.ReadLock, blocker)
+	case linux.LOCK_UN:
+		err = file.UnlockBSD(t)
+	default:
+		// flock(2): EINVAL operation is invalid.
+		return 0, nil, syserror.EINVAL
+	}
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
new file mode 100644
index 000000000..bbe248d17
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Constants used by MemfdCreate:
+//
+//   - memfdPrefix is prepended to the user-supplied name before the file is
+//     created.
+//   - memfdMaxNameLen is the maximum number of bytes copied in for the name,
+//     chosen so that prefix plus name is at most linux.NAME_MAX.
+//   - memfdAllFlags is the set of flag bits accepted by MemfdCreate; any other
+//     bit is rejected with EINVAL.
+const (
+	memfdPrefix     = "memfd:"
+	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
+	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+//
+// args[0] is the user address of the name and args[1] holds the flags. Flags
+// outside memfdAllFlags are rejected with EINVAL. On success the new file
+// descriptor number is returned.
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+
+	if flags&^memfdAllFlags != 0 {
+		// Unknown bits in flags.
+		return 0, nil, syserror.EINVAL
+	}
+
+	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+	cloExec := flags&linux.MFD_CLOEXEC != 0
+
+	name, err := t.CopyInString(addr, memfdMaxNameLen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	shmMount := t.Kernel().ShmMount()
+	file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name)
+	if err != nil {
+		return 0, nil, err
+	}
+	// Drop the reference obtained from NewMemfd once we are done with it.
+	// The FD table is expected to hold its own reference to an installed
+	// file (the other syscalls in this package release theirs the same way),
+	// so without this the file would be leaked, including when installing
+	// the descriptor fails.
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: cloExec,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
new file mode 100644
index 000000000..60a43f0a0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mmap implements Linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()
+	flags := args[3].Int()
+	fd := args[4].Int()
+	fixed := flags&linux.MAP_FIXED != 0
+	private := flags&linux.MAP_PRIVATE != 0
+	shared := flags&linux.MAP_SHARED != 0
+	anon := flags&linux.MAP_ANONYMOUS != 0
+	map32bit := flags&linux.MAP_32BIT != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if private == shared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := memmap.MMapOpts{
+		Length:   args[1].Uint64(),
+		Offset:   args[5].Uint64(),
+		Addr:     args[0].Pointer(),
+		Fixed:    fixed,
+		Unmap:    fixed,
+		Map32Bit: map32bit,
+		Private:  private,
+		Perms: usermem.AccessType{
+			Read:    linux.PROT_READ&prot != 0,
+			Write:   linux.PROT_WRITE&prot != 0,
+			Execute: linux.PROT_EXEC&prot != 0,
+		},
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
+	defer func() {
+		if opts.MappingIdentity != nil {
+			opts.MappingIdentity.DecRef()
+		}
+	}()
+
+	if !anon {
+		// Convert the passed FD to a file reference.
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// mmap unconditionally requires that the FD is readable.
+		if !file.IsReadable() {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if shared && !file.IsWritable() {
+			opts.MaxPerms.Write = false
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	rv, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(rv), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..adeaa39cc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,145 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2). The arguments are, in order: the
+// user addresses of the source, target and filesystem type strings, the mount
+// flags, and the user address of the (optional) data string.
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		sourceAddr = args[0].Pointer()
+		targetAddr = args[1].Pointer()
+		typeAddr   = args[2].Pointer()
+		flags      = args[3].Uint64()
+		dataAddr   = args[4].Pointer()
+	)
+
+	// Linux copies in at most one page for each null-terminated string passed
+	// to mount(2). See fs/namespace.c:copy_mount_string().
+	fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	targetPath, err := copyInPath(t, targetAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Linux always copies in a full page of data regardless of where the null
+	// character is and hands the address to the file system. Most file
+	// systems treat the data as a string, as do all the ones implemented
+	// here, so it is copied in as one.
+	var data string
+	if dataAddr != 0 {
+		if data, err = t.CopyInString(dataAddr, usermem.PageSize); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Strip the magic value that was mandatory before Linux 2.4.
+	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+		flags &^= linux.MS_MGC_MSK
+	}
+
+	// The caller needs CAP_SYS_ADMIN in the user namespace associated with
+	// the current mount namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	// Linux accepts arbitrary flags in mount(2) and never fails on unknown or
+	// unsupported ones. Not everything is implemented here, so unimplemented
+	// operations and flags are rejected explicitly. MS_NOSUID is deliberately
+	// absent from the list: set-id bits are not implemented anyway, so it is
+	// silently allowed.
+	const unsupported = linux.MS_REMOUNT | linux.MS_BIND |
+		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+		linux.MS_UNBINDABLE | linux.MS_MOVE |
+		linux.MS_NODEV | linux.MS_NODIRATIME | linux.MS_STRICTATIME
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var opts vfs.MountOptions
+	opts.Flags.NoATime = flags&linux.MS_NOATIME == linux.MS_NOATIME
+	opts.Flags.NoExec = flags&linux.MS_NOEXEC == linux.MS_NOEXEC
+	opts.ReadOnly = flags&linux.MS_RDONLY == linux.MS_RDONLY
+	opts.GetFilesystemOptions.Data = data
+
+	target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer target.Release()
+
+	err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+	return 0, nil, err
+}
+
+// Umount2 implements Linux syscall umount2(2). args[0] is the user address of
+// the mount point path and args[1] holds the unmount flags.
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		pathAddr = args[0].Pointer()
+		flags    = args[1].Int()
+	)
+
+	// The caller needs CAP_SYS_ADMIN in the user namespace associated with
+	// the mount namespace.
+	//
+	// Currently, this is always the init task's user namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	// MNT_FORCE and MNT_EXPIRE are rejected rather than ignored.
+	const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	mountPath, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	target, err := getTaskPathOperation(t, linux.AT_FDCWD, mountPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer target.Release()
+
+	umountOpts := vfs.UmountOptions{Flags: uint32(flags)}
+	err = t.Kernel().VFS().UmountAt(t, creds, &target.pop, &umountOpts)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
new file mode 100644
index 000000000..97da6c647
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// copyInPath copies a string of at most linux.PATH_MAX bytes from addr in t's
+// address space and returns it parsed as an fspath.Path. On a copy failure
+// the zero Path is returned together with the error.
+func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
+	raw, err := t.CopyInString(addr, linux.PATH_MAX)
+	if err == nil {
+		return fspath.Parse(raw), nil
+	}
+	return fspath.Path{}, err
+}
+
+// taskPathOperation wraps a vfs.PathOperation built by getTaskPathOperation
+// together with the bookkeeping needed to release the references it holds.
+// Release must be called once the operation is no longer needed.
+type taskPathOperation struct {
+	// pop is the path operation handed to VFS methods.
+	pop          vfs.PathOperation
+	// haveStartRef is true if pop.Start carries its own reference (separate
+	// from the one on pop.Root) that Release has to drop.
+	haveStartRef bool
+}
+
+// getTaskPathOperation builds the path operation for resolving path on behalf
+// of t. Absolute paths start at the task's root directory. Relative paths
+// start at the task's working directory when dirfd is AT_FDCWD, and otherwise
+// at the file referred to by dirfd (EBADF if it is not open). An empty
+// relative path yields ENOENT unless shouldAllowEmptyPath is set.
+//
+// On success the caller must call Release on the result.
+func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
+	root := t.FSContext().RootDirectoryVFS2()
+	tpop := taskPathOperation{
+		pop: vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               path,
+			FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+		},
+	}
+	if path.Absolute {
+		// Start aliases Root, so there is no separate start reference.
+		return tpop, nil
+	}
+
+	if !(path.HasComponents() || bool(shouldAllowEmptyPath)) {
+		root.DecRef()
+		return taskPathOperation{}, syserror.ENOENT
+	}
+
+	switch dirfd {
+	case linux.AT_FDCWD:
+		tpop.pop.Start = t.FSContext().WorkingDirectoryVFS2()
+	default:
+		dirfile := t.GetFileVFS2(dirfd)
+		if dirfile == nil {
+			root.DecRef()
+			return taskPathOperation{}, syserror.EBADF
+		}
+		// Take a reference on the directory itself before dropping the one
+		// held on the file.
+		tpop.pop.Start = dirfile.VirtualDentry()
+		tpop.pop.Start.IncRef()
+		dirfile.DecRef()
+	}
+	tpop.haveStartRef = true
+	return tpop, nil
+}
+
+// Release drops the reference held on the operation's root and, if one was
+// taken, the separate reference held on its start. The start reference is
+// dropped at most once.
+func (tpop *taskPathOperation) Release() {
+	tpop.pop.Root.DecRef()
+	if !tpop.haveStartRef {
+		return
+	}
+	tpop.pop.Start.DecRef()
+	tpop.haveStartRef = false
+}
+
+// shouldAllowEmptyPath is a named boolean passed to getTaskPathOperation that
+// states whether an empty relative path is acceptable; when it is not, such a
+// path fails with ENOENT.
+type shouldAllowEmptyPath bool
+
+// Values of shouldAllowEmptyPath, named for readability at call sites.
+const (
+	disallowEmptyPath shouldAllowEmptyPath = false
+	allowEmptyPath    shouldAllowEmptyPath = true
+)
+
+// shouldFollowFinalSymlink is a named boolean passed to getTaskPathOperation;
+// its value is copied into vfs.PathOperation.FollowFinalSymlink.
+type shouldFollowFinalSymlink bool
+
+// Values of shouldFollowFinalSymlink, named for readability at call sites.
+const (
+	nofollowFinalSymlink shouldFollowFinalSymlink = false
+	followFinalSymlink   shouldFollowFinalSymlink = true
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
new file mode 100644
index 000000000..4a01e4209
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Pipe implements Linux syscall pipe(2); it is pipe2 with no flags set.
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := pipe2(t, args[0].Pointer(), 0)
+	return 0, nil, err
+}
+
+// Pipe2 implements Linux syscall pipe2(2).
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+	return 0, nil, pipe2(t, addr, flags)
+}
+
+func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
+	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+		return syserror.EINVAL
+	}
+	r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
+	defer r.DecRef()
+	defer w.DecRef()
+
+	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return err
+	}
+	if _, err := t.CopyOut(addr, fds); err != nil {
+		for _, fd := range fds {
+			if _, file := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return err
+	}
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
new file mode 100644
index 000000000..ff1b25d7b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -0,0 +1,586 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. This has no
+// equivalent in Linux; it exists in gVisor since allocation failure in Go is
+// unrecoverable.
+//
+// It caps the file-count limit checked in copyInPollFDs and is the upper bound
+// for nfds in doSelect.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+//
+// doSelect uses them twice: to build the poll event mask for each requested
+// fd, and afterwards to decide which bits stay set in each returned fd set.
+const (
+	// selectReadEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLIN_SET.
+	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+	// selectWriteEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLOUT_SET.
+	selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+	// selectExceptEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLEX_SET.
+	selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file description and waiter of a PollFD.
+//
+// A pollState whose file is nil holds nothing and is skipped by releaseState.
+type pollState struct {
+	// file is the polled file description; initReadiness stores it (and
+	// keeps the reference) only when it registers for notifications.
+	file   *vfs.FileDescription
+	// waiter is the entry registered with file for event notifications.
+	waiter waiter.Entry
+}
+
+// initReadiness stores in pfd.REvents the current readiness of the file that
+// pfd.FD refers to, restricted to the events requested in pfd.Events. Negative
+// FDs report no events and FDs that are not open report POLLNVAL.
+//
+// When ch is non-nil, the waiter entry in "state" is additionally registered
+// with the file for event notifications on ch, and "state" keeps the file
+// reference. When ch is nil the reference is dropped before returning.
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+	if pfd.FD < 0 {
+		pfd.REvents = 0
+		return
+	}
+
+	file := t.GetFileVFS2(pfd.FD)
+	if file == nil {
+		pfd.REvents = linux.POLLNVAL
+		return
+	}
+
+	mask := waiter.EventMaskFromLinux(uint32(pfd.Events))
+	if ch != nil {
+		state.file = file
+		state.waiter, _ = waiter.NewChannelEntry(ch)
+		file.EventRegister(&state.waiter, mask)
+	} else {
+		defer file.DecRef()
+	}
+
+	ready := file.Readiness(mask)
+	pfd.REvents = int16(ready.ToLinux()) & pfd.Events
+}
+
+// releaseState unregisters the waiter of, and drops the file reference held
+// by, every entry of "state" that has a file.
+func releaseState(state []pollState) {
+	for i := range state {
+		s := &state[i]
+		if s.file == nil {
+			continue
+		}
+		s.file.EventUnregister(&s.waiter)
+		s.file.DecRef()
+	}
+}
+
+// pollBlock polls the PollFDs in "pfd", filling in their REvents. A zero
+// "timeout" never blocks, a positive one bounds the wait, and a negative one
+// waits without a time limit.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and
+// 0 or positive if interrupted by a signal. The second result is the number of
+// entries of "pfd" that have events set.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+	var notifyCh chan struct{}
+	if timeout != 0 {
+		notifyCh = make(chan struct{}, 1)
+	}
+
+	// Register for event notification in the files involved if we may
+	// block (timeout not zero). As soon as one file reports a non-zero
+	// result, registration stops (the channel is cleared), but the remaining
+	// files are still visited to collect their ready masks.
+	states := make([]pollState, len(pfd))
+	defer releaseState(states)
+	var ready uintptr
+	for i := range pfd {
+		initReadiness(t, &pfd[i], &states[i], notifyCh)
+		if pfd[i].REvents == 0 {
+			continue
+		}
+		ready++
+		notifyCh = nil
+	}
+
+	if timeout == 0 {
+		return timeout, ready, nil
+	}
+
+	haveTimeout := timeout >= 0
+
+	for ready == 0 {
+		// Wait for a notification.
+		remaining, err := t.BlockWithTimeout(notifyCh, haveTimeout, timeout)
+		timeout = remaining
+		if err == syserror.ETIMEDOUT {
+			// Running out of time is not an error for the caller.
+			return timeout, 0, nil
+		}
+		if err != nil {
+			return timeout, 0, err
+		}
+
+		// A notification arrived; count the files that are ready. If there
+		// are none it was spurious and we go back to sleep with whatever
+		// timeout is left.
+		for i := range states {
+			file := states[i].file
+			if file == nil {
+				continue
+			}
+
+			mask := waiter.EventMaskFromLinux(uint32(pfd[i].Events))
+			if revents := int16(file.Readiness(mask).ToLinux()) & pfd[i].Events; revents != 0 {
+				pfd[i].REvents = revents
+				ready++
+			}
+		}
+	}
+
+	return timeout, ready, nil
+}
+
+// copyInPollFDs copies in an array of nfds struct pollfd from addr. It fails
+// with EINVAL when nfds is larger than the task's file-count limit (itself
+// capped at fileCap).
+func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+	maxFDs := t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap)
+	if uint64(nfds) > maxFDs {
+		return nil, syserror.EINVAL
+	}
+
+	pfd := make([]linux.PollFD, nfds)
+	if nfds == 0 {
+		// Nothing to copy; addr is not touched.
+		return pfd, nil
+	}
+	if _, err := t.CopyIn(addr, &pfd); err != nil {
+		return nil, err
+	}
+	return pfd, nil
+}
+
+// doPoll copies in nfds pollfd entries from addr, polls them for up to
+// timeout, and copies the entries back out when polling did not fail. It
+// returns the remaining timeout and the number of ready entries. An
+// interrupted wait is reported as EINTR.
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	pfd, err := copyInPollFDs(t, addr, nfds)
+	if err != nil {
+		return timeout, 0, err
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+	}
+
+	remaining, ready, pollErr := pollBlock(t, pfd, timeout)
+	if pollErr = syserror.ConvertIntr(pollErr, syserror.EINTR); pollErr != nil {
+		return remaining, ready, pollErr
+	}
+	if nfds == 0 {
+		return remaining, ready, nil
+	}
+
+	// As in Linux, the entries are copied out whether or not any of them has
+	// events set.
+	if _, err := t.CopyOut(addr, pfd); err != nil {
+		return remaining, 0, err
+	}
+	return remaining, ready, nil
+}
+
+// CopyInFDSet copies an fd set of nBytes bytes from select(2)/pselect(2). A
+// zero addr yields an all-zero set. nBitsInLastPartialByte is the number of
+// meaningful bits in the final byte, or 0 if the whole byte is meaningful.
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+	if addr == 0 {
+		return set, nil
+	}
+
+	if _, err := t.CopyIn(addr, &set); err != nil {
+		return nil, err
+	}
+	// Clear the bits of the last byte that lie beyond the requested count.
+	//
+	// N.B. This only works on little-endian architectures.
+	if nBitsInLastPartialByte != 0 {
+		extraneous := byte(0xff) << nBitsInLastPartialByte
+		set[nBytes-1] &^= extraneous
+	}
+	return set, nil
+}
+
+// doSelect implements the common part of the select(2) family on top of
+// pollBlock.
+//
+// It copies in up to three fd sets of nfds bits each (a zero address stands
+// for an empty set), turns every requested fd into a PollFD, waits for up to
+// timeout, and then rewrites the sets so that only bits whose events occurred
+// remain set before copying them back out. The result is the total number of
+// bits left set across the three sets.
+//
+// It fails with EINVAL if nfds is negative or greater than fileCap, and with
+// EBADF if any requested fd is not open. An interrupted wait is reported as
+// EINTR.
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || nfds > fileCap {
+		return 0, syserror.EINVAL
+	}
+
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+
+	// Count how many FDs are actually being requested so that we can build
+	// a PollFD array.
+	fdCount := 0
+	for i := 0; i < nBytes; i++ {
+		v := r[i] | w[i] | e[i]
+		// Each iteration clears the lowest set bit, so the inner loop runs
+		// once per requested fd in this byte.
+		for v != 0 {
+			v &= (v - 1)
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]linux.PollFD, 0, fdCount)
+	var fd int32
+	for i := 0; i < nBytes; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		v := rV | wV | eV
+		// m selects the bit for the current fd within byte i.
+		m := byte(1)
+		for j := 0; j < 8; j++ {
+			if (v & m) != 0 {
+				// Make sure the fd is valid and decrement the reference
+				// immediately to ensure we don't leak. Note, another thread
+				// might be about to close fd. This is racy, but that's
+				// OK. Linux is racy in the same way.
+				file := t.GetFileVFS2(fd)
+				if file == nil {
+					return 0, syserror.EBADF
+				}
+				file.DecRef()
+
+				var mask int16
+				if (rV & m) != 0 {
+					mask |= selectReadEvents
+				}
+
+				if (wV & m) != 0 {
+					mask |= selectWriteEvents
+				}
+
+				if (eV & m) != 0 {
+					mask |= selectExceptEvents
+				}
+
+				pfd = append(pfd, linux.PollFD{
+					FD:     fd,
+					Events: mask,
+				})
+			}
+
+			fd++
+			m <<= 1
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+		return 0, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	for idx := range pfd {
+		events := pfd[idx].REvents
+		// Locate the byte (i) and bit (m) of this fd within the sets.
+		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+		m := byte(1) << j
+		if r[i]&m != 0 {
+			if (events & selectReadEvents) != 0 {
+				bitSetCount++
+			} else {
+				r[i] &^= m
+			}
+		}
+		if w[i]&m != 0 {
+			if (events & selectWriteEvents) != 0 {
+				bitSetCount++
+			} else {
+				w[i] &^= m
+			}
+		}
+		if e[i]&m != 0 {
+			if (events & selectExceptEvents) != 0 {
+				bitSetCount++
+			} else {
+				e[i] &^= m
+			}
+		}
+	}
+
+	// Copy updated vectors back.
+	if readFDs != 0 {
+		if _, err := t.CopyOut(readFDs, r); err != nil {
+			return 0, err
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyOut(writeFDs, w); err != nil {
+			return 0, err
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining returns how much of timeout is left given that the wait
+// began at startNs, or 0 once the timeout has fully elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	elapsed := t.Kernel().MonotonicClock().Now().Sub(startNs)
+	if left := timeout - elapsed; left > 0 {
+		return left
+	}
+	return 0
+}
+
+// copyOutTimespecRemaining writes the unexpired part of timeout to
+// timespecAddr as a timespec. Nothing is written for a zero or negative
+// timeout.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	left := timeoutRemaining(t, startNs, timeout)
+	ts := linux.NsecToTimespec(left.Nanoseconds())
+	_, err := ts.CopyOut(t, timespecAddr)
+	return err
+}
+
+// copyOutTimevalRemaining writes the unexpired part of timeout to timevalAddr
+// as a timeval. Nothing is written for a zero or negative timeout.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	left := timeoutRemaining(t, startNs, timeout)
+	tv := linux.NsecToTimeval(left.Nanoseconds())
+	_, err := tv.CopyOut(t, timevalAddr)
+	return err
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// It is installed by poll when the wait is interrupted and replays the call
+// with the timeout that was left at that point.
+//
+// +stateify savable
+type pollRestartBlock struct {
+	// pfdAddr is the user address of the pollfd array.
+	pfdAddr usermem.Addr
+	// nfds is the number of entries in the pollfd array.
+	nfds    uint
+	// timeout is the timeout remaining when the call was interrupted.
+	timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart by re-running poll
+// with the saved arguments.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	n, err := poll(t, p.pfdAddr, p.nfds, p.timeout)
+	return n, err
+}
+
+// poll runs doPoll and, if the wait was interrupted, arranges for poll(2) to
+// be restarted with the remaining timeout by installing a pollRestartBlock and
+// returning ERESTART_RESTARTBLOCK.
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	if err != syserror.EINTR {
+		return n, err
+	}
+
+	restart := &pollRestartBlock{
+		pfdAddr: pfdAddr,
+		nfds:    nfds,
+		timeout: remainingTimeout,
+	}
+	t.SetSyscallRestartBlock(restart)
+	return 0, kernel.ERESTART_RESTARTBLOCK
+}
+
+// Poll implements linux syscall poll(2); the timeout argument is in milliseconds.
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fdsAddr := args[0].Pointer()
+	count := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	msec := args[2].Int()
+	ready, err := poll(t, fdsAddr, count, time.Duration(msec)*time.Millisecond)
+	return ready, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fdsAddr := args[0].Pointer()
+	count := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	tsAddr := args[2].Pointer()
+	sigmaskAddr := args[3].Pointer()
+	sigmaskSize := uint(args[4].Uint())
+
+	timeout, err := copyTimespecInToDuration(t, tsAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// The start time is only needed when there is a positive timeout whose
+	// remainder must be reported back; otherwise it stays the zero Time.
+	var start ktime.Time
+	if timeout > 0 {
+		start = t.Kernel().MonotonicClock().Now()
+	}
+
+	if err := setTempSignalSet(t, sigmaskAddr, sigmaskSize); err != nil {
+		return 0, nil, err
+	}
+
+	_, ready, err := doPoll(t, fdsAddr, count, timeout)
+	copyErr := copyOutTimespecRemaining(t, start, timeout, tsAddr)
+	// doPoll reports EINTR when interrupted, whereas ppoll is normally
+	// restartable unless the interrupting signal is handled by the
+	// application (i.e. it returns ERESTARTNOHAND). If writing back the
+	// remaining time failed, a restarted ppoll would use the wrong timeout,
+	// so EINTR is kept in that case. A copyErr alongside a nil err is
+	// ignored, which is consistent with Linux.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return ready, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	rAddr := args[1].Pointer()
+	wAddr := args[2].Pointer()
+	eAddr := args[3].Pointer()
+	tvAddr := args[4].Pointer()
+
+	// A negative Duration means "no timeout"; it is kept for a NULL timeval.
+	timeout := time.Duration(-1)
+	if tvAddr != 0 {
+		var tv linux.Timeval
+		if _, err := tv.CopyIn(t, tvAddr); err != nil {
+			return 0, nil, err
+		}
+		if tv.Sec < 0 || tv.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(tv.ToNsecCapped())
+	}
+	start := t.Kernel().MonotonicClock().Now()
+	ready, err := doSelect(t, nfds, rAddr, wAddr, eAddr, timeout)
+	copyErr := copyOutTimevalRemaining(t, start, timeout, tvAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return ready, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+	maskWithSizeAddr := args[5].Pointer() // address of a sigSetWithSize, not of the signal set itself
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time // stays the zero Time unless a positive timeout must be reported back
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		if t.Arch().Width() != 8 { // sigSetWithSize is declared with two uint64 fields
+			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
+		}
+		var maskStruct sigSetWithSize
+		if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+			return 0, nil, err
+		}
+		if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// See comment in Ppoll: EINTR becomes ERESTARTNOHAND only if the remaining time was copied out.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// +marshal
+type sigSetWithSize struct {
+	sigsetAddr   uint64 // user address of the signal set; Pselect passes it to setTempSignalSet
+	sizeofSigset uint64 // size given for that signal set; setTempSignalSet compares it to linux.SignalSetSize
+}
+
+// copyTimespecInToDuration reads a Timespec from the untrusted app range at
+// timespecAddr, validates it and returns it as a Duration.
+//
+// A Timespec too large for a Duration yields the maximum Duration.
+//
+// A NULL timespecAddr yields a negative Duration, which callers treat as
+// "no timeout".
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	if timespecAddr == 0 {
+		// Use a negative Duration to indicate "no timeout".
+		return time.Duration(-1), nil
+	}
+	var ts linux.Timespec
+	if _, err := ts.CopyIn(t, timespecAddr); err != nil {
+		return 0, err
+	}
+	if !ts.Valid() {
+		return 0, syserror.EINVAL
+	}
+	// Convert the validated Timespec to a Duration.
+	return time.Duration(ts.ToNsecCapped()), nil
+}
+
+func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
+	if maskAddr == 0 { // NULL mask: the task's signal mask is left untouched
+		return nil
+	}
+	if maskSize != linux.SignalSetSize { // only one signal set size is accepted
+		return syserror.EINVAL
+	}
+	var mask linux.SignalSet
+	if _, err := mask.CopyIn(t, maskAddr); err != nil {
+		return err
+	}
+	mask &^= kernel.UnblockableSignals // clear the unblockable signals from the requested mask
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask) // record the previous mask; presumably restored when the syscall completes — confirm
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
new file mode 100644
index 000000000..cd25597a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -0,0 +1,641 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	eventMaskRead  = waiter.EventIn | waiter.EventHUp | waiter.EventErr  // events a blocked read/pread waits for
+	eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr // events a blocked write/pwrite waits for
+)
+
+// Read implements Linux syscall read(2).
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+	count := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Reject sizes that do not fit in a non-negative int.
+	length := int(count)
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user buffer that receives the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	dst, err := t.SingleIOSequence(bufAddr, length, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+	return uintptr(n), nil, err
+}
+
+// Readv implements Linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Describe the user iovecs that receive the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	dst, err := t.IovecsIOSequence(iovAddr, iovCount, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+	return uintptr(n), nil, err
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.Read(t, dst, opts) // first attempt; anything but "would block" is returned as-is
+	if err != syserror.ErrWouldBlock {
+		if n > 0 { // IN_ACCESS is only generated when some bytes were read
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock { // blocking is not permitted: hand the would-block result back
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for read-side notifications before retrying.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n // bytes read across all attempts
+	for {
+		// Shorten dst by the bytes the previous attempt read.
+		dst = dst.DropFirst(int(n))
+
+		// Retry the read; leave the loop on any result other than
+		// "would block".
+		n, err = file.Read(t, dst, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification (or the deadline) before retrying.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT { // a deadline expiry is reported as "would block"
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+// Pread64 implements Linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+	count := args[2].SizeT()
+	off := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Reject a negative offset, or one whose end position overflows int64.
+	if off < 0 || off+int64(count) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reject sizes that do not fit in a non-negative int.
+	length := int(count)
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user buffer that receives the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	dst, err := t.SingleIOSequence(bufAddr, length, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, off, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+	return uintptr(n), nil, err
+}
+
+// Preadv implements Linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+	off := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// A positional read needs a non-negative offset.
+	if off < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user iovecs that receive the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	dst, err := t.IovecsIOSequence(iovAddr, iovCount, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, off, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+	return uintptr(n), nil, err
+}
+
+// Preadv2 implements Linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// glibc exposes
+	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// but the raw syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
+	// takes the offset as a high/low pair for compatibility with 32-bit
+	// architectures, which makes flags the 6th argument (index 5).
+	fd := args[0].Int()
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+	off := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate; -1 is accepted and handled below.
+	if off < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user iovecs that receive the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	dst, err := t.IovecsIOSequence(iovAddr, iovCount, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.ReadOptions{Flags: uint32(flags)}
+	var n int64
+	// An offset of -1 selects the non-positional read path.
+	switch off {
+	case -1:
+		n, err = read(t, file, dst, opts)
+	default:
+		n, err = pread(t, file, dst, off, opts)
+	}
+	t.IOUsage().AccountReadSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+	return uintptr(n), nil, err
+}
+
+func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.PRead(t, dst, offset, opts) // first attempt; anything but "would block" is returned as-is
+	if err != syserror.ErrWouldBlock {
+		if n > 0 { // IN_ACCESS is only generated when some bytes were read
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock { // blocking is not permitted: hand the would-block result back
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for read-side notifications before retrying.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n // bytes read across all attempts
+	for {
+		// Shorten dst by the bytes the previous attempt read.
+		dst = dst.DropFirst(int(n))
+
+		// Retry at the advanced file position; leave the loop on any result
+		// other than "would block".
+		n, err = file.PRead(t, dst, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification (or the deadline) before retrying.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT { // a deadline expiry is reported as "would block"
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+// Write implements Linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+	count := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Reject sizes that do not fit in a non-negative int.
+	length := int(count)
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user buffer that supplies the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.SingleIOSequence(bufAddr, length, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+	return uintptr(n), nil, err
+}
+
+// Writev implements Linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Describe the user iovecs that supply the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.IovecsIOSequence(iovAddr, iovCount, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+	return uintptr(n), nil, err
+}
+
+func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.Write(t, src, opts) // first attempt; anything but "would block" is returned as-is
+	if err != syserror.ErrWouldBlock {
+		if n > 0 { // IN_MODIFY is only generated when some bytes were written
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock { // blocking is not permitted: hand the would-block result back
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for write-side notifications before retrying.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n // bytes written across all attempts
+	for {
+		// Shorten src by the bytes the previous attempt wrote.
+		src = src.DropFirst(int(n))
+
+		// Retry the write; leave the loop on any result other than
+		// "would block".
+		n, err = file.Write(t, src, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification (or the deadline) before retrying.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT { // a deadline expiry is reported as "would block"
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+// Pwrite64 implements Linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+	count := args[2].SizeT()
+	off := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Reject a negative offset, or one whose end position overflows int64.
+	if off < 0 || off+int64(count) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reject sizes that do not fit in a non-negative int.
+	length := int(count)
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user buffer that supplies the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.SingleIOSequence(bufAddr, length, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, off, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+	return uintptr(n), nil, err
+}
+
+// Pwritev implements Linux syscall pwritev(2): a vectored write at an explicit, non-negative offset.
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n) // this is a write: charge it to the write counters, like the other write syscalls
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements Linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// glibc exposes
+	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// but the raw syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
+	// takes the offset as a high/low pair for compatibility with 32-bit
+	// architectures, which makes flags the 6th argument (index 5).
+	fd := args[0].Int()
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+	off := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate; -1 is accepted and handled below.
+	if off < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user iovecs that supply the data.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.IovecsIOSequence(iovAddr, iovCount, ioOpts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.WriteOptions{Flags: uint32(flags)}
+	var n int64
+	// An offset of -1 selects the non-positional write path.
+	switch off {
+	case -1:
+		n, err = write(t, file, src, opts)
+	default:
+		n, err = pwrite(t, file, src, off, opts)
+	}
+	t.IOUsage().AccountWriteSyscall(n)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+	return uintptr(n), nil, err
+}
+
+func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.PWrite(t, src, offset, opts) // first attempt; anything but "would block" is returned as-is
+	if err != syserror.ErrWouldBlock {
+		if n > 0 { // IN_MODIFY is only generated when some bytes were written
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock { // blocking is not permitted: hand the would-block result back
+		if n > 0 { // a write modifies the file, so the event is IN_MODIFY, not IN_ACCESS
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for write-side notifications before retrying.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n // bytes written across all attempts
+	for {
+		// Shorten src by the bytes the previous attempt wrote.
+		src = src.DropFirst(int(n))
+
+		// Retry at the advanced file position; leave the loop on any result
+		// other than "would block".
+		n, err = file.PWrite(t, src, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification (or the deadline) before retrying.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT { // a deadline expiry is reported as "would block"
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 { // same event as the fast path above: the file was modified
+		file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) {
+	if file.StatusFlags()&linux.O_NONBLOCK != 0 { // O_NONBLOCK files never block
+		return false, ktime.Time{}, false
+	}
+	// Sockets support read/write timeouts. NOTE(review): only RecvTimeout is consulted, although write/pwrite also call blockPolicy — confirm whether a send timeout should apply there.
+	if s, ok := file.Impl().(socket.SocketVFS2); ok {
+		dl := s.RecvTimeout()
+		if dl < 0 { // a negative timeout disallows blocking
+			return false, ktime.Time{}, false
+		}
+		if dl > 0 { // a positive timeout is treated as nanoseconds from now on the monotonic clock
+			return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true
+		}
+	}
+	return true, ktime.Time{}, false // block with no deadline
+}
+
+// Lseek implements Linux syscall lseek(2) by delegating to the file's Seek.
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	off := args[1].Int64()
+	how := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	pos, err := file.Seek(t, off, how)
+	return uintptr(pos), nil, err
+}
+
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	off := args[1].Int64()
+	count := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// A file that cannot be read is reported as a bad descriptor.
+	if readable := file.IsReadable(); !readable {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The size must fit in a non-negative int.
+	if int(count) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// The offset must be non-negative and offset+size must not overflow.
+	if off < 0 || off+int64(count) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Return EINVAL; if the underlying file type does not support readahead,
+	// then Linux will return EINVAL to indicate as much. In the future, we
+	// may extend this function to actually support readahead hints.
+	return 0, nil, syserror.EINVAL
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
new file mode 100644
index 000000000..09ecfed26
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX // mode bits kept by fchmodat/Fchmod; all other bits of the requested mode are masked off
+
+// Chmod implements Linux syscall chmod(2) as fchmodat relative to AT_FDCWD.
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	path, mode := args[0].Pointer(), args[1].ModeT()
+	err := fchmodat(t, linux.AT_FDCWD, path, mode)
+	return 0, nil, err
+}
+
+// Fchmodat implements Linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	path, mode := args[1].Pointer(), args[2].ModeT()
+	err := fchmodat(t, dirfd, path, mode)
+	return 0, nil, err
+}
+
+func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	// Request a mode change only; bits outside chmodMask are dropped from
+	// the caller-supplied mode.
+	var opts vfs.SetStatOptions
+	opts.Stat.Mask = linux.STATX_MODE
+	opts.Stat.Mode = uint16(mode & chmodMask)
+	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Fchmod implements Linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].ModeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Request a mode change only; bits outside chmodMask are dropped.
+	var opts vfs.SetStatOptions
+	opts.Stat.Mask = linux.STATX_MODE
+	opts.Stat.Mode = uint16(mode & chmodMask)
+	err := file.SetStat(t, opts)
+	return 0, nil, err
+}
+
+// Chown implements Linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	path := args[0].Pointer()
+	uid, gid := args[1].Int(), args[2].Int()
+	err := fchownat(t, linux.AT_FDCWD, path, uid, gid, 0 /* flags */)
+	return 0, nil, err
+}
+
+// Lchown implements Linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	path := args[0].Pointer()
+	uid, gid := args[1].Int(), args[2].Int()
+	err := fchownat(t, linux.AT_FDCWD, path, uid, gid, linux.AT_SYMLINK_NOFOLLOW)
+	return 0, nil, err
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	path := args[1].Pointer()
+	uid, gid := args[2].Int(), args[3].Int()
+	flags := args[4].Int()
+	err := fchownat(t, dirfd, path, uid, gid, flags)
+	return 0, nil, err
+}
+
+func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return err
+	}
+	emptyOK := shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0)
+	follow := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)
+	return setstatat(t, dirfd, path, emptyOK, follow, &opts)
+}
+
+func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
+	ns := t.UserNamespace()
+	if owner != -1 { // -1 leaves the owner out of the request
+		uid := ns.MapToKUID(auth.UID(owner))
+		if !uid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.UID = uint32(uid)
+		opts.Stat.Mask |= linux.STATX_UID
+	}
+	if group != -1 { // -1 leaves the group out of the request
+		gid := ns.MapToKGID(auth.GID(group))
+		if !gid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.GID = uint32(gid)
+		opts.Stat.Mask |= linux.STATX_GID
+	}
+	return nil
+}
+
+// Fchown implements Linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	uid, gid := args[1].Int(), args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	err := populateSetStatOptionsForChown(t, uid, gid, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, file.SetStat(t, opts)
+}
+
+// Truncate implements Linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	size := args[1].Int64()
+
+	if size < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Request a size change only, resolving the path relative to the
+	// current working directory and following a final symlink.
+	var opts vfs.SetStatOptions
+	opts.Stat.Mask = linux.STATX_SIZE
+	opts.Stat.Size = uint64(size)
+	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+	return 0, nil, handleSetSizeError(t, err)
+}
+
+// Ftruncate implements Linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	size := args[1].Int64()
+
+	if size < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Request a size change only; the length check above runs before the
+	// descriptor lookup, so a negative length is EINVAL even for a bad fd.
+	var opts vfs.SetStatOptions
+	opts.Stat.Mask = linux.STATX_SIZE
+	opts.Stat.Size = uint64(size)
+	err := file.SetStat(t, opts)
+	return 0, nil, handleSetSizeError(t, err)
+}
+
+// Utime implements Linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Both timestamps are always part of the request.
+	var opts vfs.SetStatOptions
+	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+	if timesAddr != 0 {
+		var ut linux.Utime
+		if _, err := ut.CopyIn(t, timesAddr); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime.Sec = ut.Actime
+		opts.Stat.Mtime.Sec = ut.Modtime
+	} else {
+		// NULL times: mark both timestamps with UTIME_NOW.
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	}
+
+	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+	return 0, nil, err
+}
+
+// Utimes implements Linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, timesAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var opts vfs.SetStatOptions
+	err = populateSetStatOptionsForUtimes(t, timesAddr, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+	return 0, nil, err
+}
+
+// Futimesat implements Linux syscall futimesat(2).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+
+	// "If filename is NULL and dfd refers to an open file, then operate on the
+	// file. Otherwise look up filename, possibly using dfd as a starting
+	// point." - fs/utimes.c
+	var path fspath.Path
+	emptyPath := allowEmptyPath
+	if pathAddr != 0 || dirfd == linux.AT_FDCWD {
+		var err error
+		path, err = copyInPath(t, pathAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		emptyPath = disallowEmptyPath
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, emptyPath, followFinalSymlink, &opts)
+}
+
+// populateSetStatOptionsForUtimes fills opts from the pair of timevals at timesAddr.
+func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		// A null pointer marks both timestamps as UTIME_NOW.
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var tvs [2]linux.Timeval
+	if _, err := t.CopyIn(timesAddr, &tvs); err != nil {
+		return err
+	}
+	for _, tv := range tvs {
+		if tv.Usec < 0 || tv.Usec > 999999 {
+			return syserror.EINVAL
+		}
+	}
+	// tvs[0] is the access time and tvs[1] the modification time; microseconds become nanoseconds.
+	atime, mtime := tvs[0], tvs[1]
+	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+	opts.Stat.Atime = linux.StatxTimestamp{Sec: atime.Sec, Nsec: uint32(atime.Usec * 1000)}
+	opts.Stat.Mtime = linux.StatxTimestamp{Sec: mtime.Sec, Nsec: uint32(mtime.Usec * 1000)}
+	return nil
+}
+
+// Utimensat implements Linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	// Linux requires that the UTIME_OMIT check occur before checking path or
+	// flags, so a call that omits both timestamps succeeds without looking at either.
+	var statOpts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &statOpts); err != nil {
+		return 0, nil, err
+	}
+	if statOpts.Stat.Mask == 0 {
+		return 0, nil, nil
+	}
+
+	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "If filename is NULL and dfd refers to an open file, then operate on the
+	// file. Otherwise look up filename, possibly using dfd as a starting
+	// point." - fs/utimes.c
+	var path fspath.Path
+	emptyPathPolicy := allowEmptyPath
+	if pathAddr != 0 || dirfd == linux.AT_FDCWD {
+		p, err := copyInPath(t, pathAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		// A path is being looked up, so an empty one must be rejected.
+		path, emptyPathPolicy = p, disallowEmptyPath
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, emptyPathPolicy, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &statOpts)
+}
+
+// populateSetStatOptionsForUtimens fills opts from the pair of timespecs at
+// timesAddr (access time first, then modification time).
+func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		// A null pointer marks both timestamps as UTIME_NOW.
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var tss [2]linux.Timespec
+	if _, err := t.CopyIn(timesAddr, &tss); err != nil {
+		return err
+	}
+	// A timestamp whose Nsec is UTIME_OMIT is left out of the mask entirely.
+	if atime := tss[0]; atime.Nsec != linux.UTIME_OMIT {
+		// UTIME_NOW is exempt from the nanosecond range check.
+		if atime.Nsec != linux.UTIME_NOW && (atime.Nsec < 0 || atime.Nsec > 999999999) {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_ATIME
+		opts.Stat.Atime = linux.StatxTimestamp{Sec: atime.Sec, Nsec: uint32(atime.Nsec)}
+	}
+	// The modification time follows the same rules.
+	if mtime := tss[1]; mtime.Nsec != linux.UTIME_OMIT {
+		if mtime.Nsec != linux.UTIME_NOW && (mtime.Nsec < 0 || mtime.Nsec > 999999999) {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_MTIME
+		opts.Stat.Mtime = linux.StatxTimestamp{Sec: mtime.Sec, Nsec: uint32(mtime.Nsec)}
+	}
+	return nil
+}
+
+// setstatat applies opts to the file named by path, starting from dirfd when path is not absolute.
+func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
+	rootVD := t.FSContext().RootDirectoryVFS2()
+	defer rootVD.DecRef()
+	startVD := rootVD
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			startVD = t.FSContext().WorkingDirectoryVFS2()
+			defer startVD.DecRef()
+		} else {
+			dirFile := t.GetFileVFS2(dirfd)
+			if dirFile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Empty path: go through FileDescription.SetStat() rather than
+				// VirtualFilesystem.SetStatAt(), which may be able to use opened file state.
+				err := dirFile.SetStat(t, *opts)
+				dirFile.DecRef()
+				return err
+			}
+			startVD = dirFile.VirtualDentry()
+			startVD.IncRef()
+			defer startVD.DecRef()
+			dirFile.DecRef()
+		}
+	}
+	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               rootVD,
+		Start:              startVD,
+		Path:               path,
+		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+	}, opts)
+}
+
+// handleSetSizeError converts ErrExceedsFileSizeLimit to EFBIG and sends SIGXFSZ, per setrlimit(2).
+func handleSetSizeError(t *kernel.Task, err error) error {
+	if err != syserror.ErrExceedsFileSizeLimit {
+		return err
+	}
+	t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
+	return syserror.EFBIG
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go
new file mode 100644
index 000000000..623992f6f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/signal.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// sharedSignalfd backs Signalfd and Signalfd4: it updates the mask of an existing signalfd (fd != -1) or creates a new one, returning the descriptor either way.
+func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	// Copy in the signal mask.
+	mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Always check for valid flags, even if not creating.
+	if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is this a change to an existing signalfd?
+	// If so, only its mask is replaced; no new file or descriptor is created,
+	// and, per signalfd(2), the call returns the descriptor that was passed in.
+	if fd != -1 {
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// Is this a signalfd?
+		if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok {
+			sfd.SetMask(mask)
+			return uintptr(fd), nil, nil
+		}
+
+		// Not a signalfd.
+		return 0, nil, syserror.EINVAL
+	}
+
+	fileFlags := uint32(linux.O_RDWR)
+	if flags&linux.SFD_NONBLOCK != 0 {
+		fileFlags |= linux.O_NONBLOCK
+	}
+
+	// Create a new file.
+	vfsObj := t.Kernel().VFS()
+	file, err := signalfd.New(vfsObj, t, mask, fileFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	// Create a new descriptor; SFD_CLOEXEC maps onto the descriptor's CloseOnExec flag.
+	fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Done.
+	return uintptr(fd), nil, nil
+}
+
+// Signalfd implements the linux syscall signalfd(2).
+func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// signalfd(2) takes no flags argument, so zero flags are passed on.
+	fd, sigset := args[0].Int(), args[1].Pointer()
+	sigsetsize := args[2].SizeT()
+	return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
+}
+
+// Signalfd4 implements the linux syscall signalfd4(2).
+func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The first three arguments match signalfd(2); args[3] carries the flags.
+	fd, sigset := args[0].Int(), args[1].Pointer()
+	sigsetsize := args[2].SizeT()
+	flags := args[3].Int()
+	return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
new file mode 100644
index 000000000..10b668477
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -0,0 +1,1139 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers up to INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset from the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset from the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4 // Size in bytes of an int32.
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+	// Name is the optional pointer to a network address buffer.
+	Name uint64
+
+	// NameLen is the length of the buffer pointed to by Name.
+	NameLen uint32
+	_       uint32 // Padding: Iov is read from byte offset 16.
+
+	// Iov is a pointer to an array of io vectors that describe the memory
+	// locations involved in the io operation.
+	Iov uint64
+
+	// IovLen is the length of the array pointed to by Iov.
+	IovLen uint64
+
+	// Control is the optional pointer to ancillary control data.
+	Control uint64
+
+	// ControlLen is the length of the data pointed to by Control.
+	ControlLen uint64
+
+	// Flags on the sent/received message.
+	Flags int32
+	_     int32 // Trailing padding after Flags.
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+	msgHdr MessageHeader64 // The embedded per-message header.
+	msgLen uint32          // RecvMMsg writes each message's received length just past msgHdr, i.e. here.
+	_      int32           // Trailing padding.
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+	raw := t.CopyScratchBuffer(52) // 52 bytes: every field up to and including Flags.
+	if _, err := t.CopyInBytes(addr, raw); err != nil {
+		return err
+	}
+
+	order := usermem.ByteOrder
+	msg.Name = order.Uint64(raw[0:])
+	msg.NameLen = order.Uint32(raw[8:])
+	msg.Iov = order.Uint64(raw[16:])
+	msg.IovLen = order.Uint64(raw[24:])
+	msg.Control = order.Uint64(raw[32:])
+	msg.ControlLen = order.Uint64(raw[40:])
+	msg.Flags = int32(order.Uint32(raw[48:]))
+	return nil
+}
+
+// CaptureAddress copies a socket address of addrlen bytes out of the untrusted
+// address space into a freshly allocated buffer; lengths above maxAddrLen are rejected.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+	if addrlen > maxAddrLen {
+		return nil, syserror.EINVAL
+	}
+
+	// The length is bounded above, so this allocation is at most maxAddrLen bytes.
+	captured := make([]byte, addrlen)
+	if _, err := t.CopyInBytes(addr, captured); err != nil {
+		return nil, err
+	}
+	return captured, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the untrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+	// Read the capacity of the caller's buffer.
+	var userLen uint32
+	if _, err := t.CopyIn(addrLenPtr, &userLen); err != nil {
+		return err
+	}
+
+	if int32(userLen) < 0 {
+		return syserror.EINVAL
+	}
+
+	// The full address length is reported even when the address is truncated or absent.
+	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+		return err
+	}
+
+	if addr == nil {
+		return nil
+	}
+
+	// Clamp the copy to the smallest of the buffer, addrLen and the encoded size.
+	wire := binary.Marshal(nil, usermem.ByteOrder, addr)
+	n := userLen
+	if n > addrLen {
+		n = addrLen
+	}
+	if n > uint32(len(wire)) {
+		n = uint32(len(wire))
+	}
+	_, err := t.CopyOutBytes(addrPtr, wire[:int(n)])
+	return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+
+	// The low four bits of stype select the socket type; only these two flags may accompany them.
+	if stype&^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the socket; the deferred DecRef runs on every return path below.
+	sock, serr := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+	defer sock.DecRef()
+
+	if err := sock.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
+		return 0, nil, err
+	}
+
+	// Install the socket in the FD table, carrying SOCK_CLOEXEC over as CloseOnExec.
+	fdFlags := kernel.FDFlags{CloseOnExec: stype&linux.SOCK_CLOEXEC != 0}
+	fd, err := t.NewFDFromVFS2(0, sock, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+	fdsAddr := args[3].Pointer()
+
+	// The low four bits of stype select the socket type; only these two flags may accompany them.
+	if stype&^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create both ends of the pair.
+	first, second, serr := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+	// Adding to the FD table will cause an extra reference to be acquired.
+	defer first.DecRef()
+	defer second.DecRef()
+
+	pair := []*vfs.FileDescription{first, second}
+	nonblock := uint32(stype & linux.SOCK_NONBLOCK)
+	for _, s := range pair {
+		if err := s.SetStatusFlags(t, t.Credentials(), nonblock); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Allocate descriptors for both sockets at once.
+	fdFlags := kernel.FDFlags{CloseOnExec: stype&linux.SOCK_CLOEXEC != 0}
+	fds, err := t.NewFDsVFS2(0, pair, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Hand the two descriptor numbers back to the caller.
+	if _, err := t.CopyOut(fdsAddr, fds); err != nil {
+		// The caller never learned the numbers, so take the descriptors back out of the table.
+		for _, fd := range fds {
+			if _, vfsFile := t.FDTable().Remove(fd); vfsFile != nil {
+				vfsFile.DecRef()
+			}
+		}
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addrPtr := args[1].Pointer()
+	addrLen := args[2].Uint()
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can be connected.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Copy in the destination address before handing it to the socket.
+	dest, err := CaptureAddress(t, addrPtr, addrLen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	nonblocking := file.StatusFlags()&linux.SOCK_NONBLOCK != 0
+	return 0, nil, syserror.ConvertIntr(sock.Connect(t, dest, !nonblocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the shared implementation behind the accept and accept4 syscall
+// handlers; flags may only contain SOCK_NONBLOCK and SOCK_CLOEXEC.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+	// Refuse any flag outside the supported set.
+	if flags&^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can accept connections.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, syserror.ENOTSOCK
+	}
+
+	// Accept on the socket; the peer address is requested only when the caller
+	// supplied somewhere to store its length.
+	nonblocking := file.StatusFlags()&linux.SOCK_NONBLOCK != 0
+	wantPeer := addrLen != 0
+	newFD, peer, peerLen, serr := sock.Accept(t, wantPeer, flags, !nonblocking)
+	if serr != nil {
+		return 0, syserror.ConvertIntr(serr.ToError(), kernel.ERESTARTSYS)
+	}
+	if !wantPeer {
+		return uintptr(newFD), nil
+	}
+	// NOTE(magi): Linux does not give you an error if it can't
+	// write the data back out so neither do we. Only EINVAL is surfaced.
+	if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+		return 0, err
+	}
+	return uintptr(newFD), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	peerAddr, peerLenAddr := args[1].Pointer(), args[2].Pointer()
+	flags := int(args[3].Int())
+
+	// accept returns no SyscallControl, so nil is filled in here.
+	newFD, err := accept(t, fd, peerAddr, peerLenAddr, flags)
+	return newFD, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	peerAddr, peerLenAddr := args[1].Pointer(), args[2].Pointer()
+
+	// accept(2) has no flags argument, so zero flags are passed.
+	newFD, err := accept(t, fd, peerAddr, peerLenAddr, 0)
+	return newFD, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addrPtr := args[1].Pointer()
+	addrLen := args[2].Uint()
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can be bound.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Copy in the local address before handing it to the socket.
+	local, err := CaptureAddress(t, addrPtr, addrLen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, sock.Bind(t, local).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	backlog := args[1].Int()
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can listen.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Per Linux, the backlog is silently capped to reasonable values.
+	switch {
+	case backlog <= 0:
+		backlog = minListenBacklog
+	case backlog > maxListenBacklog:
+		backlog = maxListenBacklog
+	}
+
+	return 0, nil, sock.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	how := args[1].Int()
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can be shut down.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// how must be one of the three SHUT_* values; this is checked after the
+	// descriptor, so a bad fd is reported in preference to a bad how.
+	validHow := how == linux.SHUT_RD || how == linux.SHUT_WR || how == linux.SHUT_RDWR
+	if !validHow {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, sock.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLenAddr := args[4].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Read the length. Reject negative values.
+	optLen := int32(0)
+	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+		return 0, nil, err
+	}
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Call syscall implementation then copy both value and value len out.
+	v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+
+	vLen := int32(binary.Size(v))
+	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+		return 0, nil, err
+	}
+
+	if v != nil {
+		if _, err := t.CopyOut(optValAddr, v); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return 0, nil, nil
+}
+
+// getSockOpt answers the SOL_SOCKET options SO_TYPE, SO_DOMAIN and SO_PROTOCOL
+// itself and dispatches every other option to the socket implementation.
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+	if level == linux.SOL_SOCKET {
+		switch name {
+		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+			// All three are read from s.Type() and returned as an int32, so the
+			// caller's buffer must be able to hold one.
+			if len < sizeOfInt32 {
+				return nil, syserr.ErrInvalidArgument
+			}
+			family, skType, protocol := s.Type()
+			switch name {
+			case linux.SO_TYPE:
+				return int32(skType), nil
+			case linux.SO_DOMAIN:
+				return int32(family), nil
+			default:
+				return int32(protocol), nil
+			}
+		}
+	}
+
+	// Everything else is delegated to the socket implementation.
+	return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLen := args[4].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// The option length must lie in [0, maxOptLen].
+	if optLen < 0 || optLen > maxOptLen {
+		return 0, nil, syserror.EINVAL
+	}
+	// The option value is an opaque byte string here, so it is copied in with
+	// CopyInBytes (as CaptureAddress does) rather than decoded through CopyIn.
+	buf := t.CopyScratchBuffer(int(optLen))
+	if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
+		return 0, nil, err
+	}
+
+	// Call syscall implementation.
+	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addrPtr := args[1].Pointer()
+	addrLenPtr := args[2].Pointer()
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files have a socket name.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Ask the socket for its own address, then write it to the caller's buffer.
+	local, localLen, serr := sock.GetSockName(t)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+
+	return 0, nil, writeAddress(t, local, localLen, addrPtr, addrLenPtr)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addrPtr := args[1].Pointer()
+	addrLenPtr := args[2].Pointer()
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files have a peer name.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Ask the socket for its peer's address, then write it to the caller's buffer.
+	peer, peerLen, serr := sock.GetPeerName(t)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+
+	return 0, nil, writeAddress(t, peer, peerLen, addrPtr, addrLenPtr)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can receive messages.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags&^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if file.StatusFlags()&linux.SOCK_NONBLOCK != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	switch dl := sock.RecvTimeout(); {
+	case dl > 0:
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	case dl < 0:
+		flags |= linux.MSG_DONTWAIT
+	}
+	received, err := recvSingleMsg(t, sock, msgPtr, flags, haveDeadline, deadline)
+	return received, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+	toPtr := args[4].Pointer()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags&^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can receive messages.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	if file.StatusFlags()&linux.SOCK_NONBLOCK != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if toPtr != 0 {
+		var timeout linux.Timespec
+		if _, err := timeout.CopyIn(t, toPtr); err != nil {
+			return 0, nil, err
+		}
+		if !timeout.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(timeout.ToDuration())
+		haveDeadline = true
+	} else {
+		// Without an explicit timeout, fall back to the socket's receive timeout.
+		switch dl := sock.RecvTimeout(); {
+		case dl > 0:
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+			haveDeadline = true
+		case dl < 0:
+			flags |= linux.MSG_DONTWAIT
+		}
+	}
+
+	var received uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		hdrAddr, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		if n, err = recvSingleMsg(t, sock, hdrAddr, flags, haveDeadline, deadline); err != nil {
+			break
+		}
+
+		// The received length is stored directly after the embedded message header.
+		lenAddr, ok := hdrAddr.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err = t.CopyOut(lenAddr, uint32(n)); err != nil {
+			break
+		}
+		received++
+	}
+
+	if received == 0 {
+		return 0, nil, err
+	}
+	return uintptr(received), nil, nil
+}
+
+// recvSingleMsg reads the msghdr at msgPtr, receives one message from s into its io vectors, and returns the byte count reported by s.RecvMsg.
+func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// FIXME(b/63594852): Pretend we have an empty error queue.
+	if flags&linux.MSG_ERRQUEUE != 0 {
+		return 0, syserror.EAGAIN
+	}
+
+	// Fast path when no control message nor name buffers are provided.
+	if msg.ControlLen == 0 && msg.NameLen == 0 {
+		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+		if err != nil {
+			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+		}
+		if !cms.Unix.Empty() {
+			mflags |= linux.MSG_CTRUNC
+			cms.Release()
+		}
+
+		if int(msg.Flags) != mflags {
+			// The flags changed, so write the new value back into the caller's header.
+			if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+				return 0, err
+			}
+		}
+
+		return uintptr(n), nil
+	}
+
+	if msg.ControlLen > maxControlLen {
+		return 0, syserror.ENOBUFS
+	}
+	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	defer cms.Release()
+
+	controlData := make([]byte, 0, msg.ControlLen)
+	controlData = control.PackControlMessages(t, cms, controlData)
+
+	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+	}
+
+	if cms.Unix.Rights != nil {
+		controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+	}
+
+	// Copy the sender's address into msg.Name and its length into the header's NameLen field.
+	if msg.NameLen != 0 {
+		if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Write the packed control length into the header's ControlLen field, then the data itself.
+	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+		return 0, err
+	}
+	if len(controlData) > 0 {
+		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// On this path the flags are written back unconditionally.
+	if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+		return 0, err
+	}
+
+	return uintptr(n), nil
+}
+
+// recvFrom is the shared implementation behind the recvfrom and recv syscall
+// handlers; it returns the number of bytes received.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+	if int(bufLen) < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags&^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Look up the file behind fd.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Only socket-backed files can receive data.
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if file.StatusFlags()&linux.SOCK_NONBLOCK != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Describe the caller's buffer as a single-range IO sequence.
+	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
+	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), ioOpts)
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	switch dl := sock.RecvTimeout(); {
+	case dl > 0:
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	case dl < 0:
+		flags |= linux.MSG_DONTWAIT
+	}
+	received, _, sender, senderLen, cms, serr := sock.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+	cms.Release()
+	if serr != nil {
+		return 0, syserror.ConvertIntr(serr.ToError(), kernel.ERESTARTSYS)
+	}
+
+	// The sender's address is written out only when a length pointer was supplied.
+	if nameLenPtr != 0 {
+		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+			return 0, err
+		}
+	}
+
+	return uintptr(received), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLenPtr := args[5].Pointer()
+
+	n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+	return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+	return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+//
+// It sends up to vlen messages from the array of multiple-message headers at
+// msgPtr, writing the number of bytes sent for each message back into its
+// header. The return value is the number of messages sent. An error is only
+// returned when no message at all could be sent; once at least one message
+// has gone out, a later failure just ends the batch early.
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Linux silently caps the number of messages at UIO_MAXIOV
+	// (net/socket.c:__sys_sendmmsg), so we do too.
+	msgs := uint64(vlen)
+	if msgs > linux.UIO_MAXIOV {
+		msgs = linux.UIO_MAXIOV
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < msgs; i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			// Stop here rather than returning, so that messages that
+			// were already sent are still reported to the caller.
+			err = syserror.EFAULT
+			break
+		}
+		var n uintptr
+		if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+			break
+		}
+
+		// Copy the sent length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			err = syserror.EFAULT
+			break
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	// Only surface the error if nothing was sent.
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+// sendSingleMsg sends the one message described by the MessageHeader64 at
+// msgPtr on socket s. It is the shared implementation behind SendMsg and
+// SendMMsg.
+//
+// It returns the byte count reported by s.SendMsg together with the error
+// produced by slinux.HandleIOErrorVFS2 for that send.
+func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+	// Capture the message header.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	// Copy in the raw control (ancillary) data, if any.
+	var controlData []byte
+	if msg.ControlLen > 0 {
+		// Put an upper bound to prevent large allocations.
+		if msg.ControlLen > maxControlLen {
+			return 0, syserror.ENOBUFS
+		}
+		controlData = make([]byte, msg.ControlLen)
+		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	if msg.NameLen != 0 {
+		var err error
+		to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read data then call the sendmsg implementation.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// Parse the control data last, so that no earlier failure path has to
+	// release the parsed messages.
+	controlMessages, err := control.Parse(t, s, controlData)
+	if err != nil {
+		return 0, err
+	}
+
+	// A positive send timeout (in nanoseconds) becomes a deadline on the
+	// monotonic clock; a negative one turns the call non-blocking.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	// The parsed control messages are released here only when the send
+	// failed.
+	// NOTE(review): presumably the socket takes ownership of them on
+	// success -- confirm against the SendMsg implementations.
+	if err != nil {
+		controlMessages.Release()
+	}
+	return uintptr(n), err
+}
+
+// sendTo is the shared implementation behind the sendto and send syscall
+// handlers. It sends the single user buffer [bufPtr, bufPtr+bufLen),
+// optionally to the destination address stored at namePtr.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+	// A length that turns negative when converted to int is invalid.
+	length := int(bufLen)
+	if length < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Resolve the descriptor; it must refer to a socket.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+	sock, isSock := file.Impl().(socket.SocketVFS2)
+	if !isSock {
+		return 0, syserror.ENOTSOCK
+	}
+
+	// A non-blocking file description forces a non-blocking send.
+	if file.StatusFlags()&linux.SOCK_NONBLOCK != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Capture the destination address, if the caller supplied one.
+	var dest []byte
+	if namePtr != 0 {
+		addr, err := CaptureAddress(t, namePtr, nameLen)
+		if err != nil {
+			return 0, err
+		}
+		dest = addr
+	}
+
+	src, err := t.SingleIOSequence(bufPtr, length, usermem.IOOpts{AddressSpaceActive: true})
+	if err != nil {
+		return 0, err
+	}
+
+	// A positive send timeout (in nanoseconds) becomes a deadline on the
+	// monotonic clock; a negative one turns the call non-blocking.
+	var (
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	switch timeout := sock.SendTimeout(); {
+	case timeout > 0:
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(timeout) * time.Nanosecond)
+		haveDeadline = true
+	case timeout < 0:
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Send with a fresh set of control messages carrying no ancillary data.
+	cms := socket.ControlMessages{Unix: control.New(t, sock, nil)}
+	n, sendErr := sock.SendMsg(t, src, dest, int(flags), haveDeadline, deadline, cms)
+	return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, sendErr.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLen := args[5].Uint()
+
+	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+	return n, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..945a364a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,291 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+//
+// It moves up to count bytes from inFD to outFD, at least one of which must
+// be a pipe. Optional user offsets (only valid for the non-pipe end) are
+// copied in before the transfer and copied back out afterwards. The return
+// value is the number of bytes moved; an error is only returned when no data
+// was moved.
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	inOffsetPtr := args[1].Pointer()
+	outFD := args[2].Int()
+	outOffsetPtr := args[3].Pointer()
+	count := int64(args[4].SizeT())
+	flags := args[5].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+	// A size_t of 2^63 or more wraps to a negative int64 and is not caught
+	// by the cap above; reject it instead of passing a negative length on.
+	if count < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// At least one file description must represent a pipe.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe && !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in offsets. An offset of -1 means "no offset supplied"; offsets
+	// that are supplied must be non-negative and may not target a pipe.
+	inOffset := int64(-1)
+	if inOffsetPtr != 0 {
+		if inIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if inFile.Options().DenyPRead {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+		if inOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+	outOffset := int64(-1)
+	if outOffsetPtr != 0 {
+		if outIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if outFile.Options().DenyPWrite {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+		if outOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	// Move data.
+	var (
+		n     int64
+		err   error
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for {
+		// If both input and output are pipes, delegate to the pipe
+		// implementation. Otherwise, exactly one end is a pipe, which we
+		// ensure is consistently ordered after the non-pipe FD's locks by
+		// passing the pipe FD as usermem.IO to the non-pipe end.
+		switch {
+		case inIsPipe && outIsPipe:
+			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+		case inIsPipe:
+			if outOffset != -1 {
+				n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+				outOffset += n
+			} else {
+				n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+			}
+		case outIsPipe:
+			if inOffset != -1 {
+				n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+				inOffset += n
+			} else {
+				n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+			}
+		}
+		// Only keep looping when nothing moved, the failure was a
+		// would-block, and the call is allowed to block.
+		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+			break
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the splice operation.
+		if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+			if inCh == nil {
+				// Register at most once; the deferred unregister runs when
+				// Splice returns.
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, eventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(inCh); err != nil {
+				break
+			}
+		}
+		if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+			if outCh == nil {
+				// Register at most once; the deferred unregister runs when
+				// Splice returns.
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, eventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(outCh); err != nil {
+				break
+			}
+		}
+	}
+
+	// Copy updated offsets out.
+	if inOffsetPtr != 0 {
+		if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+	if outOffsetPtr != 0 {
+		if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// The error from the transfer is only reported if no data moved.
+	if n == 0 {
+		return 0, nil, err
+	}
+
+	// On Linux, inotify behavior is not very consistent with splice(2). We try
+	// our best to emulate Linux for very basic calls to splice, where for some
+	// reason, events are generated for output files, but not input files.
+	outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return uintptr(n), nil, nil
+}
+
+// Tee implements Linux syscall tee(2).
+//
+// It duplicates up to count bytes from the pipe inFD into the pipe outFD via
+// pipe.Tee. Both descriptors must be pipes. The return value is the number of
+// bytes duplicated; an error is only returned when no data was duplicated.
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	outFD := args[1].Int()
+	count := int64(args[2].SizeT())
+	flags := args[3].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+	// A size_t of 2^63 or more wraps to a negative int64 and is not caught
+	// by the cap above; reject it instead of passing a negative length on.
+	if count < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// Both file descriptions must represent pipes.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe || !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy data.
+	var (
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for {
+		n, err := pipe.Tee(t, outPipeFD, inPipeFD, count)
+		if n != 0 {
+			return uintptr(n), nil, nil
+		}
+		// Only keep looping when the failure was a would-block and the
+		// call is allowed to block.
+		if err != syserror.ErrWouldBlock || nonBlock {
+			return 0, nil, err
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the tee operation.
+		if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+			if inCh == nil {
+				// Register at most once; the deferred unregister runs when
+				// Tee returns.
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, eventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err := t.Block(inCh); err != nil {
+				return 0, nil, err
+			}
+		}
+		if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+			if outCh == nil {
+				// Register at most once; the deferred unregister runs when
+				// Tee returns.
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, eventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err := t.Block(outCh); err != nil {
+				return 0, nil, err
+			}
+		}
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
new file mode 100644
index 000000000..bb1d5cac4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -0,0 +1,388 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Stat implements Linux syscall stat(2).
+//
+// It is fstatat relative to AT_FDCWD with no flags.
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := fstatat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Pointer(), 0 /* flags */)
+	return 0, nil, err
+}
+
+// Lstat implements Linux syscall lstat(2).
+//
+// It is fstatat relative to AT_FDCWD with AT_SYMLINK_NOFOLLOW set.
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := fstatat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Pointer(), linux.AT_SYMLINK_NOFOLLOW)
+	return 0, nil, err
+}
+
+// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+	return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags)
+}
+
+// fstatat is the shared implementation of Stat, Lstat and Newfstatat. It
+// stats the file named by the path at pathAddr (resolved relative to dirfd
+// when the path is not absolute), converts the result to a linux.Stat and
+// copies it out to statAddr.
+//
+// Only AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW are accepted in flags.
+func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	// Resolution starts at the root for absolute paths; otherwise at the
+	// working directory or at the file referred to by dirfd.
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		// An empty path is only allowed with AT_EMPTY_PATH.
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return err
+				}
+				var stat linux.Stat
+				convertStatxToUserStat(t, &statx, &stat)
+				_, err = stat.CopyOut(t, statAddr)
+				return err
+			}
+			// Take our own reference on the starting dentry before
+			// dropping the reference on the file description.
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	_, err = stat.CopyOut(t, statAddr)
+	return err
+}
+
+func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
+	return linux.Timespec{
+		Sec:  sxts.Sec,
+		Nsec: int64(sxts.Nsec),
+	}
+}
+
+// Fstat implements Linux syscall fstat(2).
+//
+// It stats the open file description behind args[0] and copies the result,
+// converted to a linux.Stat, out to the address in args[1].
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		fd       = args[0].Int()
+		statAddr = args[1].Pointer()
+	)
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	opts := vfs.StatOptions{Mask: linux.STATX_BASIC_STATS}
+	statx, err := file.Stat(t, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var userStat linux.Stat
+	convertStatxToUserStat(t, &statx, &userStat)
+	_, err = userStat.CopyOut(t, statAddr)
+	return 0, nil, err
+}
+
+// Statx implements Linux syscall statx(2).
+//
+// It stats the file named by the path at args[1] (resolved relative to the
+// dirfd in args[0] when the path is not absolute), rewrites the owner IDs via
+// userifyStatx, and copies the linux.Statx out to the address in args[4].
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+	mask := args[3].Uint()
+	statxAddr := args[4].Pointer()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	// Make sure that only one sync type option is set.
+	syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE)
+	if syncType != 0 && !bits.IsPowerOfTwo32(syncType) {
+		return 0, nil, syserror.EINVAL
+	}
+	// Reserved mask bits must not be set.
+	if mask&linux.STATX__RESERVED != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: mask,
+		Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Resolution starts at the root for absolute paths; otherwise at the
+	// working directory or at the file referred to by dirfd.
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		// An empty path is only allowed with AT_EMPTY_PATH.
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return 0, nil, syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for statx(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return 0, nil, err
+				}
+				userifyStatx(t, &statx)
+				_, err = statx.CopyOut(t, statxAddr)
+				return 0, nil, err
+			}
+			// Take our own reference on the starting dentry before
+			// dropping the reference on the file description.
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	userifyStatx(t, &statx)
+	_, err = statx.CopyOut(t, statxAddr)
+	return 0, nil, err
+}
+
+// userifyStatx rewrites statx.UID and statx.GID in place, mapping each
+// kernel ID into t's user namespace (or to the overflow ID).
+func userifyStatx(t *kernel.Task, statx *linux.Statx) {
+	ns := t.UserNamespace()
+	uid := auth.KUID(statx.UID).In(ns).OrOverflow()
+	gid := auth.KGID(statx.GID).In(ns).OrOverflow()
+	statx.UID = uint32(uid)
+	statx.GID = uint32(gid)
+}
+
+// Readlink implements Linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
+}
+
+// Access implements Linux syscall access(2).
+//
+// It is accessAt relative to AT_FDCWD.
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := accessAt(t, linux.AT_FDCWD, args[0].Pointer(), args[1].ModeT())
+	return 0, nil, err
+}
+
+// Faccessat implements Linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat().  If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+
+	return 0, nil, accessAt(t, dirfd, addr, mode)
+}
+
+// accessAt is the shared implementation of Access and Faccessat. It checks
+// whether the file named by the path at pathAddr (resolved relative to dirfd)
+// may be accessed with the given mode, using credentials derived from the
+// task's real UID/GID.
+func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	const (
+		rOK = 4
+		wOK = 2
+		xOK = 1
+	)
+
+	// Sanity check the mode: only the three permission bits may be set.
+	if extra := mode &^ (rOK | wOK | xOK); extra != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	// access(2) and faccessat(2) check permissions using real
+	// UID/GID, not effective UID/GID.
+	//
+	// "access() needs to use the real uid/gid, not the effective
+	// uid/gid. We do this by temporarily clearing all FS-related
+	// capabilities and switching the fsuid/fsgid around to the
+	// real ones." -fs/open.c:faccessat
+	realCreds := t.Credentials().Fork()
+	realCreds.EffectiveKUID = realCreds.RealKUID
+	realCreds.EffectiveKGID = realCreds.RealKGID
+	// Capabilities are kept only when the real UID is root in the
+	// credentials' user namespace.
+	realCreds.EffectiveCaps = 0
+	if realCreds.EffectiveKUID.In(realCreds.UserNamespace) == auth.RootUID {
+		realCreds.EffectiveCaps = realCreds.PermittedCaps
+	}
+
+	return t.Kernel().VFS().AccessAt(t, realCreds, vfs.AccessTypes(mode), &tpop.pop)
+}
+
+// Readlinkat implements Linux syscall mknodat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
+}
+
+// readlinkat is the shared implementation of Readlink and Readlinkat. It
+// reads the target of the symbolic link named by the path at pathAddr and
+// copies at most size bytes of it to bufAddr, returning the number of bytes
+// copied.
+func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
+	// A size that is zero, or negative once converted to int, is invalid.
+	if int(size) <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
+	// call operates on the symbolic link referred to by dirfd ..." -
+	// readlinkat(2)
+	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	link, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Truncate the target to the caller's buffer size.
+	if limit := int(size); len(link) > limit {
+		link = link[:limit]
+	}
+
+	// A copy-out error is only reported if nothing was copied.
+	copied, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(link))
+	if copied != 0 {
+		return uintptr(copied), nil, nil
+	}
+	return 0, nil, err
+}
+
+// Statfs implements Linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	_, err = statfs.CopyOut(t, bufAddr)
+	return 0, nil, err
+}
+
+// Fstatfs implements Linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	_, err = statfs.CopyOut(t, bufAddr)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
new file mode 100644
index 000000000..2da538fc6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
new file mode 100644
index 000000000..88b9c7627
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// convertStatxToUserStat overwrites *stat with *statx's fields, translating
+// UID/GID via t's user namespace. Both are pointers to avoid copying structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint32(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int32(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
new file mode 100644
index 000000000..0d0ebf46a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -0,0 +1,115 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements Linux syscall sync(2) via the VFS's SyncAllFilesystems.
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t)
+}
+
+// Syncfs implements Linux syscall syncfs(2).
+//
+// args[0] is the file descriptor; EBADF is returned when t has no such file.
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFileVFS2(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SyncFS(t)
+}
+
+// Fsync implements Linux syscall fsync(2).
+//
+// It syncs the file named by the descriptor in args[0], or fails with EBADF
+// when t has no such file.
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFileVFS2(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	return 0, nil, file.Sync(t)
+}
+
+// Fdatasync implements Linux syscall fdatasync(2) by deferring to Fsync.
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
+	return Fsync(t, args)
+}
+
+// SyncFileRange implements Linux syscall sync_file_range(2).
+//
+// args are (fd, offset, nbytes, flags). Only whole-file syncs are performed.
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	nbytes := args[2].Int64()
+	flags := args[3].Uint()
+
+	// Reject a negative offset or nbytes, and overflow of offset+nbytes.
+	if offset < 0 || nbytes < 0 || offset+nbytes < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support
+	// is a full-file sync, i.e. fsync(2). As a result, there are severe
+	// limitations on how much we support sync_file_range:
+	// - In Linux, sync_file_range(2) doesn't write out the file's metadata, even
+	//   if the file size is changed. We do.
+	// - We always sync the entire file instead of [offset, offset+nbytes).
+	// - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For
+	//   correctness, we would have to perform a write-out every time WAIT_BEFORE
+	//   was used, but this would be much more expensive than expected if there
+	//   were no write-out operations in progress.
+	// - Whenever WAIT_AFTER is used, we sync the file.
+	// - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will
+	//   be synced anyway. If this flag is used without WAIT_AFTER, then it is
+	//   safe (and less expensive) to do nothing, because the syscall will not
+	//   wait for the write-out to complete--we only need to make sure that the
+	//   next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes.
+	// - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O
+	//   errors or ENOSPC conditions and will return those to the caller, after
+	//   clearing the EIO and ENOSPC flags in the address_space." We don't do
+	//   this.
+	if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 &&
+		flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+	if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
+		if err := file.Sync(t); err != nil {
+			return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+		}
+	}
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
new file mode 100644
index 000000000..5ac79bc09
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID, flags := args[0].Int(), args[1].Int()
+	if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Select the backing clock; CLOCK_BOOTTIME shares the monotonic clock.
+	k := t.Kernel()
+	var clock ktime.Clock
+	switch clockID {
+	case linux.CLOCK_REALTIME:
+		clock = k.RealtimeClock()
+	case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+		clock = k.MonotonicClock()
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Although writes to a timerfd always fail with EINVAL, the description
+	// must still be "opened for writing" so that its Write implementation
+	// can be reached at all.
+	statusFlags := uint32(linux.O_RDWR)
+	if flags&linux.TFD_NONBLOCK != 0 {
+		statusFlags |= linux.O_NONBLOCK
+	}
+
+	file, err := timerfd.New(k.VFS(), clock, statusFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, flags := args[0].Int(), args[1].Int()
+	newAddr, oldAddr := args[2].Pointer(), args[3].Pointer()
+	if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Descriptors that are not timerfds are rejected with EINVAL.
+	timer, isTimer := file.Impl().(*timerfd.TimerFileDescription)
+	if !isTimer {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var spec linux.Itimerspec
+	if _, err := t.CopyIn(newAddr, &spec); err != nil {
+		return 0, nil, err
+	}
+	abs := flags&linux.TFD_TIMER_ABSTIME != 0
+	setting, err := ktime.SettingFromItimerspec(spec, abs, timer.Clock())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Install the new setting; the previous one is reported only on request.
+	ts, prev := timer.SetTime(setting)
+	if oldAddr == 0 {
+		return 0, nil, nil
+	}
+	prevSpec := ktime.ItimerspecFromSetting(ts, prev)
+	_, err = t.CopyOut(oldAddr, &prevSpec)
+	return 0, nil, err
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+//
+// It copies the timer's current setting out to the address in args[1].
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dst := args[1].Pointer()
+	file := t.GetFileVFS2(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	timer, isTimer := file.Impl().(*timerfd.TimerFileDescription)
+	if !isTimer {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ts, setting := timer.GetTime()
+	spec := ktime.ItimerspecFromSetting(ts, setting)
+	_, err := t.CopyOut(dst, &spec)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
new file mode 100644
index 000000000..8f497ecc7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -0,0 +1,168 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs2 provides syscall implementations that use VFS2.
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+// Override replaces syscall table entries with the implementations from this package.
+func Override() {
+	// Override AMD64.
+	s := linux.AMD64
+	s.Table[0] = syscalls.Supported("read", Read)
+	s.Table[1] = syscalls.Supported("write", Write)
+	s.Table[2] = syscalls.Supported("open", Open)
+	s.Table[3] = syscalls.Supported("close", Close)
+	s.Table[4] = syscalls.Supported("stat", Stat)
+	s.Table[5] = syscalls.Supported("fstat", Fstat)
+	s.Table[6] = syscalls.Supported("lstat", Lstat)
+	s.Table[7] = syscalls.Supported("poll", Poll)
+	s.Table[8] = syscalls.Supported("lseek", Lseek)
+	s.Table[9] = syscalls.Supported("mmap", Mmap)
+	s.Table[16] = syscalls.Supported("ioctl", Ioctl)
+	s.Table[17] = syscalls.Supported("pread64", Pread64)
+	s.Table[18] = syscalls.Supported("pwrite64", Pwrite64)
+	s.Table[19] = syscalls.Supported("readv", Readv)
+	s.Table[20] = syscalls.Supported("writev", Writev)
+	s.Table[21] = syscalls.Supported("access", Access)
+	s.Table[22] = syscalls.Supported("pipe", Pipe)
+	s.Table[23] = syscalls.Supported("select", Select)
+	s.Table[32] = syscalls.Supported("dup", Dup)
+	s.Table[33] = syscalls.Supported("dup2", Dup2)
+	delete(s.Table, 40) // sendfile
+	s.Table[41] = syscalls.Supported("socket", Socket)
+	s.Table[42] = syscalls.Supported("connect", Connect)
+	s.Table[43] = syscalls.Supported("accept", Accept)
+	s.Table[44] = syscalls.Supported("sendto", SendTo)
+	s.Table[45] = syscalls.Supported("recvfrom", RecvFrom)
+	s.Table[46] = syscalls.Supported("sendmsg", SendMsg)
+	s.Table[47] = syscalls.Supported("recvmsg", RecvMsg)
+	s.Table[48] = syscalls.Supported("shutdown", Shutdown)
+	s.Table[49] = syscalls.Supported("bind", Bind)
+	s.Table[50] = syscalls.Supported("listen", Listen)
+	s.Table[51] = syscalls.Supported("getsockname", GetSockName)
+	s.Table[52] = syscalls.Supported("getpeername", GetPeerName)
+	s.Table[53] = syscalls.Supported("socketpair", SocketPair)
+	s.Table[54] = syscalls.Supported("setsockopt", SetSockOpt)
+	s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
+	s.Table[59] = syscalls.Supported("execve", Execve)
+	s.Table[72] = syscalls.Supported("fcntl", Fcntl)
+	s.Table[73] = syscalls.Supported("flock", Flock)
+	s.Table[74] = syscalls.Supported("fsync", Fsync)
+	s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
+	s.Table[76] = syscalls.Supported("truncate", Truncate)
+	s.Table[77] = syscalls.Supported("ftruncate", Ftruncate)
+	s.Table[78] = syscalls.Supported("getdents", Getdents)
+	s.Table[79] = syscalls.Supported("getcwd", Getcwd)
+	s.Table[80] = syscalls.Supported("chdir", Chdir)
+	s.Table[81] = syscalls.Supported("fchdir", Fchdir)
+	s.Table[82] = syscalls.Supported("rename", Rename)
+	s.Table[83] = syscalls.Supported("mkdir", Mkdir)
+	s.Table[84] = syscalls.Supported("rmdir", Rmdir)
+	s.Table[85] = syscalls.Supported("creat", Creat)
+	s.Table[86] = syscalls.Supported("link", Link)
+	s.Table[87] = syscalls.Supported("unlink", Unlink)
+	s.Table[88] = syscalls.Supported("symlink", Symlink)
+	s.Table[89] = syscalls.Supported("readlink", Readlink)
+	s.Table[90] = syscalls.Supported("chmod", Chmod)
+	s.Table[91] = syscalls.Supported("fchmod", Fchmod)
+	s.Table[92] = syscalls.Supported("chown", Chown)
+	s.Table[93] = syscalls.Supported("fchown", Fchown)
+	s.Table[94] = syscalls.Supported("lchown", Lchown)
+	s.Table[132] = syscalls.Supported("utime", Utime)
+	s.Table[133] = syscalls.Supported("mknod", Mknod)
+	s.Table[137] = syscalls.Supported("statfs", Statfs)
+	s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
+	s.Table[161] = syscalls.Supported("chroot", Chroot)
+	s.Table[162] = syscalls.Supported("sync", Sync)
+	s.Table[165] = syscalls.Supported("mount", Mount)
+	s.Table[166] = syscalls.Supported("umount2", Umount2)
+	s.Table[187] = syscalls.Supported("readahead", Readahead)
+	s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
+	s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
+	s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
+	s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
+	s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
+	s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
+	s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
+	s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
+	s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
+	s.Table[213] = syscalls.Supported("epoll_create", EpollCreate)
+	s.Table[217] = syscalls.Supported("getdents64", Getdents64)
+	s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil)
+	s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
+	s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
+	s.Table[235] = syscalls.Supported("utimes", Utimes)
+	s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+	s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[257] = syscalls.Supported("openat", Openat)
+	s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
+	s.Table[259] = syscalls.Supported("mknodat", Mknodat)
+	s.Table[260] = syscalls.Supported("fchownat", Fchownat)
+	s.Table[261] = syscalls.Supported("futimesat", Futimesat)
+	s.Table[262] = syscalls.Supported("newfstatat", Newfstatat)
+	s.Table[263] = syscalls.Supported("unlinkat", Unlinkat)
+	s.Table[264] = syscalls.Supported("renameat", Renameat)
+	s.Table[265] = syscalls.Supported("linkat", Linkat)
+	s.Table[266] = syscalls.Supported("symlinkat", Symlinkat)
+	s.Table[267] = syscalls.Supported("readlinkat", Readlinkat)
+	s.Table[268] = syscalls.Supported("fchmodat", Fchmodat)
+	s.Table[269] = syscalls.Supported("faccessat", Faccessat)
+	s.Table[270] = syscalls.Supported("pselect", Pselect)
+	s.Table[271] = syscalls.Supported("ppoll", Ppoll)
+	s.Table[275] = syscalls.Supported("splice", Splice)
+	s.Table[276] = syscalls.Supported("tee", Tee)
+	s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
+	s.Table[280] = syscalls.Supported("utimensat", Utimensat)
+	s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
+	s.Table[282] = syscalls.Supported("signalfd", Signalfd)
+	s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
+	s.Table[284] = syscalls.Supported("eventfd", Eventfd)
+	s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
+	s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+	s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+	s.Table[288] = syscalls.Supported("accept4", Accept4)
+	s.Table[289] = syscalls.Supported("signalfd4", Signalfd4)
+	s.Table[290] = syscalls.Supported("eventfd2", Eventfd2)
+	s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
+	s.Table[292] = syscalls.Supported("dup3", Dup3)
+	s.Table[293] = syscalls.Supported("pipe2", Pipe2)
+	s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
+	s.Table[295] = syscalls.Supported("preadv", Preadv)
+	s.Table[296] = syscalls.Supported("pwritev", Pwritev)
+	s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
+	s.Table[306] = syscalls.Supported("syncfs", Syncfs)
+	s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg)
+	s.Table[316] = syscalls.Supported("renameat2", Renameat2)
+	s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate)
+	s.Table[322] = syscalls.Supported("execveat", Execveat)
+	s.Table[327] = syscalls.Supported("preadv2", Preadv2)
+	s.Table[328] = syscalls.Supported("pwritev2", Pwritev2)
+	s.Table[332] = syscalls.Supported("statx", Statx)
+	s.Init()
+
+	// Override ARM64.
+	s = linux.ARM64
+	s.Table[63] = syscalls.Supported("read", Read)
+	s.Init()
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
new file mode 100644
index 000000000..af455d5c1
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -0,0 +1,356 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"bytes"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Listxattr implements Linux syscall listxattr(2), following a final symlink.
+func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, followFinalSymlink)
+}
+
+// Llistxattr implements Linux syscall llistxattr(2); a final symlink is not followed.
+func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, nofollowFinalSymlink)
+}
+
+// listxattr is the shared body of Listxattr and Llistxattr; the two differ
+// only in shouldFollowFinalSymlink.
+func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	size := args[2].SizeT()
+
+	path, err := copyInPath(t, args[0].Pointer())
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+	if err != nil {
+		return 0, nil, err
+	}
+	written, err := copyOutXattrNameList(t, args[1].Pointer(), size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(written), nil, nil
+}
+
+// Flistxattr implements Linux syscall flistxattr(2).
+//
+// It is the descriptor-based variant: args[0] is an fd rather than a path.
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	names, err := file.Listxattr(t, uint64(size))
+	if err != nil {
+		return 0, nil, err
+	}
+	written, err := copyOutXattrNameList(t, args[1].Pointer(), size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(written), nil, nil
+}
+
+// Getxattr implements Linux syscall getxattr(2), following a final symlink.
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, followFinalSymlink)
+}
+
+// Lgetxattr implements Linux syscall lgetxattr(2); a final symlink is not followed.
+func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, nofollowFinalSymlink)
+}
+
+// getxattr is the shared body of Getxattr and Lgetxattr, which differ only
+// in shouldFollowFinalSymlink. It returns the number of bytes copied out, or
+// the value's length when the size argument is 0.
+func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, nameAddr := args[0].Pointer(), args[1].Pointer()
+	valueAddr, size := args[2].Pointer(), args[3].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	// The attribute name is read only after the path has been resolved.
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.GetxattrOptions{Name: name, Size: uint64(size)}
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	written, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(written), nil, nil
+}
+
+// Fgetxattr implements Linux syscall fgetxattr(2).
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	valueAddr, size := args[2].Pointer(), args[3].SizeT()
+	file := t.GetFileVFS2(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, args[1].Pointer())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.GetxattrOptions{
+		Name: name,
+		Size: uint64(size),
+	}
+	value, err := file.Getxattr(t, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	written, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(written), nil, nil
+}
+
+// Setxattr implements Linux syscall setxattr(2), following a final symlink.
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, followFinalSymlink)
+}
+
+// Lsetxattr implements Linux syscall lsetxattr(2); a final symlink is not followed.
+func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, nofollowFinalSymlink)
+}
+
+// setxattr is the shared body of Setxattr and Lsetxattr, which differ only
+// in shouldFollowFinalSymlink.
+func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	// Only XATTR_CREATE and XATTR_REPLACE may be set in flags.
+	flags := args[4].Int()
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, args[0].Pointer())
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	// Name and value are read from user memory only after path resolution.
+	name, err := copyInXattrName(t, args[1].Pointer())
+	if err != nil {
+		return err
+	}
+	value, err := copyInXattrValue(t, args[2].Pointer(), args[3].SizeT())
+	if err != nil {
+		return err
+	}
+
+	opts := vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	}
+	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &opts)
+}
+
+// Fsetxattr implements Linux syscall fsetxattr(2).
+//
+// It is the descriptor-based variant: args[0] is an fd rather than a path.
+func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Only XATTR_CREATE and XATTR_REPLACE may be set in flags.
+	flags := args[4].Int()
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	fd := args[0].Int()
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, args[1].Pointer())
+	if err != nil {
+		return 0, nil, err
+	}
+	value, err := copyInXattrValue(t, args[2].Pointer(), args[3].SizeT())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	}
+	return 0, nil, file.Setxattr(t, &opts)
+}
+
+// Removexattr implements Linux syscall removexattr(2), following a final symlink.
+func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, followFinalSymlink)
+}
+
+// Lremovexattr implements Linux syscall lremovexattr(2); a final symlink is not followed.
+func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, nofollowFinalSymlink)
+}
+
+// removexattr is the shared body of Removexattr and Lremovexattr, which
+// differ only in shouldFollowFinalSymlink.
+func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	path, err := copyInPath(t, args[0].Pointer())
+	if err != nil {
+		return err
+	}
+
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, args[1].Pointer())
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+}
+
+// Fremovexattr implements Linux syscall fremovexattr(2).
+//
+// args[0] is the descriptor and args[1] the user address of the attribute
+// name.
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.GetFileVFS2(args[0].Int())
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, args[1].Pointer())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Removexattr(t, name)
+}
+
+// copyInXattrName copies in an xattr name; empty or too-long names yield ERANGE.
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	switch {
+	case err == syserror.ENAMETOOLONG:
+		return "", syserror.ERANGE
+	case err != nil:
+		return "", err
+	case len(name) == 0:
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+// copyOutXattrNameList writes names to listAddr, each followed by a NUL byte,
+// and returns the byte count; with size 0 it only reports the bytes needed.
+func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
+	if size > linux.XATTR_LIST_MAX {
+		size = linux.XATTR_LIST_MAX
+	}
+	var list bytes.Buffer
+	for _, name := range names {
+		list.WriteString(name)
+		list.WriteByte(0)
+	}
+	switch {
+	case size == 0:
+		return list.Len(), nil
+	case list.Len() <= int(size):
+		return t.CopyOutBytes(listAddr, list.Bytes())
+	case size >= linux.XATTR_LIST_MAX:
+		return 0, syserror.E2BIG
+	}
+	return 0, syserror.ERANGE
+}
+
+func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
+	if size > linux.XATTR_SIZE_MAX { // checked first, so the allocation below is bounded
+		return "", syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+		return "", err
+	}
+	return gohacks.StringFromImmutableBytes(buf), nil // buf is never written again after this
+}
+
+// copyOutXattrValue writes value to valueAddr and returns the bytes copied;
+// with size 0 it only reports the length required to hold the value.
+func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		size = linux.XATTR_SIZE_MAX
+	}
+	switch {
+	case size == 0:
+		return len(value), nil
+	case len(value) <= int(size):
+		return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
+	case size >= linux.XATTR_SIZE_MAX:
+		return 0, syserror.E2BIG
+	}
+	return 0, syserror.ERANGE
+}
diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go
new file mode 100644
index 000000000..f88055676
--- /dev/null
+++ b/pkg/sentry/syscalls/syscalls.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package syscalls is the interface from the application to the kernel.
+// Traditionally, syscalls is the interface that is used by applications to
+// request services from the kernel of an operating system. We provide a
+// user-mode kernel that needs to handle those requests coming from unmodified
+// applications. Therefore, we still use the term "syscalls" to denote this
+// interface.
+//
+// Note that the stubs in this package may merely provide the interface, not
+// the actual implementation. It just makes writing syscall stubs
+// straightforward.
+package syscalls
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Supported builds the syscall entry for a fully supported syscall: fn is
+// installed as the handler under name, the level is kernel.SupportFull and the
+// note is fixed.
+func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
+	sc := kernel.Syscall{Name: name, Fn: fn}
+	sc.SupportLevel = kernel.SupportFull
+	sc.Note = "Fully Supported."
+	return sc
+}
+
+// PartiallySupported builds the syscall entry for a syscall that has only a
+// partial implementation: fn is installed as the handler under name and the
+// level is kernel.SupportPartial.
+func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall {
+	sc := kernel.Syscall{Name: name, Fn: fn}
+	sc.SupportLevel = kernel.SupportPartial
+	// note and urls are stored on the entry unmodified.
+	sc.Note, sc.URLs = note, urls
+	return sc
+}
+
+// Error builds a syscall entry whose handler ignores its arguments and always
+// fails with err. The entry is marked kernel.SupportUnimplemented, and its Note
+// is note (followed by "; " when non-empty) plus the quoted text of err.
+func Error(name string, err error, note string, urls []string) kernel.Syscall {
+	if note != "" {
+		note += "; "
+	}
+	sc := kernel.Syscall{Name: name, SupportLevel: kernel.SupportUnimplemented, URLs: urls}
+	// err.Error() is called eagerly here, so err must be non-nil.
+	sc.Note = fmt.Sprintf("%sReturns %q.", note, err.Error())
+	sc.Fn = func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+		return 0, nil, err
+	}
+	return sc
+}
+
+// ErrorWithEvent builds a syscall entry whose handler emits an unimplemented
+// syscall event through the task's kernel and then fails with err. The entry
+// is marked kernel.SupportUnimplemented, and its Note is note (followed by
+// "; " when non-empty) plus the quoted text of err.
+func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall {
+	if note != "" {
+		note += "; "
+	}
+	sc := kernel.Syscall{Name: name, SupportLevel: kernel.SupportUnimplemented, URLs: urls}
+	// err.Error() is called eagerly here, so err must be non-nil.
+	sc.Note = fmt.Sprintf("%sReturns %q.", note, err.Error())
+	sc.Fn = func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, err
+	}
+	return sc
+}
+
+// CapError builds a syscall entry whose handler checks the calling task for
+// capability c. A task holding c triggers an unimplemented syscall event and
+// gets ENOSYS; any other task gets EPERM, so to unprivileged tasks it will seem
+// like there is an implementation. The entry is marked unimplemented.
+func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall {
+	if note != "" {
+		note += "; "
+	}
+	sc := kernel.Syscall{Name: name, SupportLevel: kernel.SupportUnimplemented, URLs: urls}
+	// The note spells out both outcomes, after the optional caller-supplied note.
+	sc.Note = fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS)
+	sc.Fn = func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+		// Only tasks holding c reach the event and ENOSYS; the rest stop at EPERM.
+		if t.HasCapability(c) {
+			t.Kernel().EmitUnimplementedEvent(t)
+			return 0, nil, syserror.ENOSYS
+		}
+		return 0, nil, syserror.EPERM
+	}
+	return sc
+}