Check in gVisor.

PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
author: Googler <noreply@google.com> 2018-04-27 10:37:02 -0700
committer: Adin Scannell <ascannell@google.com> 2018-04-28 01:44:26 -0400
commit: d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree: 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/syscalls/linux
parent: f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
40 files changed, 10545 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
new file mode 100644
index 000000000..bc67ebf30
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -0,0 +1,103 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+    name = "linux_state",
+    srcs = [
+        "sys_aio.go",
+        "sys_futex.go",
+        "sys_poll.go",
+        "sys_time.go",
+    ],
+    out = "linux_state.go",
+    package = "linux",
+)
+
+go_library(
+    name = "linux",
+    srcs = [
+        "error.go",
+        "flags.go",
+        "linux64.go",
+        "linux_state.go",
+        "sigset.go",
+        "sys_aio.go",
+        "sys_capability.go",
+        "sys_epoll.go",
+        "sys_eventfd.go",
+        "sys_file.go",
+        "sys_futex.go",
+        "sys_getdents.go",
+        "sys_identity.go",
+        "sys_inotify.go",
+        "sys_lseek.go",
+        "sys_mmap.go",
+        "sys_mount.go",
+        "sys_pipe.go",
+        "sys_poll.go",
+        "sys_prctl.go",
+        "sys_random.go",
+        "sys_read.go",
+        "sys_rlimit.go",
+        "sys_rusage.go",
+        "sys_sched.go",
+        "sys_sem.go",
+        "sys_signal.go",
+        "sys_socket.go",
+        "sys_stat.go",
+        "sys_sync.go",
+        "sys_sysinfo.go",
+        "sys_syslog.go",
+        "sys_thread.go",
+        "sys_time.go",
+        "sys_timer.go",
+        "sys_timerfd.go",
+        "sys_tls.go",
+        "sys_utsname.go",
+        "sys_write.go",
+        "timespec.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux",
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi",
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/bpf",
+        "//pkg/eventchannel",
+        "//pkg/log",
+        "//pkg/metric",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/anon",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fs/timerfd",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/epoll",
+        "//pkg/sentry/kernel/eventfd",
+        "//pkg/sentry/kernel/kdefs",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/sched",
+        "//pkg/sentry/kernel/semaphore",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/safemem",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/syscalls",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/usermem",
+        "//pkg/state",
+        "//pkg/syserr",
+        "//pkg/syserror",
+        "//pkg/tcpip/transport/unix",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
new file mode 100644
index 000000000..013b385bc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -0,0 +1,117 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"io"
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/metric"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+var (
+	partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.")
+	partialResultOnce   sync.Once
+)
+
+// handleIOError handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// op and f are used only for the traceback logged on unexpected errors.
+func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
+	switch err {
+	case nil:
+		// Typical successful syscall.
+		return nil
+	case io.EOF:
+		// EOF is always consumed. If this is a partial read/write
+		// (result != 0), the application will see that, otherwise
+		// they will see 0.
+		return nil
+	case syserror.ErrExceedsFileSizeLimit:
+		// Ignore partialResult because this error only applies to
+		// normal files, and for those files we cannot accumulate
+		// write results.
+		//
+		// Do not consume the error and return it as EFBIG.
+		// Simultaneously send a SIGXFSZ per setrlimit(2).
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(syscall.SIGXFSZ),
+			Code:  arch.SignalInfoKernel,
+		})
+		return syscall.EFBIG
+	case syserror.ErrInterrupted:
+		// The syscall was interrupted. Return nil if it completed
+		// partially, otherwise return the error code that the syscall
+		// needs (to indicate to the kernel what it should do).
+		if partialResult {
+			return nil
+		}
+		return intr
+	}
+
+	if !partialResult {
+		// Typical syscall error.
+		return err
+	}
+
+	switch err {
+	case syserror.EINTR:
+		// Syscall interrupted, but completed a partial
+		// read/write.  Like ErrWouldBlock, since we have a
+		// partial read/write, we consume the error and return
+		// the partial result.
+		return nil
+	case syserror.EFAULT:
+		// EFAULT is only shown to the user if nothing was
+		// read/written. If we read something (this case), they see
+		// a partial read/write. They will then presumably try again
+		// with an incremented buffer, which will EFAULT with
+		// result == 0.
+		return nil
+	case syserror.EPIPE:
+		// Writes to a pipe or socket will return EPIPE if the other
+		// side is gone. The partial write is returned. EPIPE will be
+		// returned on the next call.
+		//
+		// TODO: In some cases SIGPIPE should also be sent
+		// to the application.
+		return nil
+	case syserror.ErrWouldBlock:
+		// Syscall would block, but completed a partial read/write.
+		// This case should only be returned by IssueIO for nonblocking
+		// files. Since we have a partial read/write, we consume
+		// ErrWouldBlock, returning the partial result.
+		return nil
+	}
+
+	switch err.(type) {
+	case kernel.SyscallRestartErrno:
+		// Identical to the EINTR case.
+		return nil
+	}
+
+	// An unknown error is encountered with a partial read/write.
+	name, _ := f.Dirent.FullName(nil /* ignore chroot */)
+	log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+	partialResultOnce.Do(partialResultMetric.Increment)
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go
new file mode 100644
index 000000000..82bfd7c2a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/flags.go
@@ -0,0 +1,95 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// flagsToPermissions returns a Permissions object from Linux flags.
+// Only the access mode (mask & O_ACCMODE) is examined; O_TRUNC is not.
+func flagsToPermissions(mask uint) (p fs.PermMask) {
+	switch mask & syscall.O_ACCMODE {
+	case syscall.O_WRONLY:
+		p.Write = true
+	case syscall.O_RDWR:
+		p.Write = true
+		p.Read = true
+	case syscall.O_RDONLY:
+		p.Read = true
+	}
+	return
+}
+
+// fdFlagsToLinux converts a kernel.FDFlags object to a Linux representation.
+func fdFlagsToLinux(flags kernel.FDFlags) (mask uint) {
+	if flags.CloseOnExec {
+		mask |= syscall.FD_CLOEXEC
+	}
+	return
+}
+
+// flagsToLinux converts a FileFlags object to a Linux representation.
+func flagsToLinux(flags fs.FileFlags) (mask uint) {
+	if flags.Direct {
+		mask |= syscall.O_DIRECT
+	}
+	if flags.NonBlocking {
+		mask |= syscall.O_NONBLOCK
+	}
+	if flags.Sync {
+		mask |= syscall.O_SYNC
+	}
+	if flags.Append {
+		mask |= syscall.O_APPEND
+	}
+	if flags.Directory {
+		mask |= syscall.O_DIRECTORY
+	}
+	switch {
+	case flags.Read && flags.Write:
+		mask |= syscall.O_RDWR
+	case flags.Write:
+		mask |= syscall.O_WRONLY
+	case flags.Read:
+		mask |= syscall.O_RDONLY
+	}
+	return
+}
+
+// linuxToFlags converts linux file flags to a FileFlags object.
+func linuxToFlags(mask uint) (flags fs.FileFlags) {
+	return fs.FileFlags{
+		Direct:      mask&syscall.O_DIRECT != 0,
+		Sync:        mask&syscall.O_SYNC != 0,
+		NonBlocking: mask&syscall.O_NONBLOCK != 0,
+		Read:        (mask & syscall.O_ACCMODE) != syscall.O_WRONLY,
+		Write:       (mask & syscall.O_ACCMODE) != syscall.O_RDONLY,
+		Append:      mask&syscall.O_APPEND != 0,
+		Directory:   mask&syscall.O_DIRECTORY != 0,
+	}
+}
+
+// linuxToSettableFlags converts linux file flags to a SettableFileFlags object.
+func linuxToSettableFlags(mask uint) fs.SettableFileFlags {
+	return fs.SettableFileFlags{
+		Direct:      mask&syscall.O_DIRECT != 0,
+		NonBlocking: mask&syscall.O_NONBLOCK != 0,
+		Append:      mask&syscall.O_APPEND != 0,
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
new file mode 100644
index 000000000..44db2d582
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -0,0 +1,376 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package linux provides syscall tables for amd64 Linux.
+//
+// NOTE: Linux i386 support has been removed.
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/syscalls"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// AUDIT_ARCH_X86_64 identifies the Linux syscall API on AMD64, and is taken
+// from <linux/audit.h>.
+const _AUDIT_ARCH_X86_64 = 0xc000003e
+
+// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
+// numbers from Linux 3.11. The entries commented out are those syscalls we
+// don't currently support.
+var AMD64 = &kernel.SyscallTable{
+	OS:   abi.Linux,
+	Arch: arch.AMD64,
+	Version: kernel.Version{
+		Sysname: "Linux",
+		Release: "3.11.10",
+		Version: "#1 SMP Fri Nov 29 10:47:50 PST 2013",
+	},
+	AuditNumber: _AUDIT_ARCH_X86_64,
+	Table: map[uintptr]kernel.SyscallFn{
+		0:  Read,
+		1:  Write,
+		2:  Open,
+		3:  Close,
+		4:  Stat,
+		5:  Fstat,
+		6:  Lstat,
+		7:  Poll,
+		8:  Lseek,
+		9:  Mmap,
+		10: Mprotect,
+		11: Munmap,
+		12: Brk,
+		13: RtSigaction,
+		14: RtSigprocmask,
+		15: RtSigreturn,
+		16: Ioctl,
+		17: Pread64,
+		18: Pwrite64,
+		19: Readv,
+		20: Writev,
+		21: Access,
+		22: Pipe,
+		23: Select,
+		24: SchedYield,
+		25: Mremap,
+		26: Msync,
+		27: Mincore,
+		28: Madvise,
+		//     29: Shmget, TODO
+		//     30: Shmat, TODO
+		//     31: Shmctl, TODO
+		32: Dup,
+		33: Dup2,
+		34: Pause,
+		35: Nanosleep,
+		36: Getitimer,
+		37: Alarm,
+		38: Setitimer,
+		39: Getpid,
+		40: Sendfile,
+		41: Socket,
+		42: Connect,
+		43: Accept,
+		44: SendTo,
+		45: RecvFrom,
+		46: SendMsg,
+		47: RecvMsg,
+		48: Shutdown,
+		49: Bind,
+		50: Listen,
+		51: GetSockName,
+		52: GetPeerName,
+		53: SocketPair,
+		54: SetSockOpt,
+		55: GetSockOpt,
+		56: Clone,
+		57: Fork,
+		58: Vfork,
+		59: Execve,
+		60: Exit,
+		61: Wait4,
+		62: Kill,
+		63: Uname,
+		64: Semget,
+		65: Semop,
+		66: Semctl,
+		//     67: Shmdt, TODO
+		//     68: Msgget, TODO
+		//     69: Msgsnd, TODO
+		//     70: Msgrcv, TODO
+		//     71: Msgctl, TODO
+		72:  Fcntl,
+		73:  Flock,
+		74:  Fsync,
+		75:  Fdatasync,
+		76:  Truncate,
+		77:  Ftruncate,
+		78:  Getdents,
+		79:  Getcwd,
+		80:  Chdir,
+		81:  Fchdir,
+		82:  Rename,
+		83:  Mkdir,
+		84:  Rmdir,
+		85:  Creat,
+		86:  Link,
+		87:  Unlink,
+		88:  Symlink,
+		89:  Readlink,
+		90:  Chmod,
+		91:  Fchmod,
+		92:  Chown,
+		93:  Fchown,
+		94:  Lchown,
+		95:  Umask,
+		96:  Gettimeofday,
+		97:  Getrlimit,
+		98:  Getrusage,
+		99:  Sysinfo,
+		100: Times,
+		101: Ptrace,
+		102: Getuid,
+		103: Syslog,
+		104: Getgid,
+		105: Setuid,
+		106: Setgid,
+		107: Geteuid,
+		108: Getegid,
+		109: Setpgid,
+		110: Getppid,
+		111: Getpgrp,
+		112: Setsid,
+		113: Setreuid,
+		114: Setregid,
+		115: Getgroups,
+		116: Setgroups,
+		117: Setresuid,
+		118: Getresuid,
+		119: Setresgid,
+		120: Getresgid,
+		121: Getpgid,
+		//     122: Setfsuid, TODO
+		//     123: Setfsgid, TODO
+		124: Getsid,
+		125: Capget,
+		126: Capset,
+		127: RtSigpending,
+		128: RtSigtimedwait,
+		129: RtSigqueueinfo,
+		130: RtSigsuspend,
+		131: Sigaltstack,
+		132: Utime,
+		133: Mknod,
+		134: syscalls.Error(syscall.ENOSYS),          // Uselib, obsolete
+		135: syscalls.ErrorWithEvent(syscall.EINVAL), // SetPersonality, unable to change personality
+		136: syscalls.ErrorWithEvent(syscall.ENOSYS), // Ustat, needs filesystem support
+		137: Statfs,
+		138: Fstatfs,
+		//     139: Sysfs, TODO
+		140: Getpriority,
+		141: Setpriority,
+		142: syscalls.CapError(linux.CAP_SYS_NICE), // SchedSetparam, requires cap_sys_nice
+		143: SchedGetparam,
+		144: SchedSetscheduler,
+		145: SchedGetscheduler,
+		146: SchedGetPriorityMax,
+		147: SchedGetPriorityMin,
+		148: syscalls.ErrorWithEvent(syscall.EPERM),      // SchedRrGetInterval,
+		149: syscalls.Error(nil),                         // Mlock, TODO
+		150: syscalls.Error(nil),                         // Munlock, TODO
+		151: syscalls.Error(nil),                         // Mlockall, TODO
+		152: syscalls.Error(nil),                         // Munlockall, TODO
+		153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup,
+		154: syscalls.Error(syscall.EPERM),               // ModifyLdt,
+		155: syscalls.Error(syscall.EPERM),               // PivotRoot,
+		156: syscalls.Error(syscall.EPERM),               // Sysctl, syscall is "worthless"
+		157: Prctl,
+		158: ArchPrctl,
+		159: syscalls.CapError(linux.CAP_SYS_TIME), // Adjtimex, requires cap_sys_time
+		160: Setrlimit,
+		161: Chroot,
+		162: Sync,
+		163: syscalls.CapError(linux.CAP_SYS_PACCT), // Acct, requires cap_sys_pacct
+		164: syscalls.CapError(linux.CAP_SYS_TIME),  // Settimeofday, requires cap_sys_time
+		165: Mount,
+		166: Umount2,
+		167: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapon, requires cap_sys_admin
+		168: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapoff, requires cap_sys_admin
+		169: syscalls.CapError(linux.CAP_SYS_BOOT),  // Reboot, requires cap_sys_boot
+		170: Sethostname,
+		171: Setdomainname,
+		172: syscalls.CapError(linux.CAP_SYS_RAWIO),  // Iopl, requires cap_sys_rawio
+		173: syscalls.CapError(linux.CAP_SYS_RAWIO),  // Ioperm, requires cap_sys_rawio
+		174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module
+		175: syscalls.CapError(linux.CAP_SYS_MODULE), // InitModule, requires cap_sys_module
+		176: syscalls.CapError(linux.CAP_SYS_MODULE), // DeleteModule, requires cap_sys_module
+		177: syscalls.Error(syscall.ENOSYS),          // GetKernelSyms, not supported in > 2.6
+		178: syscalls.Error(syscall.ENOSYS),          // QueryModule, not supported in > 2.6
+		179: syscalls.CapError(linux.CAP_SYS_ADMIN),  // Quotactl, requires cap_sys_admin (most operations)
+		180: syscalls.Error(syscall.ENOSYS),          // Nfsservctl, does not exist > 3.1
+		181: syscalls.Error(syscall.ENOSYS),          // Getpmsg, not implemented in Linux
+		182: syscalls.Error(syscall.ENOSYS),          // Putpmsg, not implemented in Linux
+		183: syscalls.Error(syscall.ENOSYS),          // AfsSyscall, not implemented in Linux
+		184: syscalls.Error(syscall.ENOSYS),          // Tuxcall, not implemented in Linux
+		185: syscalls.Error(syscall.ENOSYS),          // Security, not implemented in Linux
+		186: Gettid,
+		187: nil,                                      // Readahead, TODO
+		188: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Setxattr, requires filesystem support
+		189: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lsetxattr, requires filesystem support
+		190: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fsetxattr, requires filesystem support
+		191: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Getxattr, requires filesystem support
+		192: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lgetxattr, requires filesystem support
+		193: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fgetxattr, requires filesystem support
+		194: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Listxattr, requires filesystem support
+		195: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Llistxattr, requires filesystem support
+		196: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Flistxattr, requires filesystem support
+		197: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Removexattr, requires filesystem support
+		198: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lremovexattr, requires filesystem support
+		199: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fremovexattr, requires filesystem support
+		200: Tkill,
+		201: Time,
+		202: Futex,
+		203: SchedSetaffinity,
+		204: SchedGetaffinity,
+		205: syscalls.Error(syscall.ENOSYS), // SetThreadArea, expected to return ENOSYS on 64-bit
+		206: IoSetup,
+		207: IoDestroy,
+		208: IoGetevents,
+		209: IoSubmit,
+		210: IoCancel,
+		211: syscalls.Error(syscall.ENOSYS),         // GetThreadArea, expected to return ENOSYS on 64-bit
+		212: syscalls.CapError(linux.CAP_SYS_ADMIN), // LookupDcookie, requires cap_sys_admin
+		213: EpollCreate,
+		214: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollCtlOld, deprecated (afaik, unused)
+		215: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollWaitOld, deprecated (afaik, unused)
+		216: syscalls.ErrorWithEvent(syscall.ENOSYS), // RemapFilePages, deprecated since 3.16
+		217: Getdents64,
+		218: SetTidAddress,
+		219: RestartSyscall,
+		//     220: Semtimedop, TODO
+		221: Fadvise64,
+		//     222: TimerCreate, TODO
+		//     223: TimerSettime, TODO
+		//     224: TimerGettime, TODO
+		//     225: TimerGetoverrun, TODO
+		//     226: TimerDelete, TODO
+		227: ClockSettime,
+		228: ClockGettime,
+		229: ClockGetres,
+		230: ClockNanosleep,
+		231: ExitGroup,
+		232: EpollWait,
+		233: EpollCtl,
+		234: Tgkill,
+		235: Utimes,
+		236: syscalls.Error(syscall.ENOSYS),        // Vserver, not implemented by Linux
+		237: syscalls.CapError(linux.CAP_SYS_NICE), // Mbind, may require cap_sys_nice TODO
+		238: SetMempolicy,
+		239: GetMempolicy,
+		//     240: MqOpen, TODO
+		//     241: MqUnlink, TODO
+		//     242: MqTimedsend, TODO
+		//     243: MqTimedreceive, TODO
+		//     244: MqNotify, TODO
+		//     245: MqGetsetattr, TODO
+		246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot
+		247: Waitid,
+		248: syscalls.Error(syscall.EACCES),         // AddKey, not available to user
+		249: syscalls.Error(syscall.EACCES),         // RequestKey, not available to user
+		250: syscalls.Error(syscall.EACCES),         // Keyctl, not available to user
+		251: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioSet, requires cap_sys_nice or cap_sys_admin (depending)
+		252: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioGet, requires cap_sys_nice or cap_sys_admin (depending)
+		253: InotifyInit,
+		254: InotifyAddWatch,
+		255: InotifyRmWatch,
+		256: syscalls.CapError(linux.CAP_SYS_NICE), // MigratePages, requires cap_sys_nice
+		257: Openat,
+		258: Mkdirat,
+		259: Mknodat,
+		260: Fchownat,
+		261: Futimesat,
+		262: Fstatat,
+		263: Unlinkat,
+		264: Renameat,
+		265: Linkat,
+		266: Symlinkat,
+		267: Readlinkat,
+		268: Fchmodat,
+		269: Faccessat,
+		270: Pselect,
+		271: Ppoll,
+		272: Unshare,
+		273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete
+		274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete
+		//     275: Splice, TODO
+		//     276: Tee, TODO
+		//     277: SyncFileRange, TODO
+		//     278: Vmsplice, TODO
+		279: syscalls.CapError(linux.CAP_SYS_NICE), // MovePages, requires cap_sys_nice (mostly)
+		280: Utimensat,
+		281: EpollPwait,
+		//     282: Signalfd, TODO
+		283: TimerfdCreate,
+		284: Eventfd,
+		285: Fallocate,
+		286: TimerfdSettime,
+		287: TimerfdGettime,
+		288: Accept4,
+		//     289: Signalfd4, TODO
+		290: Eventfd2,
+		291: EpollCreate1,
+		292: Dup3,
+		293: Pipe2,
+		294: InotifyInit1,
+		295: Preadv,
+		296: Pwritev,
+		297: RtTgsigqueueinfo,
+		298: syscalls.ErrorWithEvent(syscall.ENODEV), // PerfEventOpen, no support for perf counters
+		299: RecvMMsg,
+		300: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyInit, needs CONFIG_FANOTIFY
+		301: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyMark, needs CONFIG_FANOTIFY
+		302: Prlimit64,
+		303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // NameToHandleAt, needs filesystem support
+		304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // OpenByHandleAt, needs filesystem support
+		305: syscalls.CapError(linux.CAP_SYS_TIME),       // ClockAdjtime, requires cap_sys_time
+		306: Syncfs,
+		307: SendMMsg,
+		//     308: Setns, TODO
+		309: Getcpu,
+		//     310: ProcessVmReadv, TODO may require cap_sys_ptrace
+		//     311: ProcessVmWritev, TODO may require cap_sys_ptrace
+		312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace
+		313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module
+		// "Backports."
+		318: GetRandom,
+	},
+
+	Emulate: map[usermem.Addr]uintptr{
+		0xffffffffff600000: 96,  // vsyscall gettimeofday(2)
+		0xffffffffff600400: 201, // vsyscall time(2)
+		0xffffffffff600800: 309, // vsyscall getcpu(2)
+	},
+	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+		syscalls.UnimplementedEvent(t)
+		return 0, syserror.ENOSYS
+	},
+}
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
new file mode 100644
index 000000000..bfb541634
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -0,0 +1,69 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
+// STOP are clear.
+func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) {
+	if size != linux.SignalSetSize {
+		return 0, syscall.EINVAL
+	}
+	b := t.CopyScratchBuffer(8)
+	if _, err := t.CopyInBytes(sigSetAddr, b); err != nil {
+		return 0, err
+	}
+	mask := usermem.ByteOrder.Uint64(b[:])
+	return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil
+}
+
+// copyOutSigSet copies out a sigset_t.
+func copyOutSigSet(t *kernel.Task, sigSetAddr usermem.Addr, mask linux.SignalSet) error {
+	b := t.CopyScratchBuffer(8)
+	usermem.ByteOrder.PutUint64(b, uint64(mask))
+	_, err := t.CopyOutBytes(sigSetAddr, b)
+	return err
+}
+
+// copyInSigSetWithSize copies in a structure as below
+//
+//   struct {
+//       sigset_t* sigset_addr;
+//       size_t sizeof_sigset;
+//   };
+//
+// and returns sigset_addr and size.
+func copyInSigSetWithSize(t *kernel.Task, addr usermem.Addr) (usermem.Addr, uint, error) {
+	switch t.Arch().Width() {
+	case 8:
+		in := t.CopyScratchBuffer(16)
+		if _, err := t.CopyInBytes(addr, in); err != nil {
+			return 0, 0, err
+		}
+		maskAddr := usermem.Addr(usermem.ByteOrder.Uint64(in[0:]))
+		maskSize := uint(usermem.ByteOrder.Uint64(in[8:]))
+		return maskAddr, maskSize, nil
+	default:
+		return 0, 0, syserror.ENOSYS
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
new file mode 100644
index 000000000..80407a082
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -0,0 +1,402 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"encoding/binary"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// I/O commands.
+const (
+	_IOCB_CMD_PREAD   = 0
+	_IOCB_CMD_PWRITE  = 1
+	_IOCB_CMD_FSYNC   = 2
+	_IOCB_CMD_FDSYNC  = 3
+	_IOCB_CMD_NOOP    = 6
+	_IOCB_CMD_PREADV  = 7
+	_IOCB_CMD_PWRITEV = 8
+)
+
+// I/O flags.
+const (
+	_IOCB_FLAG_RESFD = 1
+)
+
+// ioCallback describes an I/O request.
+//
+// The priority field is currently ignored in the implementation below. Also
+// note that the IOCB_FLAG_RESFD feature is not supported.
+type ioCallback struct {
+	Data      uint64
+	Key       uint32
+	Reserved1 uint32
+
+	OpCode  uint16
+	ReqPrio int16
+	FD      uint32
+
+	Buf    uint64
+	Bytes  uint64
+	Offset int64
+
+	Reserved2 uint64
+	Flags     uint32
+
+	// eventfd to signal if IOCB_FLAG_RESFD is set in flags.
+	ResFD uint32
+}
+
+// ioEvent describes an I/O result.
+type ioEvent struct {
+	Data    uint64
+	Obj     uint64
+	Result  int64
+	Result2 int64
+}
+
+// ioEventSize is the size of an ioEvent encoded.
+var ioEventSize = binary.Size(ioEvent{})
+
+// IoSetup implements linux syscall io_setup(2).
+func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nrEvents := args[0].Int()
+	idAddr := args[1].Pointer()
+
+	// Linux uses the native long as the aio ID.
+	//
+	// The context pointer _must_ be zero initially.
+	var idIn uint64
+	if _, err := t.CopyIn(idAddr, &idIn); err != nil {
+		return 0, nil, err
+	}
+	if idIn != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Copy out the new ID.
+	if _, err := t.CopyOut(idAddr, &id); err != nil {
+		t.MemoryManager().DestroyAIOContext(t, id)
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// IoDestroy implements linux syscall io_destroy(2).
+func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+
+	// Destroy the given context.
+	if !t.MemoryManager().DestroyAIOContext(t, id) {
+		// Does not exist.
+		return 0, nil, syserror.EINVAL
+	}
+	// FIXME: Linux blocks until all AIO to the destroyed context is
+	// done.
+	return 0, nil, nil
+}
+
+// IoGetevents implements linux syscall io_getevents(2).
+func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	minEvents := args[1].Int()
+	events := args[2].Int()
+	eventsAddr := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+
+	// Sanity check arguments: reject a negative minEvents, which with minEvents <= events also rules out a negative events.
+	if minEvents < 0 || minEvents > events {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Setup the timeout.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if timespecAddr != 0 {
+		d, err := copyTimespecIn(t, timespecAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !d.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration())
+		haveDeadline = true
+	}
+
+	// Loop over all requests.
+	for count := int32(0); count < events; count++ {
+		// Get a request: block until minEvents are collected, then only poll.
+		var v interface{}
+		if count >= minEvents {
+			var ok bool
+			v, ok = ctx.PopRequest()
+			if !ok {
+				return uintptr(count), nil, nil
+			}
+		} else {
+			var err error
+			v, err = waitForRequest(ctx, t, haveDeadline, deadline)
+			if err != nil {
+				if count > 0 || err == syserror.ETIMEDOUT {
+					return uintptr(count), nil, nil
+				}
+				return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+			}
+		}
+
+		ev := v.(*ioEvent)
+
+		// Copy out the result.
+		if _, err := t.CopyOut(eventsAddr, ev); err != nil {
+			if count > 0 {
+				return uintptr(count), nil, nil
+			}
+			// Nothing done.
+			return 0, nil, err
+		}
+
+		// Keep rolling.
+		eventsAddr += usermem.Addr(ioEventSize)
+	}
+
+	// Everything finished.
+	return uintptr(events), nil, nil
+}
+
+func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
+	for {
+		if v, ok := ctx.PopRequest(); ok {
+			// Request was readily available. Just return it.
+			return v, nil
+		}
+
+		// Need to wait for request completion.
+		done, active := ctx.WaitChannel()
+		if !active {
+			// Context has been destroyed.
+			return nil, syserror.EINVAL
+		}
+		if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil {
+			return nil, err
+		}
+	}
+}
+
+// memoryFor returns appropriate memory for the given callback.
+func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
+	bytes := int(cb.Bytes)
+	if bytes < 0 {
+		// Linux also requires that this field fit in ssize_t.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+
+	// Since this I/O will be asynchronous with respect to t's task goroutine,
+	// we have no guarantee that t's AddressSpace will be active during the
+	// I/O.
+	switch cb.OpCode {
+	case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE:
+		return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+			AddressSpaceActive: false,
+		})
+
+	case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV:
+		return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+			AddressSpaceActive: false,
+		})
+
+	case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP:
+		return usermem.IOSequence{}, nil
+
+	default:
+		// Not a supported command.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+}
+
+// performCallback carries out the operation described by cb on file, queues
+// the resulting ioEvent on ctx and then signals eventFile if one was given.
+// It drops one reference on file and, when non-nil, one on eventFile.
+func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
+	// Use a context.Context that will not be interrupted if t is
+	// interrupted.
+	asyncCtx := t.AsyncContext()
+
+	completion := &ioEvent{Data: cb.Data, Obj: uint64(cbAddr)}
+
+	var ioErr error
+	switch cb.OpCode {
+	case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV:
+		completion.Result, ioErr = file.Preadv(asyncCtx, ioseq, cb.Offset)
+	case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
+		completion.Result, ioErr = file.Pwritev(asyncCtx, ioseq, cb.Offset)
+	case _IOCB_CMD_FSYNC:
+		ioErr = file.Fsync(asyncCtx, 0, fs.FileMaxOffset, fs.SyncAll)
+	case _IOCB_CMD_FDSYNC:
+		ioErr = file.Fsync(asyncCtx, 0, fs.FileMaxOffset, fs.SyncData)
+	}
+
+	// On failure the event carries the negated errno instead of a count.
+	if ioErr != nil {
+		ioErr = handleIOError(t, completion.Result != 0 /* partial */, ioErr, nil /* never interrupted */, "aio", file)
+		completion.Result = -int64(t.ExtractErrno(ioErr, 0))
+	}
+
+	file.DecRef()
+
+	// Queue the result for delivery.
+	ctx.FinishRequest(completion)
+
+	if eventFile == nil {
+		return
+	}
+
+	// Notify the event file. This must come *after* queueing the result so
+	// that the thread we may wake up does not race with the queueing.
+	eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
+	eventFile.DecRef()
+}
+
+// submitCallback processes a single callback.
+//
+// It resolves the target file (and, when _IOCB_FLAG_RESFD is set, the eventfd
+// to notify on completion), builds the I/O sequence for cb, reserves a slot
+// on the AIO context identified by id and finally runs the request
+// asynchronously via performCallback. cbAddr is the user address the
+// callback was read from; it is reported back in the completion event.
+//
+// It returns EBADF for an unknown FD, EINVAL if ResFD is not an eventfd, if
+// the context does not exist or if memoryFor rejects the callback, and EAGAIN
+// if the context cannot accept another request.
+func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error {
+	file := t.FDMap().GetFile(kdefs.FD(cb.FD))
+	if file == nil {
+		// File not found.
+		return syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Was there an eventFD? Extract it.
+	var eventFile *fs.File
+	if cb.Flags&_IOCB_FLAG_RESFD != 0 {
+		// Assign to the outer variable (plain "=", not ":="). Declaring a
+		// new variable here would leave the outer eventFile nil, so the
+		// completion would never signal the eventfd.
+		eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD))
+		if eventFile == nil {
+			// Bad FD.
+			return syserror.EBADF
+		}
+		defer eventFile.DecRef()
+
+		// Check that it is an eventfd.
+		if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok {
+			// Not an event FD.
+			return syserror.EINVAL
+		}
+	}
+
+	ioseq, err := memoryFor(t, cb)
+	if err != nil {
+		return err
+	}
+
+	// Prepare the request.
+	ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
+	if !ok {
+		return syserror.EINVAL
+	}
+	if ready := ctx.Prepare(); !ready {
+		// Context is busy.
+		return syserror.EAGAIN
+	}
+
+	if eventFile != nil {
+		// The request is set. Make sure there's a ref on the file.
+		//
+		// This is necessary when the callback executes on completion,
+		// which is also what will release this reference.
+		eventFile.IncRef()
+	}
+
+	// Perform the request asynchronously. The extra reference taken here is
+	// released by performCallback; the deferred DecRef above only balances
+	// the reference obtained from GetFile.
+	file.IncRef()
+	fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) })
+
+	// All set.
+	return nil
+}
+
+// IoSubmit implements linux syscall io_submit(2).
+//
+// args holds the AIO context id, the number of callbacks to submit and the
+// user address of an array of pointers to those callbacks. The return value
+// is the number of callbacks submitted; an error is only reported when the
+// very first callback fails.
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	nrEvents := args[1].Int()
+	addr := args[2].Pointer()
+
+	// A negative count is invalid. Without this check the loop below would
+	// be skipped and the negative value returned as a huge unsigned count.
+	if nrEvents < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	for i := int32(0); i < nrEvents; i++ {
+		// Copy in the address.
+		cbAddrNative := t.Arch().Native(0)
+		if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
+			if i > 0 {
+				// Some successful.
+				return uintptr(i), nil, nil
+			}
+			// Nothing done.
+			return 0, nil, err
+		}
+
+		// Copy in this callback.
+		var cb ioCallback
+		cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
+		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+			if i > 0 {
+				// Some have been successful.
+				return uintptr(i), nil, nil
+			}
+			// Nothing done.
+			return 0, nil, err
+		}
+
+		// Process this callback.
+		if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+			if i > 0 {
+				// Partial success.
+				return uintptr(i), nil, nil
+			}
+			// Nothing done.
+			return 0, nil, err
+		}
+
+		// Advance to the next pointer in the user array.
+		addr += usermem.Addr(t.Arch().Width())
+	}
+
+	return uintptr(nrEvents), nil, nil
+}
+
+// IoCancel implements linux syscall io_cancel(2).
+//
+// It is not presently supported (ENOSYS indicates no support on this
+// architecture).
+func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ENOSYS
+}
diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go
new file mode 100644
index 000000000..89c81ac90
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_capability.go
@@ -0,0 +1,149 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// lookupCaps returns the permitted, inheritable and effective capability sets
+// of the task with ID tid in t's PID namespace. A tid of zero selects t
+// itself, a negative tid yields EINVAL and an unknown tid yields ESRCH.
+func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) {
+	target := t
+	switch {
+	case tid < 0:
+		return 0, 0, 0, syserror.EINVAL
+	case tid > 0:
+		target = t.PIDNamespace().TaskWithID(tid)
+	}
+	if target == nil {
+		return 0, 0, 0, syserror.ESRCH
+	}
+
+	creds := target.Credentials()
+	return creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps, nil
+}
+
+// Capget implements Linux syscall capget.
+//
+// For a recognized header version the capability sets of the requested task
+// are written to the data buffer (one CapUserData for version 1, two for
+// versions 2 and 3). For an unrecognized version the highest supported
+// version is written back into the header instead.
+func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	hdrAddr := args[0].Pointer()
+	dataAddr := args[1].Pointer()
+
+	var hdr linux.CapUserHeader
+	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+		return 0, nil, err
+	}
+
+	// hdr.Pid doesn't need to be valid if this capget() is a "version probe"
+	// (hdr.Version is unrecognized and dataAddr is null), so the version is
+	// classified before any task lookup happens.
+	wide := false // whether 64-bit sets (two CapUserData) are reported
+	switch hdr.Version {
+	case linux.LINUX_CAPABILITY_VERSION_1:
+		// 32-bit sets only.
+	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
+		wide = true
+	default:
+		hdr.Version = linux.HighestCapabilityVersion
+		if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+			return 0, nil, err
+		}
+		if dataAddr != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		return 0, nil, nil
+	}
+
+	// Known version without a data buffer: nothing to report.
+	if dataAddr == 0 {
+		return 0, nil, nil
+	}
+
+	p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	low := linux.CapUserData{
+		Effective:   uint32(e),
+		Permitted:   uint32(p),
+		Inheritable: uint32(i),
+	}
+	if !wide {
+		_, err = t.CopyOut(dataAddr, &low)
+		return 0, nil, err
+	}
+
+	high := linux.CapUserData{
+		Effective:   uint32(e >> 32),
+		Permitted:   uint32(p >> 32),
+		Inheritable: uint32(i >> 32),
+	}
+	data := [2]linux.CapUserData{low, high}
+	_, err = t.CopyOut(dataAddr, &data)
+	return 0, nil, err
+}
+
+// Capset implements Linux syscall capset.
+//
+// Only the calling task may be targeted (hdr.Pid of zero or t's own thread
+// ID); anything else fails with EPERM. An unrecognized header version writes
+// the highest supported version back to the header and fails with EINVAL.
+func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	hdrAddr := args[0].Pointer()
+	dataAddr := args[1].Pointer()
+
+	var hdr linux.CapUserHeader
+	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+		return 0, nil, err
+	}
+
+	legacy := false // whether the data is a single 32-bit CapUserData
+	switch hdr.Version {
+	case linux.LINUX_CAPABILITY_VERSION_1:
+		legacy = true
+	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
+		// Two CapUserData entries: low and high 32 bits.
+	default:
+		hdr.Version = linux.HighestCapabilityVersion
+		if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, syserror.EINVAL
+	}
+
+	if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() {
+		return 0, nil, syserror.EPERM
+	}
+
+	var permitted, inheritable, effective auth.CapabilitySet
+	if legacy {
+		var data linux.CapUserData
+		if _, err := t.CopyIn(dataAddr, &data); err != nil {
+			return 0, nil, err
+		}
+		permitted = auth.CapabilitySet(data.Permitted)
+		inheritable = auth.CapabilitySet(data.Inheritable)
+		effective = auth.CapabilitySet(data.Effective)
+	} else {
+		var data [2]linux.CapUserData
+		if _, err := t.CopyIn(dataAddr, &data); err != nil {
+			return 0, nil, err
+		}
+		permitted = auth.CapabilitySet(data[0].Permitted) | auth.CapabilitySet(data[1].Permitted)<<32
+		inheritable = auth.CapabilitySet(data[0].Inheritable) | auth.CapabilitySet(data[1].Inheritable)<<32
+		effective = auth.CapabilitySet(data[0].Effective) | auth.CapabilitySet(data[1].Effective)<<32
+	}
+
+	// Bits outside the set of known capabilities are dropped.
+	permitted &= auth.AllCapabilities
+	inheritable &= auth.AllCapabilities
+	effective &= auth.AllCapabilities
+	return 0, nil, t.SetCapabilitySets(permitted, inheritable, effective)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
new file mode 100644
index 000000000..e69dfc77a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -0,0 +1,171 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/syscalls"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// EpollCreate1 implements the epoll_create1(2) linux syscall.
+//
+// EPOLL_CLOEXEC is the only accepted flag; any other bit yields EINVAL.
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if unknown := flags &^ syscall.EPOLL_CLOEXEC; unknown != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	fd, err := syscalls.CreateEpoll(t, flags&syscall.EPOLL_CLOEXEC != 0)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements the epoll_create(2) linux syscall.
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+
+	if size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	fd, err := syscalls.CreateEpoll(t, false)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements the epoll_ctl(2) linux syscall.
+//
+// For every operation other than EPOLL_CTL_DEL the event structure is read
+// from user memory first; unknown operations are rejected with EINVAL
+// afterwards.
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := kdefs.FD(args[0].Int())
+	op := args[1].Int()
+	fd := kdefs.FD(args[2].Int())
+	eventAddr := args[3].Pointer()
+
+	// Event state, only populated when the operation carries an event.
+	var (
+		flags epoll.EntryFlags
+		mask  waiter.EventMask
+		data  [2]int32
+	)
+	if op != syscall.EPOLL_CTL_DEL {
+		var e syscall.EpollEvent
+		if _, err := t.CopyIn(eventAddr, &e); err != nil {
+			return 0, nil, err
+		}
+
+		if e.Events&syscall.EPOLLONESHOT != 0 {
+			flags |= epoll.OneShot
+		}
+
+		// syscall.EPOLLET is incorrectly generated as a negative number
+		// in Go, see https://github.com/golang/go/issues/5328 for
+		// details; negating it recovers the intended bit.
+		if e.Events&-syscall.EPOLLET != 0 {
+			flags |= epoll.EdgeTriggered
+		}
+
+		mask = waiter.EventMask(e.Events)
+		data = [2]int32{e.Fd, e.Pad}
+	}
+
+	// Perform the requested operation.
+	switch op {
+	case syscall.EPOLL_CTL_DEL:
+		return 0, nil, syscalls.RemoveEpoll(t, epfd, fd)
+	case syscall.EPOLL_CTL_ADD, syscall.EPOLL_CTL_MOD:
+		// Hang-up and error events are always monitored. See
+		// fs/eventpoll.c.
+		mask |= waiter.EventHUp | waiter.EventErr
+		if op == syscall.EPOLL_CTL_ADD {
+			return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data)
+		}
+		return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data)
+	}
+	return 0, nil, syserror.EINVAL
+}
+
+// copyOutEvents copies epoll events from the kernel to user memory.
+//
+// Each event is written as 12 bytes starting at addr: the event mask followed
+// by the two 32-bit data words. EFAULT is returned up front if the whole
+// array would not fit in the address range.
+func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
+	const itemLen = 12
+
+	total := uint64(len(e)) * itemLen
+	if _, fits := addr.AddLength(total); !fits {
+		return syserror.EFAULT
+	}
+
+	buf := t.CopyScratchBuffer(itemLen)
+	for _, ev := range e {
+		usermem.ByteOrder.PutUint32(buf[0:], ev.Events)
+		usermem.ByteOrder.PutUint32(buf[4:], uint32(ev.Data[0]))
+		usermem.ByteOrder.PutUint32(buf[8:], uint32(ev.Data[1]))
+
+		_, err := t.CopyOutBytes(addr, buf)
+		if err != nil {
+			return err
+		}
+		addr += itemLen
+	}
+	return nil
+}
+
+// EpollWait implements the epoll_wait(2) linux syscall.
+//
+// It returns the number of ready events after copying them to the user
+// buffer given as the second argument.
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := kdefs.FD(args[0].Int())
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	ready, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout)
+	if err != nil {
+		return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// Nothing ready: skip the copy-out entirely.
+	if len(ready) == 0 {
+		return 0, nil, nil
+	}
+
+	if err := copyOutEvents(t, eventsAddr, ready); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(len(ready)), nil, nil
+}
+
+// EpollPwait implements the epoll_pwait(2) linux syscall.
+//
+// When a signal mask is supplied it is installed on the task, with the
+// previous mask recorded as the saved signal mask, before delegating to
+// EpollWait.
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	maskAddr := args[4].Pointer()
+	if maskAddr == 0 {
+		// No mask given: behaves exactly like epoll_wait(2).
+		return EpollWait(t, args)
+	}
+
+	maskSize := uint(args[5].Uint())
+	newMask, err := copyInSigSet(t, maskAddr, maskSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	previous := t.SignalMask()
+	t.SetSignalMask(newMask)
+	t.SetSavedSignalMask(previous)
+
+	return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
new file mode 100644
index 000000000..60fe5a133
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -0,0 +1,65 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd"
+)
+
+const (
+	// EFD_SEMAPHORE is a flag used in syscall eventfd(2) and eventfd2(2). Please
+	// see its man page for more information.
+	EFD_SEMAPHORE = 1
+	// EFD_NONBLOCK is the eventfd2(2) flag that Eventfd2 translates into the
+	// NonBlocking file flag of the new event file.
+	EFD_NONBLOCK  = 0x800
+	// EFD_CLOEXEC is the eventfd2(2) flag that Eventfd2 translates into the
+	// CloseOnExec flag of the new file descriptor.
+	EFD_CLOEXEC   = 0x80000
+)
+
+// Eventfd2 implements linux syscall eventfd2(2).
+//
+// The first argument is the initial counter value and the second the flags
+// (any combination of EFD_SEMAPHORE, EFD_NONBLOCK and EFD_CLOEXEC; other
+// bits yield EINVAL). On success the new file descriptor is returned.
+func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// initval is an unsigned int in the syscall ABI. Reading it as a signed
+	// value would sign-extend anything >= 2^31 when widened to uint64 and
+	// produce a bogus 64-bit counter.
+	initVal := args[0].Uint()
+	flags := uint(args[1].Uint())
+	allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC)
+
+	if flags & ^allOps != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0)
+	event.SetFlags(fs.SettableFileFlags{
+		NonBlocking: flags&EFD_NONBLOCK != 0,
+	})
+	// Drop the local reference on return, whether or not installing the
+	// file descriptor succeeded.
+	defer event.DecRef()
+
+	fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{
+		CloseOnExec: flags&EFD_CLOEXEC != 0,
+	},
+		t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// Eventfd implements linux syscall eventfd(2).
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[1].Value = 0
+	return Eventfd2(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
new file mode 100644
index 000000000..a2dbba7e0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -0,0 +1,1942 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"io"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// fileOpAt performs an operation on the second last component in the path.
+//
+// fn receives the root directory, the dirent of the directory containing the
+// final path component, and the name of that final component.
+func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string) error) error {
+	// Extract the last component.
+	parent, name := fs.SplitLast(path)
+
+	switch {
+	case parent == "/":
+		// Common case: we are accessing a file in the root.
+		rootDir := t.FSContext().RootDirectory()
+		opErr := fn(rootDir, rootDir, name)
+		rootDir.DecRef()
+		return opErr
+
+	case parent == "." && dirFD == linux.AT_FDCWD:
+		// Common case: we are accessing a file relative to the current
+		// working directory; skip the look-up.
+		cwd := t.FSContext().WorkingDirectory()
+		rootDir := t.FSContext().RootDirectory()
+		opErr := fn(rootDir, cwd, name)
+		cwd.DecRef()
+		rootDir.DecRef()
+		return opErr
+	}
+
+	// General case: resolve the parent directory first.
+	onParent := func(root *fs.Dirent, d *fs.Dirent) error {
+		return fn(root, d, name)
+	}
+	return fileOpOn(t, dirFD, parent, true /* resolve */, onParent)
+}
+
+// fileOpOn performs an operation on the last entry of the path.
+//
+// The path is looked up relative to the root for absolute paths, to the
+// working directory when dirFD is AT_FDCWD, and otherwise to the directory
+// referred to by dirFD (EBADF if dirFD is not open, ENOTDIR if it is not a
+// directory). If resolve is true the lookup uses FindInode, otherwise
+// FindLink. fn is called with the root and the dirent that was found; its
+// error is returned.
+func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	var (
+		d   *fs.Dirent // The file.
+		wd  *fs.Dirent // The working directory (if required.)
+		rel *fs.Dirent // The relative directory for search (if required.)
+		f   *fs.File   // The file corresponding to dirFD (if required.)
+		err error
+	)
+
+	// Extract the working directory (maybe).
+	if len(path) > 0 && path[0] == '/' {
+		// Absolute path; rel can be nil.
+	} else if dirFD == linux.AT_FDCWD {
+		// Need to reference the working directory.
+		wd = t.FSContext().WorkingDirectory()
+		rel = wd
+	} else {
+		// Need to extract the given FD.
+		f = t.FDMap().GetFile(dirFD)
+		if f == nil {
+			return syserror.EBADF
+		}
+		rel = f.Dirent
+		if !fs.IsDir(rel.Inode.StableAttr) {
+			// Release the reference taken by GetFile; it would otherwise
+			// be leaked on this early return.
+			f.DecRef()
+			return syserror.ENOTDIR
+		}
+	}
+
+	// Grab the root (always required.)
+	root := t.FSContext().RootDirectory()
+
+	// Lookup the node.
+	if resolve {
+		d, err = t.MountNamespace().FindInode(t, root, rel, path, linux.MaxSymlinkTraversals)
+	} else {
+		d, err = t.MountNamespace().FindLink(t, root, rel, path, linux.MaxSymlinkTraversals)
+	}
+	// The references needed only for the lookup are dropped here.
+	// NOTE(review): root is still passed to fn below after this DecRef;
+	// presumably the FSContext keeps it alive. Confirm.
+	root.DecRef()
+	if wd != nil {
+		wd.DecRef()
+	}
+	if f != nil {
+		f.DecRef()
+	}
+	if err != nil {
+		return err
+	}
+
+	err = fn(root, d)
+	d.DecRef()
+	return err
+}
+
+// copyInPath copies a path in.
+//
+// The string at addr (at most PathMax bytes) is read from user memory. An
+// empty path is rejected with ENOENT unless allowEmpty is set. Trailing
+// slashes are trimmed from the result, and dirPath reports what
+// fs.TrimTrailingSlashes returned for the original string.
+func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) {
+	raw, err := t.CopyInString(addr, syscall.PathMax)
+	if err != nil {
+		return "", false, err
+	}
+	if !allowEmpty && raw == "" {
+		return "", false, syserror.ENOENT
+	}
+
+	// A path ending in "/" requires extra checks that differ between
+	// callers, so that fact is handed back rather than acted upon here.
+	path, dirPath = fs.TrimTrailingSlashes(raw)
+	return path, dirPath, nil
+}
+
+// openAt opens the existing file named by the path at addr (relative to
+// dirFD) with the given open flags and returns a new file descriptor for it.
+func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+
+	open := func(root *fs.Dirent, d *fs.Dirent) error {
+		// Check a few things about the filesystem before trying to get the
+		// file reference.
+		//
+		// It's required that Check does not try to open files that aren't
+		// backed by this dirent (e.g. pipes and sockets) because this would
+		// result in opening these files an extra time just to check
+		// permissions.
+		if permErr := d.Inode.CheckPermission(t, flagsToPermissions(flags)); permErr != nil {
+			return permErr
+		}
+
+		fileFlags := linuxToFlags(flags)
+		if !fs.IsDir(d.Inode.StableAttr) {
+			// O_DIRECTORY or a path with a trailing slash demands a
+			// directory.
+			if fileFlags.Directory || dirPath {
+				return syserror.ENOTDIR
+			}
+		} else if fileFlags.Write {
+			// Don't allow directories to be opened writable.
+			return syserror.EISDIR
+		}
+
+		file, err := d.Inode.GetFile(t, d, fileFlags)
+		if err != nil {
+			return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+		}
+		defer file.DecRef()
+
+		// Success.
+		newFD, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{
+			CloseOnExec: flags&syscall.O_CLOEXEC != 0,
+		}, t.ThreadGroup().Limits())
+		if err != nil {
+			return err
+		}
+
+		// Set return result in frame.
+		fd = uintptr(newFD)
+
+		// Generate notification for opened file.
+		d.InotifyEvent(linux.IN_OPEN, 0)
+		return nil
+	}
+
+	err = fileOpOn(t, dirFD, path, true /* resolve */, open)
+	return fd, err // Use result in frame.
+}
+
+// mknodAt creates the filesystem node named by the path at addr (relative to
+// dirFD) with the file type and permissions encoded in mode. Only regular
+// files and FIFOs can be created; paths with a trailing slash are rejected
+// with ENOENT.
+func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	switch {
+	case err != nil:
+		return err
+	case dirPath:
+		return syserror.ENOENT
+	}
+
+	create := func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Do we have the appropriate permissions on the parent?
+		needed := fs.PermMask{Write: true, Execute: true}
+		if err := d.Inode.CheckPermission(t, needed); err != nil {
+			return err
+		}
+
+		// Attempt a creation, with the umask applied to the mode.
+		perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+
+		switch mode.FileType() {
+		case 0, linux.ModeRegular:
+			// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+			//
+			// We are not going to return the file, so the actual
+			// flags used don't matter, but they cannot be empty or
+			// Create will complain.
+			file, err := d.Create(t, root, name, fs.FileFlags{Read: true, Write: true}, perms)
+			if err != nil {
+				return err
+			}
+			file.DecRef()
+			return nil
+
+		case linux.ModeNamedPipe:
+			return d.CreateFifo(t, root, name, perms)
+
+		case linux.ModeSocket:
+			// While it is possible to create a unix domain socket file on
+			// linux using mknod(2), in practice this is pretty useless
+			// from an application. Linux internally uses mknod() to create
+			// the socket node during bind(2), but we implement bind(2)
+			// independently. If an application explicitly creates a socket
+			// node using mknod(), you can't seem to bind() or connect() to
+			// the resulting socket.
+			//
+			// Instead of emulating this seemingly useless behaviour, we'll
+			// indicate that the filesystem doesn't support the creation of
+			// sockets.
+			return syserror.EOPNOTSUPP
+
+		case linux.ModeCharacterDevice, linux.ModeBlockDevice:
+			// TODO: We don't support creating block or character
+			// devices at the moment.
+			//
+			// When we start supporting block and character devices, we'll
+			// need to check for CAP_MKNOD here.
+			return syserror.EPERM
+		}
+
+		// "EINVAL - mode requested creation of something other than a
+		// regular file, device special file, FIFO or socket." - mknod(2)
+		return syserror.EINVAL
+	}
+
+	return fileOpAt(t, dirFD, path, create)
+}
+
+// Mknod implements the linux syscall mknod(2).
+//
+// The path is interpreted relative to the current working directory.
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+	// The dev argument is not needed until creation of device nodes is
+	// supported.
+	_ = args[2].Uint() // dev
+
+	err := mknodAt(t, linux.AT_FDCWD, pathAddr, mode)
+	return 0, nil, err
+}
+
+// Mknodat implements the linux syscall mknodat(2).
+//
+// The path is interpreted relative to the directory file descriptor given
+// as the first argument.
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	pathAddr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+	// The dev argument is not needed until creation of device nodes is
+	// supported.
+	_ = args[3].Uint() // dev
+
+	err := mknodAt(t, dirFD, pathAddr, mode)
+	return 0, nil, err
+}
+
+// createAt opens the file named by the path at addr (relative to dirFD),
+// creating it with the given mode (minus the umask) when the lookup fails,
+// and returns a new file descriptor for it. A path with a trailing slash is
+// rejected with ENOENT.
+func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+	if dirPath {
+		return 0, syserror.ENOENT
+	}
+
+	// d is the directory containing the final path component, name.
+	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does this file exist already?
+		targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals)
+		var newFile *fs.File
+		switch err {
+		case nil:
+			// The file existed.
+			defer targetDirent.DecRef()
+
+			// Check if we wanted to create.
+			if flags&syscall.O_EXCL != 0 {
+				return syserror.EEXIST
+			}
+
+			// Like sys_open, check for a few things about the
+			// filesystem before trying to get a reference to the
+			// fs.File. The same constraints on Check apply.
+			if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+				return err
+			}
+
+			// Should we truncate the file?
+			if flags&syscall.O_TRUNC != 0 {
+				if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil {
+					return err
+				}
+			}
+
+			// Create a new fs.File.
+			newFile, err = targetDirent.Inode.GetFile(t, targetDirent, linuxToFlags(flags))
+			if err != nil {
+				return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+			}
+			defer newFile.DecRef()
+		case syserror.EACCES:
+			// Permission denied while walking to the file.
+			return err
+		default:
+			// Any other lookup error is treated as "file does not exist".
+			//
+			// Do we have write permissions on the parent?
+			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+
+			// Attempt a creation.
+			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+			newFile, err = d.Create(t, root, name, linuxToFlags(flags), perms)
+			if err != nil {
+				// No luck, bail.
+				return err
+			}
+			defer newFile.DecRef()
+			// No DecRef is deferred for targetDirent here: it is taken
+			// from newFile rather than returned by a lookup.
+			targetDirent = newFile.Dirent
+		}
+
+		// Success.
+		fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0}
+		newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits())
+		if err != nil {
+			return err
+		}
+
+		// Set result in frame.
+		fd = uintptr(newFD)
+
+		// Queue the open inotify event. The creation event is
+		// automatically queued when the dirent is targetDirent. The
+		// open events are implemented at the syscall layer so we need
+		// to manually queue one here.
+		targetDirent.InotifyEvent(linux.IN_OPEN, 0)
+
+		return nil
+	})
+	return fd, err // Use result in frame.
+}
+
+// Open implements linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := uint(args[1].Uint())
+	if flags&syscall.O_CREAT != 0 {
+		mode := linux.FileMode(args[2].ModeT())
+		n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode)
+		return n, nil, err
+	}
+	n, err := openAt(t, linux.AT_FDCWD, addr, flags)
+	return n, nil, err
+}
+
+// Openat implements linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	flags := uint(args[2].Uint())
+	if flags&syscall.O_CREAT != 0 {
+		mode := linux.FileMode(args[3].ModeT())
+		n, err := createAt(t, dirFD, addr, flags, mode)
+		return n, nil, err
+	}
+	n, err := openAt(t, dirFD, addr, flags)
+	return n, nil, err
+}
+
+// Creat implements linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+	n, err := createAt(t, linux.AT_FDCWD, addr, syscall.O_WRONLY|syscall.O_TRUNC, mode)
+	return n, nil, err
+}
+
+// accessContext is a context that overrides the credentials used, but
+// otherwise carries the same values as the embedded context.
+//
+// accessContext should only be used for access(2).
+type accessContext struct {
+	// Context is the wrapped context; Value falls back to it for every key
+	// other than auth.CtxCredentials.
+	context.Context
+	// creds are the credentials reported for the auth.CtxCredentials key.
+	creds auth.Credentials
+}
+
+// Value implements context.Context.
+//
+// The credentials key is answered with the overriding credentials; every
+// other key is forwarded to the embedded context.
+func (ac accessContext) Value(key interface{}) interface{} {
+	if key == auth.CtxCredentials {
+		return &ac.creds
+	}
+	return ac.Context.Value(key)
+}
+
+// accessAt checks whether the task may access the file named by the path at
+// addr (relative to dirFD) in the ways requested by mode, a combination of
+// the R_OK (4), W_OK (2) and X_OK (1) bits. Any other bit yields EINVAL.
+// resolve selects whether a trailing symlink is followed.
+func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error {
+	const (
+		rOK = 4
+		wOK = 2
+		xOK = 1
+	)
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Sanity check the mode.
+	if mode&^(rOK|wOK|xOK) != 0 {
+		return syserror.EINVAL
+	}
+
+	wanted := fs.PermMask{
+		Read:    mode&rOK != 0,
+		Write:   mode&wOK != 0,
+		Execute: mode&xOK != 0,
+	}
+
+	check := func(root *fs.Dirent, d *fs.Dirent) error {
+		// access(2) and faccessat(2) check permissions using real
+		// UID/GID, not effective UID/GID.
+		//
+		// "access() needs to use the real uid/gid, not the effective
+		// uid/gid. We do this by temporarily clearing all FS-related
+		// capabilities and switching the fsuid/fsgid around to the
+		// real ones." -fs/open.c:faccessat
+		creds := t.Credentials()
+		creds.EffectiveKUID = creds.RealKUID
+		creds.EffectiveKGID = creds.RealKGID
+		creds.EffectiveCaps = 0
+		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+			// Root keeps its permitted capabilities.
+			creds.EffectiveCaps = creds.PermittedCaps
+		}
+
+		ctx := &accessContext{Context: t, creds: creds}
+		return d.Inode.CheckPermission(ctx, wanted)
+	}
+
+	return fileOpOn(t, dirFD, path, resolve, check)
+}
+
+// Access implements linux syscall access(2).
+//
+// It is accessAt relative to AT_FDCWD with resolve set to true.
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := accessAt(t, linux.AT_FDCWD, args[0].Pointer(), true, args[1].ModeT())
+	return 0, nil, err
+}
+
+// Faccessat implements linux syscall faccessat(2).
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	flags := args[3].Int()
+
+	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
+}
+
+// Ioctl implements linux syscall ioctl(2).
+//
+// FIONCLEX, FIOCLEX and FIONBIO are handled here for every kind of file;
+// all other requests are delegated to the file's own Ioctl implementation.
+// EBADF is returned when the descriptor does not refer to an open file.
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Shared flags between file and socket.
+	switch request := int(args[1].Int()); request {
+	case linux.FIONCLEX, linux.FIOCLEX:
+		// FIOCLEX sets close-on-exec, FIONCLEX clears it.
+		t.FDMap().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: request == linux.FIOCLEX,
+		})
+		return 0, nil, nil
+
+	case linux.FIONBIO:
+		// The argument points at an int; non-zero enables
+		// non-blocking mode and zero disables it.
+		var enable int32
+		if _, err := t.CopyIn(args[2].Pointer(), &enable); err != nil {
+			return 0, nil, err
+		}
+		fileFlags := file.Flags()
+		fileFlags.NonBlocking = enable != 0
+		file.SetFlags(fileFlags.Settable())
+		return 0, nil, nil
+	}
+
+	// Anything else is specific to the underlying file.
+	ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args)
+	if err != nil {
+		return 0, nil, err
+	}
+	return ret, nil, nil
+}
+
+// Getcwd implements the linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+	cwd := t.FSContext().WorkingDirectory()
+	defer cwd.DecRef()
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+
+	// Get our fullname from the root and preprend unreachable if the root was
+	// unreachable from our current dirent this is the same behavior as on linux.
+	s, reachable := cwd.FullName(root)
+	if !reachable {
+		s = "(unreachable)" + s
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(s)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Copy out the path name for the node.
+	bytes, err := t.CopyOutBytes(addr, []byte(s))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Top it off with a terminator.
+	_, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
+	return uintptr(bytes + 1), nil, err
+}
+
+// Chroot implements the linux syscall chroot(2).
+//
+// The caller must hold CAP_SYS_CHROOT (EPERM otherwise). The target must be
+// a directory (ENOTDIR otherwise) on which the caller has execute
+// permission.
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, _, err := copyInPath(t, args[0].Pointer(), false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// setRoot validates the resolved dirent and installs it as the root.
+	setRoot := func(root *fs.Dirent, d *fs.Dirent) error {
+		// Only directories can become the root.
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Execute permission on the directory is required.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+			return err
+		}
+
+		t.FSContext().SetRootDirectory(d)
+		return nil
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, setRoot)
+}
+
+// Chdir implements the linux syscall chdir(2).
+//
+// The target must be a directory (ENOTDIR otherwise) on which the caller
+// has execute permission.
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	path, _, err := copyInPath(t, args[0].Pointer(), false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// setCwd validates the resolved dirent and installs it as the
+	// working directory.
+	setCwd := func(root *fs.Dirent, d *fs.Dirent) error {
+		// Only directories can become the working directory.
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Execute permission on the directory is required.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+			return err
+		}
+
+		t.FSContext().SetWorkingDirectory(d)
+		return nil
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, setCwd)
+}
+
+// Fchdir implements the linux syscall fchdir(2).
+//
+// The descriptor must refer to an open file (EBADF otherwise) whose dirent
+// is a directory (ENOTDIR otherwise) on which the caller has execute
+// permission.
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	dir := file.Dirent
+
+	// Only directories can become the working directory.
+	if !fs.IsDir(dir.Inode.StableAttr) {
+		return 0, nil, syserror.ENOTDIR
+	}
+
+	// Execute permission on the directory is required.
+	if err := dir.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+		return 0, nil, err
+	}
+
+	t.FSContext().SetWorkingDirectory(dir)
+	return 0, nil, nil
+}
+
+// Close implements linux syscall close(2).
+//
+// The descriptor is removed from the FD map first (EBADF if it was not
+// present) and the file is then flushed; the flush result is passed through
+// handleIOError.
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file, ok := t.FDMap().Remove(kdefs.FD(args[0].Int()))
+	if !ok {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	flushErr := file.Flush(t)
+	return 0, nil, handleIOError(t, false /* partial */, flushErr, syscall.EINTR, "close", file)
+}
+
+// Dup implements linux syscall dup(2).
+//
+// The new descriptor is allocated starting from 0 with default FD flags.
+// Any allocation failure is reported to the caller as EMFILE.
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	if dupFD, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits()); err == nil {
+		return uintptr(dupFD), nil, nil
+	}
+	return 0, nil, syserror.EMFILE
+}
+
+// Dup2 implements linux syscall dup2(2).
+//
+// When both descriptors are equal the call only verifies that the
+// descriptor is open. Otherwise it behaves like dup3(2) with zero flags.
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := kdefs.FD(args[0].Int())
+	newfd := kdefs.FD(args[1].Int())
+
+	if oldfd != newfd {
+		// Zero out flags arg to be used by Dup3.
+		args[2].Value = 0
+		return Dup3(t, args)
+	}
+
+	// If oldfd is a valid file descriptor, and newfd has the same value
+	// as oldfd, then dup2() does nothing, and returns newfd.
+	file := t.FDMap().GetFile(oldfd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return uintptr(newfd), nil, nil
+}
+
+// Dup3 implements linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := kdefs.FD(args[0].Int())
+	newfd := kdefs.FD(args[1].Int())
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	oldFile := t.FDMap().GetFile(oldfd)
+	if oldFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer oldFile.DecRef()
+
+	err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0}, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+//
+// Supported commands are F_DUPFD, F_DUPFD_CLOEXEC, F_GETFD, F_SETFD,
+// F_GETFL, F_SETFL, F_SETLK and F_SETLKW. Any other command yields EINVAL,
+// and a descriptor that is not open yields EBADF.
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	cmd := args[1].Int()
+
+	file, flags := t.FDMap().GetDescriptor(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case syscall.F_DUPFD, syscall.F_DUPFD_CLOEXEC:
+		// Duplicate the descriptor, searching from the FD given in the
+		// third argument.
+		from := kdefs.FD(args[2].Int())
+		fdFlags := kernel.FDFlags{CloseOnExec: cmd == syscall.F_DUPFD_CLOEXEC}
+		fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits())
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case syscall.F_GETFD:
+		// Report the descriptor flags.
+		return uintptr(fdFlagsToLinux(flags)), nil, nil
+	case syscall.F_SETFD:
+		// Only FD_CLOEXEC is taken from the supplied descriptor flags.
+		flags := args[2].Uint()
+		t.FDMap().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&syscall.FD_CLOEXEC != 0,
+		})
+	case syscall.F_GETFL:
+		// Report the file status flags.
+		return uintptr(flagsToLinux(file.Flags())), nil, nil
+	case syscall.F_SETFL:
+		// Update the settable subset of the file status flags.
+		flags := uint(args[2].Uint())
+		file.SetFlags(linuxToSettableFlags(flags))
+	case syscall.F_SETLK, syscall.F_SETLKW:
+		// In Linux the file system can choose to provide lock operations for an inode.
+		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
+		// hammer by only allowing locks on files and directories.
+		if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) {
+			return 0, nil, syserror.EBADF
+		}
+
+		// Copy in the lock request.
+		flockAddr := args[2].Pointer()
+		var flock syscall.Flock_t
+		if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+			return 0, nil, err
+		}
+
+		// Compute the lock whence.
+		var sw fs.SeekWhence
+		switch flock.Whence {
+		case 0:
+			sw = fs.SeekSet
+		case 1:
+			sw = fs.SeekCurrent
+		case 2:
+			sw = fs.SeekEnd
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Compute the lock offset.
+		var off int64
+		switch sw {
+		case fs.SeekSet:
+			off = 0
+		case fs.SeekCurrent:
+			// Note that Linux does not hold any mutexes while retrieving the file offset,
+			// see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+			off = file.Offset()
+		case fs.SeekEnd:
+			// Relative to the end of file: use the current size.
+			uattr, err := file.Dirent.Inode.UnstableAttr(t)
+			if err != nil {
+				return 0, nil, err
+			}
+			off = uattr.Size
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Compute the lock range.
+		rng, err := lock.ComputeRange(flock.Start, flock.Len, off)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		// The lock uid is that of the Task's FDMap.
+		lockUniqueID := lock.UniqueID(t.FDMap().ID())
+
+		// Dispatch on the lock type. F_SETLK passes a nil lock.Blocker
+		// (non-blocking attempt, EAGAIN on failure); F_SETLKW passes the
+		// task as the lock.Blocker (EINTR on failure).
+		switch flock.Type {
+		case syscall.F_RDLCK:
+			// A read lock requires the file to be open for reading.
+			if !file.Flags().Read {
+				return 0, nil, syserror.EBADF
+			}
+			if cmd == syscall.F_SETLK {
+				// Non-blocking lock, provide a nil lock.Blocker.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+					return 0, nil, syserror.EAGAIN
+				}
+			} else {
+				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+					return 0, nil, syserror.EINTR
+				}
+			}
+			return 0, nil, nil
+		case syscall.F_WRLCK:
+			// A write lock requires the file to be open for writing.
+			if !file.Flags().Write {
+				return 0, nil, syserror.EBADF
+			}
+			if cmd == syscall.F_SETLK {
+				// Non-blocking lock, provide a nil lock.Blocker.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+					return 0, nil, syserror.EAGAIN
+				}
+			} else {
+				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+					return 0, nil, syserror.EINTR
+				}
+			}
+			return 0, nil, nil
+		case syscall.F_UNLCK:
+			// Release the region held under this lock ID.
+			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
+			return 0, nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+	default:
+		// Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+	// Reached by the commands above that do not return a value themselves
+	// (F_SETFD and F_SETFL).
+	return 0, nil, nil
+}
+
+// Advice values accepted by Fadvise64; any other value is rejected there
+// with EINVAL.
+const (
+	_FADV_NORMAL     = 0
+	_FADV_RANDOM     = 1
+	_FADV_SEQUENTIAL = 2
+	_FADV_WILLNEED   = 3
+	_FADV_DONTNEED   = 4
+	_FADV_NOREUSE    = 5
+)
+
+// Fadvise64 implements linux syscall fadvise64(2).
+//
+// This implementation currently ignores the provided advice: it only
+// validates the arguments. EINVAL is returned for a negative offset, a
+// negative length, or an unknown advice value, and EBADF is returned when
+// fd is not an open descriptor.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	offset := args[1].Int64()
+	// The length must be read as a signed 64-bit value. Reading it as an
+	// unsigned value would make the negative check below always false and
+	// would drop the upper half of the argument.
+	length := args[2].Int64()
+	advice := args[3].Int()
+
+	if offset < 0 || length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch advice {
+	case _FADV_NORMAL, _FADV_RANDOM, _FADV_SEQUENTIAL, _FADV_WILLNEED, _FADV_DONTNEED, _FADV_NOREUSE:
+		// Known advice; nothing to do.
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Sure, whatever.
+	return 0, nil, nil
+}
+
+// mkdirAt is the shared implementation of mkdir(2) and mkdirat(2).
+//
+// It copies in the path at addr and creates the final component inside its
+// parent, found relative to dirFD. The task's umask is cleared from mode
+// before the directory is created. EEXIST is returned when the name already
+// resolves, and ENOTDIR when the parent is not a directory.
+func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	makeDir := func(root *fs.Dirent, parent *fs.Dirent, name string) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does this directory exist already?
+		existing, lookupErr := t.MountNamespace().FindInode(t, root, parent, name, linux.MaxSymlinkTraversals)
+		if lookupErr == nil {
+			// The directory existed.
+			existing.DecRef()
+			return syserror.EEXIST
+		}
+		if lookupErr == syserror.EACCES {
+			// Permission denied while walking to the directory.
+			return lookupErr
+		}
+
+		// Do we have write permissions on the parent?
+		if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			return err
+		}
+
+		// Create the directory with the umask applied.
+		masked := mode &^ linux.FileMode(t.FSContext().Umask())
+		return parent.CreateDirectory(t, root, name, fs.FilePermsFromMode(masked))
+	}
+
+	return fileOpAt(t, dirFD, path, makeDir)
+}
+
+// Mkdir implements linux syscall mkdir(2).
+//
+// It is mkdirAt relative to AT_FDCWD.
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := mkdirAt(t, linux.AT_FDCWD, args[0].Pointer(), linux.FileMode(args[1].ModeT()))
+	return 0, nil, err
+}
+
+// Mkdirat implements linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+
+	return 0, nil, mkdirAt(t, dirFD, addr, mode)
+}
+
+// rmdirAt is the shared implementation of rmdir(2) and of unlinkat(2) with
+// AT_REMOVEDIR.
+//
+// A path that is "." or ends in "/." is rejected with EINVAL. Otherwise the
+// final component is removed from its parent after fs.MayDelete allows it.
+func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Special case: rmdir rejects anything with '.' as last component.
+	// This would be handled by the busy check for the current working
+	// directory, but this is how it's done.
+	endsInDot := path == "." || (len(path) >= 2 && path[len(path)-2:] == "/.")
+	if endsInDot {
+		return syserror.EINVAL
+	}
+
+	remove := func(root *fs.Dirent, parent *fs.Dirent, name string) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		if err := fs.MayDelete(t, root, parent, name); err != nil {
+			return err
+		}
+
+		return parent.RemoveDirectory(t, root, name)
+	}
+
+	return fileOpAt(t, dirFD, path, remove)
+}
+
+// Rmdir implements linux syscall rmdir(2).
+//
+// It is rmdirAt relative to AT_FDCWD.
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := rmdirAt(t, linux.AT_FDCWD, args[0].Pointer())
+	return 0, nil, err
+}
+
+// symlinkAt is the shared implementation of symlink(2) and symlinkat(2).
+//
+// newAddr names the link to create (relative to dirFD) and oldAddr holds
+// the link contents. ENOENT is returned when the new path is flagged as a
+// directory path by copyInPath or when the link contents are empty.
+func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error {
+	linkPath, isDirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if isDirPath {
+		return syserror.ENOENT
+	}
+
+	// The link contents are copied in verbatim. This is because the
+	// symlink will include all details, including trailing slashes.
+	contents, err := t.CopyInString(oldAddr, syscall.PathMax)
+	if err != nil {
+		return err
+	}
+	if contents == "" {
+		return syserror.ENOENT
+	}
+
+	create := func(root *fs.Dirent, parent *fs.Dirent, name string) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Make sure we have write permissions on the parent directory.
+		if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			return err
+		}
+		return parent.CreateLink(t, root, contents, name)
+	}
+
+	return fileOpAt(t, dirFD, linkPath, create)
+}
+
+// Symlink implements linux syscall symlink(2).
+//
+// The first argument is the link contents and the second is the path of the
+// link to create, interpreted relative to AT_FDCWD.
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	target, linkpath := args[0].Pointer(), args[1].Pointer()
+	return 0, nil, symlinkAt(t, linux.AT_FDCWD, linkpath, target)
+}
+
+// Symlinkat implements linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	dirFD := kdefs.FD(args[1].Int())
+	newAddr := args[2].Pointer()
+
+	return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
+}
+
+// mayLinkAt determines whether t can create a hard link to target.
+//
+// EPERM is returned when t does not own target, when target is a directory,
+// or when t lacks read and write permission on target.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
+	// Technically Linux is more restrictive in 3.11.10 (requires CAP_FOWNER in
+	// root user namespace); this is from the later f2ca379642d7 "namei: permit
+	// linking with CAP_FOWNER in userns".
+	if !target.CheckOwnership(t) {
+		return syserror.EPERM
+	}
+
+	// Directories may never be hard linked.
+	if fs.IsDir(target.StableAttr) {
+		return syserror.EPERM
+	}
+
+	// Any permission failure is reported as EPERM.
+	if err := target.CheckPermission(t, fs.PermMask{Read: true, Write: true}); err != nil {
+		return syserror.EPERM
+	}
+
+	return nil
+}
+
+// linkAt creates a hard link, named by newDirFD and newAddr, to the target
+// named by oldDirFD and oldAddr. If resolve is true, symlinks are followed
+// when evaluating the target. If allowEmpty is true and the old path is
+// empty, the target is the file referred to by oldDirFD itself.
+func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error {
+	oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
+	if err != nil {
+		return err
+	}
+	newPath, newIsDirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if newIsDirPath {
+		return syserror.ENOENT
+	}
+
+	// linkTo checks that target may be linked and then creates the new
+	// name for it under the parent found from newDirFD and newPath.
+	linkTo := func(target *fs.Dirent) error {
+		if err := mayLinkAt(t, target.Inode); err != nil {
+			return err
+		}
+
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Make sure we have write permissions on the parent directory.
+			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+			return newParent.CreateHardLink(t, root, target, newName)
+		})
+	}
+
+	// With an empty old path the link target is the open file itself.
+	if allowEmpty && oldPath == "" {
+		file := t.FDMap().GetFile(oldDirFD)
+		if file == nil {
+			return syserror.EBADF
+		}
+		defer file.DecRef()
+
+		return linkTo(file.Dirent)
+	}
+
+	// Resolve oldDirFD and oldAddr to a dirent.  The "resolve" argument
+	// only applies to this name.
+	return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent) error {
+		return linkTo(target)
+	})
+}
+
+// Link implements linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	newAddr := args[1].Pointer()
+
+	// man link(2):
+	// POSIX.1-2001 says that link() should dereference oldpath if it is a
+	// symbolic link. However, since kernel 2.0, Linux does not do so: if
+	// oldpath is a symbolic link, then newpath is created as a (hard) link
+	// to the same symbolic link file (i.e., newpath becomes a symbolic
+	// link to the same file that oldpath refers to).
+	resolve := false
+	return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
+}
+
+// Linkat implements linux syscall linkat(2).
+//
+// Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted in flags; any other
+// bit yields EINVAL. AT_EMPTY_PATH additionally requires
+// CAP_DAC_READ_SEARCH in the root user namespace, and yields ENOENT
+// without it.
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldDirFD := kdefs.FD(args[0].Int())
+	oldAddr := args[1].Pointer()
+	newDirFD := kdefs.FD(args[2].Int())
+	newAddr := args[3].Pointer()
+	flags := args[4].Int()
+
+	// Reject unknown flags instead of silently ignoring them.
+	if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// man linkat(2):
+	// By default, linkat(), does not dereference oldpath if it is a
+	// symbolic link (like link(2)). Since Linux 2.6.18, the flag
+	// AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
+	// dereferenced if it is a symbolic link.
+	resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
+	allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH
+
+	if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
+		return 0, nil, syserror.ENOENT
+	}
+
+	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
+}
+
+// readlinkAt is the shared implementation of readlink(2) and
+// readlinkat(2).
+//
+// It reads the link found at the path in addr (relative to dirFD, without
+// following it) and copies at most size bytes of its contents to bufAddr.
+// copied is the number of bytes copied out. ENOENT is returned when
+// copyInPath flags the path as a directory path, and an ENOLINK result
+// from Readlink is reported as EINVAL.
+func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
+	path, isDirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+	if isDirPath {
+		return 0, syserror.ENOENT
+	}
+
+	readLink := func(root *fs.Dirent, d *fs.Dirent) error {
+		// Check for Read permission.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
+			return err
+		}
+
+		contents, err := d.Inode.Readlink(t)
+		switch {
+		case err == syserror.ENOLINK:
+			return syserror.EINVAL
+		case err != nil:
+			return err
+		}
+
+		// Truncate the contents to the size of the user buffer.
+		out := []byte(contents)
+		if size < uint(len(out)) {
+			out = out[:size]
+		}
+
+		n, err := t.CopyOutBytes(bufAddr, out)
+
+		// Record the byte count in the enclosing function's result.
+		copied = uintptr(n)
+
+		return err
+	}
+
+	err = fileOpOn(t, dirFD, path, false /* resolve */, readLink)
+	return copied, err
+}
+
+// Readlink implements linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
+	return n, nil, err
+}
+
+// Readlinkat implements linux syscall readlinkat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
+	return n, nil, err
+}
+
+// unlinkAt is the shared implementation of unlink(2) and of unlinkat(2)
+// without AT_REMOVEDIR.
+//
+// ENOENT is returned when copyInPath flags the path as a directory path.
+// Otherwise the final component is removed from its parent after
+// fs.MayDelete allows it.
+func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+	path, isDirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if isDirPath {
+		return syserror.ENOENT
+	}
+
+	remove := func(root *fs.Dirent, parent *fs.Dirent, name string) error {
+		if !fs.IsDir(parent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		if err := fs.MayDelete(t, root, parent, name); err != nil {
+			return err
+		}
+
+		return parent.Remove(t, root, name)
+	}
+
+	return fileOpAt(t, dirFD, path, remove)
+}
+
+// Unlink implements linux syscall unlink(2).
+//
+// It is unlinkAt relative to AT_FDCWD.
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := unlinkAt(t, linux.AT_FDCWD, args[0].Pointer())
+	return 0, nil, err
+}
+
+// Unlinkat implements linux syscall unlinkat(2).
+//
+// AT_REMOVEDIR is the only accepted flag; any other bit yields EINVAL. With
+// AT_REMOVEDIR the call behaves like rmdir(2), otherwise like unlink(2),
+// both relative to the directory descriptor in the first argument.
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+
+	// Reject unknown flags instead of silently ignoring them.
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirAt(t, dirFD, addr)
+	}
+	return 0, nil, unlinkAt(t, dirFD, addr)
+}
+
+// Truncate implements linux syscall truncate(2).
+//
+// A negative length or a path flagged as a directory path yields EINVAL. A
+// length at or above the task's FileSize limit sends SIGXFSZ to the task
+// and yields EFBIG. The target must be a regular file (EISDIR for a
+// directory, EINVAL for anything else) on which the caller has write
+// permission.
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, isDirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+	if isDirPath {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Enforce the file size resource limit.
+	if limit := t.ThreadGroup().Limits().Get(limits.FileSize).Cur; uint64(length) >= limit {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(syscall.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	truncate := func(root *fs.Dirent, d *fs.Dirent) error {
+		switch {
+		case fs.IsDir(d.Inode.StableAttr):
+			return syserror.EISDIR
+		case !fs.IsFile(d.Inode.StableAttr):
+			return syserror.EINVAL
+		}
+
+		// Reject truncation if the access permissions do not allow truncation.
+		// This is different from the behavior of sys_ftruncate, see below.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+			return err
+		}
+
+		if err := d.Inode.Truncate(t, d, length); err != nil {
+			return err
+		}
+
+		// File length modified, generate notification.
+		d.InotifyEvent(linux.IN_MODIFY, 0)
+
+		return nil
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, truncate)
+}
+
+// Ftruncate implements linux syscall ftruncate(2).
+//
+// The descriptor must be open (EBADF otherwise), opened for writing, and
+// refer to a regular file; a negative length is rejected as well (EINVAL in
+// all three cases). A length at or above the task's FileSize limit sends
+// SIGXFSZ to the task and yields EFBIG.
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	length := args[1].Int64()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Reject truncation if the file flags do not permit this operation.
+	// This is different from truncate(2) above.
+	if !file.Flags().Write {
+		return 0, nil, syserror.EINVAL
+	}
+
+	dirent := file.Dirent
+
+	// Note that this is different from truncate(2) above, where a
+	// directory returns EISDIR.
+	if !fs.IsFile(dirent.Inode.StableAttr) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Enforce the file size resource limit.
+	if limit := t.ThreadGroup().Limits().Get(limits.FileSize).Cur; uint64(length) >= limit {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(syscall.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	if err := dirent.Inode.Truncate(t, dirent, length); err != nil {
+		return 0, nil, err
+	}
+
+	// File length modified, generate notification.
+	dirent.InotifyEvent(linux.IN_MODIFY, 0)
+
+	return 0, nil, nil
+}
+
+// Umask implements linux syscall umask(2).
+//
+// Only the low nine permission bits of the argument are installed; the
+// value returned by SwapUmask is handed back to the caller.
+func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	previous := t.FSContext().SwapUmask(args[0].ModeT() & 0777)
+	return uintptr(previous), nil, nil
+}
+
+// chown changes the ownership of the file referred to by d.
+//
+// uid and gid may be -1, in which case they will not be changed.
+//
+// EINVAL is returned when a supplied ID does not map to a valid kernel ID
+// in the task's user namespace, and EPERM when the task is not permitted to
+// make the requested change.
+func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
+	// Start from "no change" for both IDs; each is filled in below only
+	// if the caller supplied a value for it.
+	owner := fs.FileOwner{
+		UID: auth.NoID,
+		GID: auth.NoID,
+	}
+
+	uattr, err := d.Inode.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+	c := t.Credentials()
+	hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
+	// Ownership is judged against the task's effective KUID.
+	isOwner := uattr.Owner.UID == c.EffectiveKUID
+	if uid.Ok() {
+		kuid := c.UserNamespace.MapToKUID(uid)
+		// Valid UID must be supplied if UID is to be changed.
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+
+		// "Only a privileged process (CAP_CHOWN) may change the owner
+		// of a file." -chown(2)
+		//
+		// Linux also allows chown if you own the file and are
+		// explicitly not changing its UID.
+		isNoop := uattr.Owner.UID == kuid
+		if !(hasCap || (isOwner && isNoop)) {
+			return syserror.EPERM
+		}
+
+		owner.UID = kuid
+	}
+	if gid.Ok() {
+		kgid := c.UserNamespace.MapToKGID(gid)
+		// Valid GID must be supplied if GID is to be changed.
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+
+		// "The owner of a file may change the group of the file to any
+		// group of which that owner is a member. A privileged process
+		// (CAP_CHOWN) may change the group arbitrarily." -chown(2)
+		isNoop := uattr.Owner.GID == kgid
+		isMemberGroup := c.InGroup(kgid)
+		if !(hasCap || (isOwner && (isNoop || isMemberGroup))) {
+			return syserror.EPERM
+		}
+
+		owner.GID = kgid
+	}
+
+	// FIXME: This is racy; the inode's owner may have changed in
+	// the meantime. (Linux holds i_mutex while calling
+	// fs/attr.c:notify_change() => inode_operations::setattr =>
+	// inode_change_ok().)
+	if err := d.Inode.SetOwner(t, d, owner); err != nil {
+		return err
+	}
+
+	// When the owner or group are changed by an unprivileged user,
+	// chown(2) also clears the set-user-ID and set-group-ID bits, but
+	// we do not support them.
+	return nil
+}
+
+// chownAt is the shared implementation of the path-based chown syscalls.
+//
+// The path at addr is looked up relative to fd, with resolve passed to
+// fileOpOn unchanged. When the copied-in path is empty (which copyInPath
+// only permits if allowEmpty is set) the file referred to by fd itself is
+// chowned.
+func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
+	path, _, err := copyInPath(t, addr, allowEmpty)
+	if err != nil {
+		return err
+	}
+
+	if path != "" {
+		return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
+			return chown(t, d, uid, gid)
+		})
+	}
+
+	// Annoying. What's wrong with fchown?
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return chown(t, file.Dirent, uid, gid)
+}
+
+// Chown implements linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Lchown implements linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Fchown implements linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, chown(t, file.Dirent, uid, gid)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	uid := auth.UID(args[2].Uint())
+	gid := auth.GID(args[3].Uint())
+	flags := args[4].Int()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid)
+}
+
+// chmod sets the permissions of the file referred to by d from mode.
+//
+// EPERM is returned when the ownership check fails or when SetPermissions
+// reports failure. On success an IN_ATTRIB inotify event is generated.
+func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
+	inode := d.Inode
+
+	// Must own file to change mode.
+	if !inode.CheckOwnership(t) {
+		return syserror.EPERM
+	}
+
+	if ok := inode.SetPermissions(t, d, fs.FilePermsFromMode(mode)); !ok {
+		return syserror.EPERM
+	}
+
+	// File attribute changed, generate notification.
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+
+	return nil
+}
+
+// chmodAt is the shared implementation of chmod(2) and fchmodat(2).
+//
+// The path at addr is looked up relative to fd with resolve set to true,
+// and chmod is applied to the result.
+func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	apply := func(root *fs.Dirent, d *fs.Dirent) error {
+		return chmod(t, d, mode)
+	}
+	return fileOpOn(t, fd, path, true /* resolve */, apply)
+}
+
+// Chmod implements linux syscall chmod(2).
+//
+// It is chmodAt relative to AT_FDCWD.
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := chmodAt(t, linux.AT_FDCWD, args[0].Pointer(), linux.FileMode(args[1].ModeT()))
+	return 0, nil, err
+}
+
+// Fchmod implements linux syscall fchmod(2).
+//
+// The descriptor in args[0] is looked up in the task's FD map (EBADF if it is
+// absent) and chmod is applied to its dirent with the mode from args[1].
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	target := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if target == nil {
+		return 0, nil, syserror.EBADF
+	}
+	// Release the file reference once the call completes.
+	defer target.DecRef()
+
+	newMode := linux.FileMode(args[1].ModeT())
+	return 0, nil, chmod(t, target.Dirent, newMode)
+}
+
+// Fchmodat implements linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+
+	return 0, nil, chmodAt(t, fd, addr, mode)
+}
+
+// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime
+// to the system time.
+func defaultSetToSystemTimeSpec() fs.TimeSpec {
+	return fs.TimeSpec{
+		ATimeSetSystemTime: true,
+		MTimeSetSystemTime: true,
+	}
+}
+
+// utimes is the shared implementation of the utime(2) family of syscalls.
+//
+// It applies ts to the file named by the path at addr, looked up relative to
+// dirFD. If addr is null and dirFD is not AT_FDCWD, the file referred to by
+// dirFD itself is updated; in that mode resolve must be true, otherwise EINVAL
+// is returned, and an unknown dirFD yields EBADF. resolve controls whether the
+// final path component is resolved by fileOpOn.
+//
+// Permission rules for a task that fails the inode's ownership check:
+//   - Anything other than setting both timestamps to the current system time
+//     is refused with EPERM.
+//   - Setting both timestamps to the current system time requires write
+//     permission on the inode; the CheckPermission error is returned as is.
+func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
+	setTimestamp := func(root *fs.Dirent, d *fs.Dirent) error {
+		// Does the task own the file?
+		if !d.Inode.CheckOwnership(t) {
+			// Only "set both timestamps to now" is available to non-owners.
+			// Setting a specific time, or touching just one of the two
+			// timestamps, requires ownership (see utimensat(2), EPERM).
+			if !ts.ATimeSetSystemTime || !ts.MTimeSetSystemTime {
+				return syserror.EPERM
+			}
+
+			// Setting both to the current system time needs write access.
+			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+				return err
+			}
+		}
+
+		return d.Inode.SetTimestamps(t, d, ts)
+	}
+
+	// From utimes.c:
+	// "If filename is NULL and dfd refers to an open file, then operate on
+	// the file.  Otherwise look up filename, possibly using dfd as a
+	// starting point."
+	if addr == 0 && dirFD != linux.AT_FDCWD {
+		if !resolve {
+			// Linux returns EINVAL in this case. See utimes.c.
+			return syserror.EINVAL
+		}
+		f := t.FDMap().GetFile(dirFD)
+		if f == nil {
+			return syserror.EBADF
+		}
+		defer f.DecRef()
+
+		root := t.FSContext().RootDirectory()
+		defer root.DecRef()
+
+		return setTimestamp(root, f.Dirent)
+	}
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	return fileOpOn(t, dirFD, path, resolve, setTimestamp)
+}
+
+// Utime implements linux syscall utime(2).
+//
+// args[0] is the path address and args[1] the address of a utimbuf. A null
+// utimbuf address means both timestamps are set to the current system time;
+// otherwise the access and modification times are taken, in whole seconds,
+// from the copied-in utimbuf.
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, bufAddr := args[0].Pointer(), args[1].Pointer()
+
+	if bufAddr == 0 {
+		// No utimbuf supplied: fall back to the current system time.
+		return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, defaultSetToSystemTimeSpec(), true)
+	}
+
+	var buf syscall.Utimbuf
+	if _, err := t.CopyIn(bufAddr, &buf); err != nil {
+		return 0, nil, err
+	}
+	ts := fs.TimeSpec{
+		ATime: ktime.FromSeconds(buf.Actime),
+		MTime: ktime.FromSeconds(buf.Modtime),
+	}
+	return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, ts, true)
+}
+
+// Utimes implements linux syscall utimes(2).
+//
+// args[0] is the path address and args[1] the address of a two-element timeval
+// array (access time, then modification time). A null array address means
+// both timestamps are set to the current system time. A timeval whose Usec is
+// negative or not below 10^6 is rejected with EINVAL, matching the check
+// Futimesat performs on the same structure.
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	filenameAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		// The microsecond field must describe a fraction of one second.
+		if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
+			times[1].Usec >= 1e6 || times[1].Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		ts = fs.TimeSpec{
+			ATime: ktime.FromTimeval(times[0]),
+			MTime: ktime.FromTimeval(times[1]),
+		}
+	}
+	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// timespecIsValid checks that the timespec is valid for use in utimensat.
+//
+// Nsec must be one of the special values UTIME_OMIT or UTIME_NOW, or lie in
+// the range [0, 10^9). Negative values are rejected.
+func timespecIsValid(ts linux.Timespec) bool {
+	if ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW {
+		return true
+	}
+	return ts.Nsec >= 0 && ts.Nsec < 1e9
+}
+
+// Utimensat implements linux syscall utimensat(2).
+//
+// args are (dirFD, pathname address, times address, flags). A null times
+// address sets both timestamps to the current system time. Otherwise times
+// points to two timespecs (access time, then modification time) whose Nsec
+// may be UTIME_NOW (use the system time for that timestamp) or UTIME_OMIT
+// (leave that timestamp alone). An invalid Nsec yields EINVAL, and if both
+// are UTIME_OMIT the call succeeds without doing anything. The final path
+// component is resolved unless flags contains AT_SYMLINK_NOFOLLOW.
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	pathnameAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timespec
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		atime, mtime := times[0], times[1]
+		if !timespecIsValid(atime) || !timespecIsValid(mtime) {
+			return 0, nil, syserror.EINVAL
+		}
+
+		// If both are UTIME_OMIT, this is a noop.
+		if atime.Nsec == linux.UTIME_OMIT && mtime.Nsec == linux.UTIME_OMIT {
+			return 0, nil, nil
+		}
+
+		// Each timestamp's flags come from its own timespec: the access
+		// time from times[0], the modification time from times[1].
+		ts = fs.TimeSpec{
+			ATime:              ktime.FromTimespec(atime),
+			ATimeOmit:          atime.Nsec == linux.UTIME_OMIT,
+			ATimeSetSystemTime: atime.Nsec == linux.UTIME_NOW,
+			MTime:              ktime.FromTimespec(mtime),
+			MTimeOmit:          mtime.Nsec == linux.UTIME_OMIT,
+			MTimeSetSystemTime: mtime.Nsec == linux.UTIME_NOW,
+		}
+	}
+	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
+}
+
+// Futimesat implements linux syscall futimesat(2).
+//
+// args are (dirFD, pathname address, times address). A null times address sets
+// both timestamps to the current system time; otherwise times points to two
+// timevals (access time, then modification time). A Usec that is negative or
+// not below 10^6 yields EINVAL.
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		dirFD    = kdefs.FD(args[0].Int())
+		pathAddr = args[1].Pointer()
+		tvAddr   = args[2].Pointer()
+	)
+
+	if tvAddr == 0 {
+		// No times supplied: fall back to the current system time.
+		return 0, nil, utimes(t, dirFD, pathAddr, defaultSetToSystemTimeSpec(), true)
+	}
+
+	var tvs [2]linux.Timeval
+	if _, err := t.CopyIn(tvAddr, &tvs); err != nil {
+		return 0, nil, err
+	}
+	// Both microsecond fields must describe a fraction of one second.
+	for _, tv := range tvs {
+		if tv.Usec < 0 || tv.Usec >= 1e6 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	ts := fs.TimeSpec{
+		ATime: ktime.FromTimeval(tvs[0]),
+		MTime: ktime.FromTimeval(tvs[1]),
+	}
+	return 0, nil, utimes(t, dirFD, pathAddr, ts, true)
+}
+
+// renameAt is the shared implementation of rename(2) and renameat(2).
+//
+// It copies in the new path first and then the old path (neither may be
+// empty), locates the parent dirent and final name of each via fileOpAt
+// (old relative to oldDirFD, new relative to newDirFD), and calls fs.Rename.
+//
+// ENOTDIR is returned if either parent is not a directory, and EBUSY if either
+// path is exactly the root (parent is root and the name is ".").
+func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error {
+	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// The new-path lookup is nested inside the old-path callback so that both
+	// parents are available when fs.Rename is called.
+	return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string) error {
+		if !fs.IsDir(oldParent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Root cannot be renamed to anything.
+		//
+		// TODO: This catches the case when the rename
+		// argument is exactly "/", but we should return EBUSY when
+		// renaming any mount point, or when the argument is not
+		// exactly "/" but still resolves to the root, like "/.." or
+		// "/bin/..".
+		if oldParent == root && oldName == "." {
+			return syscall.EBUSY
+		}
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Nothing can be renamed to root.
+			//
+			// TODO: Same as above.
+			if newParent == root && newName == "." {
+				return syscall.EBUSY
+			}
+			return fs.Rename(t, root, oldParent, oldName, newParent, newName)
+		})
+	})
+}
+
+// Rename implements linux syscall rename(2).
+//
+// args[0] and args[1] are the addresses of the old and new paths; both are
+// looked up relative to the working directory (AT_FDCWD).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr, newAddr := args[0].Pointer(), args[1].Pointer()
+	err := renameAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr)
+	return 0, nil, err
+}
+
+// Renameat implements linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldDirFD := kdefs.FD(args[0].Int())
+	oldPathAddr := args[1].Pointer()
+	newDirFD := kdefs.FD(args[2].Int())
+	newPathAddr := args[3].Pointer()
+	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
+}
+
+// Fallocate implements linux system call fallocate(2).
+//
+// No allocation is actually performed; the handler only reproduces the
+// expected error codes: EBADF for an unknown descriptor, EINVAL for a negative
+// offset or a non-positive length, and EOPNOTSUPP in every other case. The
+// mode argument (args[1]) is not examined.
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// The descriptor check above takes precedence over range validation.
+	if off, size := args[2].Int64(), args[3].Int64(); off < 0 || size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, syserror.EOPNOTSUPP
+}
+
+// Flock implements linux syscall flock(2).
+//
+// args[0] is the descriptor and args[1] the operation: one of LOCK_EX, LOCK_SH
+// or LOCK_UN, optionally combined with LOCK_NB. Errors: EBADF for an unknown
+// descriptor, EINVAL for any other operation, EWOULDBLOCK when a non-blocking
+// request cannot take the lock, and EINTR when a blocking request returns
+// without the lock.
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	file := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if file == nil {
+		// flock(2): EBADF fd is not an open file descriptor.
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Split the request into the lock operation proper and the LOCK_NB modifier.
+	operation := args[1].Int()
+	nonblocking := operation&linux.LOCK_NB != 0
+	operation &^= linux.LOCK_NB
+
+	// flock(2):
+	// Locks created by flock() are associated with an open file table entry.
+	// This means that duplicate file descriptors (created by, for example,
+	// fork(2) or dup(2)) refer to the same lock, and this lock may be modified
+	// or released using any of these descriptors. Furthermore, the lock is
+	// released either by an explicit LOCK_UN operation on any of these
+	// duplicate descriptors, or when all such descriptors have been closed.
+	//
+	// If a process uses open(2) (or similar) to obtain more than one
+	// descriptor for the same file, these descriptors are treated
+	// independently by flock(). An attempt to lock the file using one of these
+	// file descriptors may be denied by a lock that the calling process has
+	// already placed via another descriptor.
+	//
+	// The File UniqueID serves as the lock UniqueID because it must reference
+	// the same lock across dup(2) and fork(2).
+	lockUniqueID := lock.UniqueID(file.UniqueID)
+
+	// A BSD style lock spans the entire file.
+	rng := lock.LockRange{
+		Start: 0,
+		End:   lock.LockEOF,
+	}
+
+	// Non-blocking requests pass a nil lock.Blocker; blocking requests pass the
+	// task itself, which satisfies the lock.Blocker interface.
+	switch {
+	case operation == linux.LOCK_EX && nonblocking:
+		if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+			return 0, nil, syserror.EWOULDBLOCK
+		}
+	case operation == linux.LOCK_EX:
+		if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+			return 0, nil, syserror.EINTR
+		}
+	case operation == linux.LOCK_SH && nonblocking:
+		if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+			return 0, nil, syserror.EWOULDBLOCK
+		}
+	case operation == linux.LOCK_SH:
+		if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+			return 0, nil, syserror.EINTR
+		}
+	case operation == linux.LOCK_UN:
+		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng)
+	default:
+		// flock(2): EINVAL operation is invalid.
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, nil
+}
+
+// Sendfile implements linux system call sendfile(2).
+//
+// args are (outFD, inFD, offset address, count). Up to count bytes are copied
+// from inFD to outFD and the number of bytes copied is returned.
+//
+// Validation, in order:
+//   - EINVAL if count is negative once converted to int64.
+//   - EBADF if either descriptor is unknown.
+//   - EBADF if outFD is not open for writing; EINVAL if it has Append set.
+//   - EINVAL if inFD is not a regular file; EBADF if it is not open for reading.
+//
+// If the offset address is non-null, the starting offset is copied in from
+// user memory, the data is read from that offset, and the offset plus the
+// number of bytes copied is written back to user memory. Otherwise the data is
+// read through a plain FileReader limited to count bytes.
+func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	outFD := kdefs.FD(args[0].Int())
+	inFD := kdefs.FD(args[1].Int())
+	offsetAddr := args[2].Pointer()
+	count := int64(args[3].SizeT())
+
+	// Don't send a negative number of bytes.
+	if count < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get files.
+	outFile := t.FDMap().GetFile(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	inFile := t.FDMap().GetFile(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+
+	// Verify that the outfile is writable.
+	outFlags := outFile.Flags()
+	if !outFlags.Write {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Verify that the outfile Append flag is not set.
+	if outFlags.Append {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Verify that we have a regular infile.
+	// http://elixir.free-electrons.com/linux/latest/source/fs/splice.c#L933
+	if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Verify that the infile is readable.
+	if !inFile.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Setup for sending data.
+	var offset uint64
+	var n int64
+	var err error
+	w := &fs.FileWriter{t, outFile}
+	hasOffset := offsetAddr != 0
+	// If we have a provided offset.
+	if hasOffset {
+		// Copy in the offset.
+		if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+			return 0, nil, err
+		}
+		// Send data from the section [offset, offset+count) of the infile.
+		r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), count)
+		n, err = io.Copy(w, r)
+		// Copy out the new offset. This happens even if the copy reported an
+		// error; a failure to copy out takes precedence and the byte count is
+		// then reported as 0.
+		if _, err := t.CopyOut(offsetAddr, n+int64(offset)); err != nil {
+			return 0, nil, err
+		}
+	} else {
+		// No offset was provided: send data through a sequential reader
+		// capped at count bytes.
+		r := &io.LimitedReader{R: &fs.FileReader{t, inFile}, N: count}
+		n, err = io.Copy(w, r)
+	}
+
+	// We can only pass a single file to handleIOError, so pick inFile
+	// arbitrarily.
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
new file mode 100644
index 000000000..57762d058
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -0,0 +1,319 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// futexChecker is a futex.Checker that uses a Task's MemoryManager.
+//
+// Its methods read and atomically modify 32-bit words in the task's address
+// space.
+type futexChecker struct {
+	// t is the task whose memory is accessed.
+	t *kernel.Task
+}
+
+// Check reads the 32-bit word at addr and compares it with val. It returns
+// nil when they match, syserror.EAGAIN when they differ, and the copy-in error
+// if the word cannot be read. See the Checker interface in the futex package
+// for more information.
+func (f futexChecker) Check(addr uintptr, val uint32) error {
+	buf := f.t.CopyScratchBuffer(4)
+	if _, err := f.t.CopyInBytes(usermem.Addr(addr), buf); err != nil {
+		return err
+	}
+	if usermem.ByteOrder.Uint32(buf) != val {
+		return syserror.EAGAIN
+	}
+	return nil
+}
+
+// atomicOp atomically replaces the 32-bit word at addr with op applied to its
+// current value and returns the value the word held before the update.
+//
+// The word is read once up front; the update is then retried with
+// compare-and-swap, feeding back the value each failed swap observed, until a
+// swap succeeds. op may therefore be invoked more than once.
+func (f futexChecker) atomicOp(addr uintptr, op func(uint32) uint32) (uint32, error) {
+	buf := f.t.CopyScratchBuffer(4)
+	if _, err := f.t.CopyInBytes(usermem.Addr(addr), buf); err != nil {
+		return 0, err
+	}
+
+	cur := usermem.ByteOrder.Uint32(buf)
+	mm := f.t.MemoryManager()
+	for {
+		seen, err := mm.CompareAndSwapUint32(f.t, usermem.Addr(addr), cur, op(cur), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		if err != nil {
+			return 0, err
+		}
+		if seen == cur {
+			// The swap took effect; cur is the pre-update value.
+			return cur, nil
+		}
+		// Someone else changed the word; retry against what we saw.
+		cur = seen
+	}
+}
+
+// Op performs an operation on addr and returns a result based on the operation.
+//
+// opIn is decoded as four bit fields:
+//
+//	bits 28-31: operation (FUTEX_OP_*), optionally with FUTEX_OP_OPARG_SHIFT
+//	bits 24-27: comparison (FUTEX_OP_CMP_*)
+//	bits 12-23: operation argument
+//	bits  0-11: comparison argument
+//
+// The operation is applied atomically to the 32-bit word at addr, and the
+// value the word held beforehand is compared against the comparison argument.
+// The boolean result is the outcome of that comparison. ENOSYS is returned for
+// an unknown operation or comparison; memory access errors are passed through.
+//
+// NOTE(review): both arguments are treated as unsigned 12-bit values here and
+// the comparisons are unsigned; Linux appears to sign-extend them and compare
+// as signed integers — confirm whether this difference matters.
+func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) {
+	op := (opIn >> 28) & 0xf
+	cmp := (opIn >> 24) & 0xf
+	opArg := (opIn >> 12) & 0xfff
+	cmpArg := opIn & 0xfff
+
+	// With FUTEX_OP_OPARG_SHIFT set, the argument is a shift count rather
+	// than a literal value.
+	if op&linux.FUTEX_OP_OPARG_SHIFT != 0 {
+		opArg = 1 << opArg
+		op &^= linux.FUTEX_OP_OPARG_SHIFT // clear flag
+	}
+
+	var oldVal uint32
+	var err error
+	switch op {
+	case linux.FUTEX_OP_SET:
+		oldVal, err = f.t.MemoryManager().SwapUint32(f.t, usermem.Addr(addr), opArg, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+	case linux.FUTEX_OP_ADD:
+		oldVal, err = f.atomicOp(addr, func(a uint32) uint32 {
+			return a + opArg
+		})
+	case linux.FUTEX_OP_OR:
+		oldVal, err = f.atomicOp(addr, func(a uint32) uint32 {
+			return a | opArg
+		})
+	case linux.FUTEX_OP_ANDN:
+		oldVal, err = f.atomicOp(addr, func(a uint32) uint32 {
+			return a & ^opArg
+		})
+	case linux.FUTEX_OP_XOR:
+		oldVal, err = f.atomicOp(addr, func(a uint32) uint32 {
+			return a ^ opArg
+		})
+	default:
+		return false, syserror.ENOSYS
+	}
+	if err != nil {
+		return false, err
+	}
+
+	// Compare the pre-operation value against the comparison argument.
+	switch cmp {
+	case linux.FUTEX_OP_CMP_EQ:
+		return oldVal == cmpArg, nil
+	case linux.FUTEX_OP_CMP_NE:
+		return oldVal != cmpArg, nil
+	case linux.FUTEX_OP_CMP_LT:
+		return oldVal < cmpArg, nil
+	case linux.FUTEX_OP_CMP_LE:
+		return oldVal <= cmpArg, nil
+	case linux.FUTEX_OP_CMP_GT:
+		return oldVal > cmpArg, nil
+	case linux.FUTEX_OP_CMP_GE:
+		return oldVal >= cmpArg, nil
+	default:
+		return false, syserror.ENOSYS
+	}
+}
+
+// futexWaitRestartBlock encapsulates the state required to restart futex(2)
+// via restart_syscall(2).
+//
+// It is created by futexWaitDuration when a wait with a relative timeout is
+// interrupted.
+type futexWaitRestartBlock struct {
+	// duration is the timeout still remaining when the wait was interrupted.
+	duration time.Duration
+
+	// addr stored as uint64 since uintptr is not save-able.
+	addr uint64
+
+	// val is the value the futex word is expected to hold.
+	val uint32
+	// mask is the wait bitmask passed to WaitPrepare.
+	mask uint32
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return futexWaitDuration(t, f.duration, false, uintptr(f.addr), f.val, f.mask)
+}
+
+// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
+// complete.
+//
+// The wait blocks forever if forever is true, otherwise it blocks until ts.
+//
+// If blocking is interrupted, the syscall is restarted with the original
+// arguments.
+func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr uintptr, val, mask uint32) (uintptr, error) {
+	w := t.FutexWaiter()
+	err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask)
+	if err != nil {
+		return 0, err
+	}
+
+	if forever {
+		err = t.Block(w.C)
+	} else if clockRealtime {
+		notifier, tchan := ktime.NewChannelNotifier()
+		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
+		timer.Swap(ktime.Setting{
+			Enabled: true,
+			Next:    ktime.FromTimespec(ts),
+		})
+		err = t.BlockWithTimer(w.C, tchan)
+		timer.Destroy()
+	} else {
+		err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
+	}
+
+	t.Futex().WaitComplete(w)
+	return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
+// complete.
+//
+// The wait blocks forever if forever is true, otherwise it blocks for
+// duration.
+//
+// If blocking is interrupted, forever determines how to restart the
+// syscall. If forever is true, the syscall is restarted with the original
+// arguments. If forever is false, duration is a relative timeout and the
+// syscall is restarted with the remaining timeout.
+func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr uintptr, val, mask uint32) (uintptr, error) {
+	w := t.FutexWaiter()
+	if err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask); err != nil {
+		return 0, err
+	}
+
+	remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
+	t.Futex().WaitComplete(w)
+
+	switch {
+	case err == nil:
+		return 0, nil
+	case err != syserror.ErrInterrupted:
+		// The wait failed for some reason other than interruption; simply
+		// forward the error.
+		return 0, err
+	case forever:
+		// Interrupted with no timeout: restart with the original arguments.
+		return 0, kernel.ERESTARTSYS
+	}
+
+	// Interrupted with a relative timeout: restart with what is left of it.
+	t.SetSyscallRestartBlock(&futexWaitRestartBlock{
+		duration: remaining,
+		addr:     uint64(addr),
+		val:      val,
+		mask:     mask,
+	})
+	return 0, kernel.ERESTART_RESTARTBLOCK
+}
+
+// Futex implements linux syscall futex(2).
+// It provides a method for a program to wait for a value at a given address to
+// change, and a method to wake up anyone waiting on a particular address.
+//
+// args are (uaddr, futex op, val, timeout-or-nreq, uaddr2, val3). The fourth
+// argument is read both as a timeout pointer (for the wait commands) and as an
+// integer count (for the requeue and wake-op commands); which reading is used
+// depends on the command. FUTEX_PRIVATE_FLAG and FUTEX_CLOCK_REALTIME are
+// stripped from the op before dispatch. Unsupported and unknown commands
+// return ENOSYS.
+func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	uaddr := args[0].Pointer()
+	futexOp := args[1].Int()
+	val := int(args[2].Int())
+	nreq := int(args[3].Int())
+	timeout := args[3].Pointer()
+	uaddr2 := args[4].Pointer()
+	val3 := args[5].Int()
+
+	addr := uintptr(uaddr)
+	naddr := uintptr(uaddr2)
+	cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
+	clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
+	mask := uint32(val3)
+
+	switch cmd {
+	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
+		// WAIT{_BITSET} wait forever if the timeout isn't passed.
+		forever := timeout == 0
+
+		var timespec linux.Timespec
+		if !forever {
+			var err error
+			timespec, err = copyTimespecIn(t, timeout)
+			if err != nil {
+				return 0, nil, err
+			}
+		}
+
+		switch cmd {
+		case linux.FUTEX_WAIT:
+			// WAIT uses a relative timeout and matches every wake bit.
+			mask = ^uint32(0)
+			var timeoutDur time.Duration
+			if !forever {
+				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
+			}
+			n, err := futexWaitDuration(t, timeoutDur, forever, addr, uint32(val), mask)
+			return n, nil, err
+
+		case linux.FUTEX_WAIT_BITSET:
+			// WAIT_BITSET uses an absolute timeout which is either
+			// CLOCK_MONOTONIC or CLOCK_REALTIME.
+			if mask == 0 {
+				return 0, nil, syserror.EINVAL
+			}
+			n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, uint32(val), mask)
+			return n, nil, err
+		default:
+			panic("unreachable")
+		}
+
+	case linux.FUTEX_WAKE:
+		// A plain WAKE is a WAKE_BITSET with every bit set.
+		mask = ^uint32(0)
+		fallthrough
+
+	case linux.FUTEX_WAKE_BITSET:
+		if mask == 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		n, err := t.Futex().Wake(addr, mask, val)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_REQUEUE:
+		n, err := t.Futex().Requeue(addr, naddr, val, nreq)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_CMP_REQUEUE:
+		// 'val3' contains the value to be checked at 'addr' and
+		// 'val' is the number of waiters that should be woken up.
+		nval := uint32(val3)
+		n, err := t.Futex().RequeueCmp(futexChecker{t}, addr, nval, naddr, val, nreq)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_WAKE_OP:
+		// 'val3' carries the encoded operation decoded by futexChecker.Op.
+		op := uint32(val3)
+		n, err := t.Futex().WakeOp(futexChecker{t}, addr, naddr, val, nreq, op)
+		return uintptr(n), nil, err
+
+	case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
+		// We don't support any priority inheritance futexes.
+		return 0, nil, syserror.ENOSYS
+
+	default:
+		// We don't even know about this command.
+		return 0, nil, syserror.ENOSYS
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
new file mode 100644
index 000000000..178714b07
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -0,0 +1,269 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"bytes"
+	"io"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/binary"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Getdents implements linux syscall getdents(2) for 64bit systems.
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	minSize := int(smallestDirent(t.Arch()))
+	if size < minSize {
+		// size is smaller than smallest possible dirent.
+		return 0, nil, syserror.EINVAL
+	}
+
+	n, err := getdents(t, fd, addr, size, (*dirent).Serialize)
+	return n, nil, err
+}
+
+// Getdents64 implements linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	minSize := int(smallestDirent64(t.Arch()))
+	if size < minSize {
+		// size is smaller than smallest possible dirent.
+		return 0, nil, syserror.EINVAL
+	}
+
+	n, err := getdents(t, fd, addr, size, (*dirent).Serialize64)
+	return n, nil, err
+}
+
+// getdents implements the core of getdents(2)/getdents64(2).
+//
+// It reads directory entries from fd and writes them, serialized by f, into at
+// most size bytes of user memory starting at addr. f is the dirent
+// serialization function chosen by the calling syscall. The number of bytes
+// written is returned; EBADF is returned for an unknown descriptor, and an
+// io.EOF from the I/O error handling is reported as zero bytes with no error.
+func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
+	dir := t.FDMap().GetFile(fd)
+	if dir == nil {
+		return 0, syserror.EBADF
+	}
+	defer dir.DecRef()
+
+	// Entries are written straight into the task's memory at addr.
+	out := &usermem.IOReadWriter{
+		Ctx:  t,
+		IO:   t.MemoryManager(),
+		Addr: addr,
+		Opts: usermem.IOOpts{
+			AddressSpaceActive: true,
+		},
+	}
+	ds := newDirentSerializer(f, out, t.Arch(), size)
+
+	readErr := dir.Readdir(t, ds)
+	err := handleIOError(t, ds.Written() > 0, readErr, kernel.ERESTARTSYS, "getdents", dir)
+	if err == io.EOF {
+		return 0, nil
+	}
+	if err != nil {
+		return 0, err
+	}
+
+	dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0)
+	return uintptr(ds.Written()), nil
+}
+
+// oldDirentHdr is a fixed sized header matching the fixed size
+// fields found in the old linux dirent struct.
+//
+// It is marshalled as is when serializing, so the field order and widths are
+// part of the output layout.
+type oldDirentHdr struct {
+	// Ino is the inode ID of the entry.
+	Ino uint64
+	// Off is the directory offset assigned to the entry.
+	Off uint64
+	// Reclen is the padded length of the whole record, as computed by padRec.
+	Reclen uint16
+}
+
+// direntHdr is a fixed sized header matching the fixed size
+// fields found in the new linux dirent struct.
+//
+// It is the old header followed by a one-byte type field.
+type direntHdr struct {
+	// OldHdr holds the fields shared with the old dirent format.
+	OldHdr oldDirentHdr
+	// Typ is the DT_* file type of the entry.
+	Typ uint8
+}
+
+// dirent contains the data pointed to by a new linux dirent struct.
+type dirent struct {
+	// Hdr is the fixed-size portion of the record.
+	Hdr direntHdr
+	// Name is the entry name followed by the zero padding added by padRec.
+	Name []byte
+}
+
+// newDirent builds a dirent for the directory entry called name.
+//
+// The inode ID and type come from attr, offset becomes the record's offset
+// field, and the name is zero-padded via padRec using width as the alignment;
+// the resulting padded length is stored as the record length.
+func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent {
+	d := new(dirent)
+	d.Hdr.OldHdr.Ino = attr.InodeID
+	d.Hdr.OldHdr.Off = offset
+	d.Hdr.Typ = toType(attr.Type)
+	d.Name = []byte(name)
+
+	// padRec extends Name with padding and reports the final record length.
+	d.Hdr.OldHdr.Reclen = d.padRec(int(width))
+	return d
+}
+
+// smallestDirent returns the size of the smallest possible dirent using
+// the old linux dirent format.
+func smallestDirent(a arch.Context) uint {
+	d := dirent{}
+	return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1
+}
+
+// smallestDirent64 returns the size of the smallest possible dirent using
+// the new linux dirent format.
+func smallestDirent64(a arch.Context) uint {
+	d := dirent{}
+	return uint(binary.Size(d.Hdr)) + a.Width()
+}
+
+// toType converts an fs.InodeType to the DT_* value stored in a linux dirent
+// typ field. Regular and special files both map to DT_REG; any type without a
+// mapping becomes DT_UNKNOWN.
+func toType(nodeType fs.InodeType) uint8 {
+	typ := uint8(syscall.DT_UNKNOWN)
+	switch nodeType {
+	case fs.RegularFile:
+		typ = syscall.DT_REG
+	case fs.SpecialFile:
+		typ = syscall.DT_REG
+	case fs.Symlink:
+		typ = syscall.DT_LNK
+	case fs.Directory:
+		typ = syscall.DT_DIR
+	case fs.Pipe:
+		typ = syscall.DT_FIFO
+	case fs.CharacterDevice:
+		typ = syscall.DT_CHR
+	case fs.BlockDevice:
+		typ = syscall.DT_BLK
+	case fs.Socket:
+		typ = syscall.DT_SOCK
+	}
+	return typ
+}
+
+// padRec appends zero bytes to the name field until the record length (header
+// plus name) is a multiple of width, which must be a power of 2, and returns
+// the padded record length. At least one padding byte is always appended,
+// since the length is rounded up from one past the unpadded size.
+func (d *dirent) padRec(width int) uint16 {
+	unpadded := int(binary.Size(d.Hdr)) + len(d.Name)
+	padded := (unpadded + width) &^ (width - 1)
+	d.Name = append(d.Name, make([]byte, padded-unpadded)...)
+	return uint16(padded)
+}
+
+// Serialize64 writes the dirent to w in the new linux dirent format: the full
+// header followed by the padded name. It returns the number of bytes written,
+// or 0 and the error if any write fails.
+func (d *dirent) Serialize64(w io.Writer) (int, error) {
+	parts := [][]byte{
+		binary.Marshal(nil, usermem.ByteOrder, d.Hdr),
+		d.Name,
+	}
+
+	total := 0
+	for _, part := range parts {
+		n, err := w.Write(part)
+		if err != nil {
+			return 0, err
+		}
+		total += n
+	}
+	return total, nil
+}
+
+// Serialize writes the dirent to w in the old linux dirent format: the old
+// header, then the padded name, then the type byte as the final byte. It
+// returns the number of bytes written, or 0 and the error if any write fails.
+func (d *dirent) Serialize(w io.Writer) (int, error) {
+	parts := [][]byte{
+		binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr),
+		d.Name,
+		{d.Hdr.Typ},
+	}
+
+	total := 0
+	for _, part := range parts {
+		n, err := w.Write(part)
+		if err != nil {
+			return 0, err
+		}
+		total += n
+	}
+	return total, nil
+}
+
+// direntSerializer serializes directory entries to an io.Writer, up to a fixed
+// number of bytes. Its CopyOut method is the callback handed to Readdir.
+type direntSerializer struct {
+	// serialize writes one dirent in the format chosen by the syscall.
+	serialize func(*dirent, io.Writer) (int, error)
+	// w is the destination for serialized entries.
+	w io.Writer
+	// width is the arch native value width.
+	width uint
+	// offset is the current dirent offset.
+	offset uint64
+	// written is the total bytes serialized.
+	written int
+	// size is the size of the buffer to serialize into.
+	size int
+}
+
+// newDirentSerializer returns a direntSerializer that formats entries with f
+// and writes at most size bytes of them to w. The record alignment is taken
+// from ac's native width; the offset and written counters start at zero.
+func newDirentSerializer(f func(d *dirent, w io.Writer) (int, error), w io.Writer, ac arch.Context, size int) *direntSerializer {
+	ds := new(direntSerializer)
+	ds.serialize = f
+	ds.w = w
+	ds.width = ac.Width()
+	ds.size = size
+	return ds
+}
+
+// CopyOut implements fs.InodeOperationsInfoSerializer.CopyOut.
+//
+// It serializes the entry (name, attr) at the next dirent offset and writes it
+// to the direntSerializer's io.Writer. io.EOF is returned when the serialized
+// entry does not fit in the space remaining. On any failure the offset
+// advance is undone, so the entry is not counted.
+func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
+	ds.offset++
+	committed := false
+	defer func() {
+		// Roll the offset back unless the entry was fully written out.
+		if !committed {
+			ds.offset--
+		}
+	}()
+
+	// Serialize into a temporary buffer first so the size is known before
+	// anything reaches the real writer.
+	var scratch bytes.Buffer
+	n, err := ds.serialize(newDirent(ds.width, name, attr, ds.offset), &scratch)
+	if err != nil {
+		return err
+	}
+
+	// Make sure enough room remains for this entry.
+	if remaining := ds.size - ds.written; n > remaining {
+		return io.EOF
+	}
+
+	if _, err := scratch.WriteTo(ds.w); err != nil {
+		return err
+	}
+
+	ds.written += n
+	committed = true
+	return nil
+}
+
+// Written reports how many bytes of serialized entries CopyOut has written so
+// far.
+func (ds *direntSerializer) Written() int {
+	total := ds.written
+	return total
+}
diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go
new file mode 100644
index 000000000..4fd0ed794
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_identity.go
@@ -0,0 +1,180 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+	// As NGROUPS_MAX in include/uapi/linux/limits.h.
+	//
+	// Setgroups rejects any requested group count larger than this value.
+	maxNGroups = 65536
+)
+
+// Getuid implements the Linux syscall getuid.
+//
+// The result is the task's real KUID mapped into the user namespace of the
+// task's credentials (via In and OrOverflow).
+func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.RealKUID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Geteuid implements the Linux syscall geteuid.
+//
+// The result is the task's effective KUID mapped into the user namespace of
+// the task's credentials (via In and OrOverflow).
+func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.EffectiveKUID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Getresuid implements the Linux syscall getresuid.
+//
+// The real, effective and saved UIDs are all computed from one snapshot of
+// the task's credentials and then copied out, in that order, to the user
+// addresses in args[0], args[1] and args[2]. The first failing copy aborts
+// the call with its error.
+func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	ns := creds.UserNamespace
+	ids := []auth.UID{
+		creds.RealKUID.In(ns).OrOverflow(),
+		creds.EffectiveKUID.In(ns).OrOverflow(),
+		creds.SavedKUID.In(ns).OrOverflow(),
+	}
+	// ids[i] is destined for the pointer held in args[i].
+	for i, id := range ids {
+		if _, err := t.CopyOut(args[i].Pointer(), id); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Getgid implements the Linux syscall getgid.
+//
+// The result is the task's real KGID mapped into the user namespace of the
+// task's credentials (via In and OrOverflow).
+func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.RealKGID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Getegid implements the Linux syscall getegid.
+//
+// The result is the task's effective KGID mapped into the user namespace of
+// the task's credentials (via In and OrOverflow).
+func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	return uintptr(creds.EffectiveKGID.In(creds.UserNamespace).OrOverflow()), nil, nil
+}
+
+// Getresgid implements the Linux syscall getresgid.
+//
+// The real, effective and saved GIDs are all computed from one snapshot of
+// the task's credentials and then copied out, in that order, to the user
+// addresses in args[0], args[1] and args[2]. The first failing copy aborts
+// the call with its error.
+func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	creds := t.Credentials()
+	ns := creds.UserNamespace
+	ids := []auth.GID{
+		creds.RealKGID.In(ns).OrOverflow(),
+		creds.EffectiveKGID.In(ns).OrOverflow(),
+		creds.SavedKGID.In(ns).OrOverflow(),
+	}
+	// ids[i] is destined for the pointer held in args[i].
+	for i, id := range ids {
+		if _, err := t.CopyOut(args[i].Pointer(), id); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Setuid implements the Linux syscall setuid.
+//
+// args[0] is the requested UID; the outcome of t.SetUID is returned as the
+// syscall error.
+func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := t.SetUID(auth.UID(args[0].Int()))
+	return 0, nil, err
+}
+
+// Setreuid implements the Linux syscall setreuid.
+//
+// args[0] and args[1] are the requested real and effective UIDs; the outcome
+// of t.SetREUID is returned as the syscall error.
+func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	real, effective := auth.UID(args[0].Int()), auth.UID(args[1].Int())
+	err := t.SetREUID(real, effective)
+	return 0, nil, err
+}
+
+// Setresuid implements the Linux syscall setresuid.
+//
+// args[0], args[1] and args[2] are the requested real, effective and saved
+// UIDs; the outcome of t.SetRESUID is returned as the syscall error.
+func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	real, effective, saved := auth.UID(args[0].Int()), auth.UID(args[1].Int()), auth.UID(args[2].Int())
+	err := t.SetRESUID(real, effective, saved)
+	return 0, nil, err
+}
+
+// Setgid implements the Linux syscall setgid.
+//
+// args[0] is the requested GID; the outcome of t.SetGID is returned as the
+// syscall error.
+func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := t.SetGID(auth.GID(args[0].Int()))
+	return 0, nil, err
+}
+
+// Setregid implements the Linux syscall setregid.
+//
+// args[0] and args[1] are the requested real and effective GIDs; the outcome
+// of t.SetREGID is returned as the syscall error.
+func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	real, effective := auth.GID(args[0].Int()), auth.GID(args[1].Int())
+	err := t.SetREGID(real, effective)
+	return 0, nil, err
+}
+
+// Setresgid implements the Linux syscall setresgid.
+//
+// args[0], args[1] and args[2] are the requested real, effective and saved
+// GIDs; the outcome of t.SetRESGID is returned as the syscall error.
+func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	real, effective, saved := auth.GID(args[0].Int()), auth.GID(args[1].Int()), auth.GID(args[2].Int())
+	err := t.SetRESGID(real, effective, saved)
+	return 0, nil, err
+}
+
+// Getgroups implements the Linux syscall getgroups.
+//
+// args[0] is the capacity (in entries) of the user array at args[1]. A
+// negative capacity, or a non-zero capacity smaller than the number of
+// supplementary groups, yields EINVAL. On success the number of supplementary
+// groups is returned.
+func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	capacity := int(args[0].Int())
+	if capacity < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	kgids := t.Credentials().ExtraKGIDs
+	switch {
+	case capacity == 0:
+		// "If size is zero, list is not modified, but the total number of
+		// supplementary group IDs for the process is returned." - getgroups(2)
+		return uintptr(len(kgids)), nil, nil
+	case capacity < len(kgids):
+		return 0, nil, syserror.EINVAL
+	}
+	// Translate every kernel GID into the task's user namespace.
+	gids := make([]auth.GID, 0, len(kgids))
+	for _, kgid := range kgids {
+		gids = append(gids, kgid.In(t.UserNamespace()).OrOverflow())
+	}
+	if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(len(gids)), nil, nil
+}
+
+// Setgroups implements the Linux syscall setgroups.
+//
+// args[0] is the number of GIDs in the user array at args[1]. Counts that are
+// negative or exceed maxNGroups yield EINVAL; a count of zero clears the
+// supplementary groups without reading user memory.
+func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	count := args[0].Int()
+	switch {
+	case count < 0, count > maxNGroups:
+		return 0, nil, syserror.EINVAL
+	case count == 0:
+		return 0, nil, t.SetExtraGIDs(nil)
+	}
+	newGIDs := make([]auth.GID, count)
+	if _, err := t.CopyIn(args[1].Pointer(), &newGIDs); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, t.SetExtraGIDs(newGIDs)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
new file mode 100644
index 000000000..725204dff
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+)
+
+// allFlags is the set of flag bits that InotifyInit1 accepts; any other bit
+// set in its flags argument results in EINVAL.
+const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC)
+
+// InotifyInit1 implements the inotify_init1() syscall.
+//
+// args[0] carries the flags; bits outside allFlags yield EINVAL. A new
+// inotify file backed by an anonymous inode is created and installed in the
+// task's FD map, and the new file descriptor is returned.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := int(args[0].Int())
+	if unknown := flags &^ allFlags; unknown != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	nonBlocking := flags&linux.IN_NONBLOCK != 0
+	closeOnExec := flags&linux.IN_CLOEXEC != 0
+
+	d := fs.NewDirent(anon.NewInode(t), "inotify")
+	file := fs.NewFile(t, d, fs.FileFlags{
+		Read:        true,
+		Write:       true,
+		NonBlocking: nonBlocking,
+	}, fs.NewInotify(t))
+	// Release the local reference once the call returns.
+	defer file.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: closeOnExec}
+	fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscall.
+//
+// It behaves as InotifyInit1 with a flags value of zero.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// Overwrite the first argument so that InotifyInit1 sees no flags.
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify looks up fd in the task's FD map and returns the inotify
+// object behind it together with the file itself. On success the file holds
+// an extra reference that the caller must drop with DecRef. EBADF is returned
+// when fd is not open, EINVAL when it does not refer to an inotify instance.
+func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) {
+	f := t.FDMap().GetFile(fd)
+	if f == nil {
+		// Invalid fd.
+		return nil, nil, syscall.EBADF
+	}
+
+	if inotify, isInotify := f.FileOperations.(*fs.Inotify); isInotify {
+		return inotify, f, nil
+	}
+
+	// Not an inotify fd: drop the reference taken by GetFile.
+	f.DecRef()
+	return nil, nil, syscall.EINVAL
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+//
+// args[0] is the inotify fd, args[1] the address of the path to watch and
+// args[2] the event mask. On success the value returned by AddWatch is the
+// syscall result; if the operation fails before a watch is added, the
+// numeric result is the original fd value alongside the error.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	pathAddr := args[1].Pointer()
+	mask := args[2].Uint()
+
+	// "EINVAL: The given event mask contains no valid events."
+	// -- inotify_add_watch(2)
+	if mask&linux.ALL_INOTIFY_BITS == 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	ino, file, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	path, _, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+	//  -- inotify(7)
+	followLinks := mask&linux.IN_DONT_FOLLOW == 0
+	// "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7)
+	dirOnly := mask&linux.IN_ONLYDIR != 0
+
+	// rv starts as the fd and is replaced by the watch descriptor once the
+	// watch has been added.
+	rv := uintptr(fd)
+	err = fileOpOn(t, linux.AT_FDCWD, path, followLinks, func(root *fs.Dirent, target *fs.Dirent) error {
+		if dirOnly && !fs.IsDir(target.Inode.StableAttr) {
+			return syscall.ENOTDIR
+		}
+		rv = uintptr(kdefs.FD(ino.AddWatch(target, mask)))
+		return nil
+	})
+	return rv, nil, err
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+//
+// args[0] is the inotify fd and args[1] the watch descriptor to remove. The
+// result of RmWatch is returned as the syscall error.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	watch := args[1].Int()
+
+	ino, file, err := fdToInotify(t, kdefs.FD(args[0].Int()))
+	if err != nil {
+		return 0, nil, err
+	}
+	// fdToInotify returned the file with an extra reference.
+	defer file.DecRef()
+
+	return 0, nil, ino.RmWatch(watch)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
new file mode 100644
index 000000000..97b51ba7c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -0,0 +1,55 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Lseek implements linux syscall lseek(2).
+//
+// args[0] is the fd, args[1] the offset and args[2] the whence value, where
+// 0, 1 and 2 select fs.SeekSet, fs.SeekCurrent and fs.SeekEnd respectively.
+// An unopened fd yields EBADF and any other whence value yields EINVAL. On
+// success the offset returned by the file's Seek is the syscall result.
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	requested := args[1].Int64()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var sw fs.SeekWhence
+	switch whence := args[2].Int(); whence {
+	case 0:
+		sw = fs.SeekSet
+	case 1:
+		sw = fs.SeekCurrent
+	case 2:
+		sw = fs.SeekEnd
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	newOffset, seekErr := file.Seek(t, sw, requested)
+	if err := handleIOError(t, false /* partialResult */, seekErr, kernel.ERESTARTSYS, "lseek", file); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newOffset), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
new file mode 100644
index 000000000..2c7d41de0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -0,0 +1,435 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"bytes"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Brk implements linux syscall brk(2).
+//
+// The error from the memory manager is deliberately discarded: "However, the
+// actual Linux system call returns the new program break on success. On
+// failure, the system call returns the current break." - brk(2)
+func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	requested := args[0].Pointer()
+	brk, _ := t.MemoryManager().Brk(t, requested)
+	return uintptr(brk), nil, nil
+}
+
+// Mmap implements linux syscall mmap(2).
+//
+// Arguments are, in order: address hint, length, protection bits, flags, fd
+// and file offset. Exactly one of MAP_PRIVATE and MAP_SHARED must be given,
+// otherwise EINVAL is returned. Unless MAP_ANONYMOUS is set, the fd must refer
+// to an open (EBADF) and readable (EACCES) file, which is then asked to
+// configure the mapping. The address returned by the memory manager is the
+// syscall result.
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()
+	flags := args[3].Int()
+	fd := kdefs.FD(args[4].Int())
+
+	isFixed := flags&linux.MAP_FIXED != 0
+	isPrivate := flags&linux.MAP_PRIVATE != 0
+	isShared := flags&linux.MAP_SHARED != 0
+	isAnon := flags&linux.MAP_ANONYMOUS != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if isPrivate == isShared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	perms := usermem.AccessType{
+		Read:    linux.PROT_READ&prot != 0,
+		Write:   linux.PROT_WRITE&prot != 0,
+		Execute: linux.PROT_EXEC&prot != 0,
+	}
+	opts := memmap.MMapOpts{
+		Length:    args[1].Uint64(),
+		Offset:    args[5].Uint64(),
+		Addr:      args[0].Pointer(),
+		Fixed:     isFixed,
+		Unmap:     isFixed,
+		Private:   isPrivate,
+		Perms:     perms,
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	// If a mapping identity ends up set on opts, drop its reference on the
+	// way out.
+	defer func() {
+		if id := opts.MappingIdentity; id != nil {
+			id.DecRef()
+		}
+	}()
+
+	if !isAnon {
+		// Convert the passed FD to a file reference.
+		file := t.FDMap().GetFile(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		fileFlags := file.Flags()
+		// mmap unconditionally requires that the FD is readable.
+		if !fileFlags.Read {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if isShared && !fileFlags.Write {
+			opts.MaxPerms.Write = false
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	mapped, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(mapped), nil, err
+}
+
+// Munmap implements linux syscall munmap(2).
+//
+// args[0] is the start address and args[1] the length of the range to unmap.
+func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mm := t.MemoryManager()
+	err := mm.MUnmap(t, args[0].Pointer(), args[1].Uint64())
+	return 0, nil, err
+}
+
+// Mremap implements linux syscall mremap(2).
+//
+// Arguments are, in order: old address, old size, new size, flags and new
+// address. Flags other than MREMAP_MAYMOVE and MREMAP_FIXED, or MREMAP_FIXED
+// without MREMAP_MAYMOVE, yield EINVAL. The address returned by the memory
+// manager is the syscall result.
+func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	oldSize := args[1].Uint64()
+	newSize := args[2].Uint64()
+	flags := args[3].Uint64()
+	newAddr := args[4].Pointer()
+
+	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	mayMove := flags&linux.MREMAP_MAYMOVE != 0
+	fixed := flags&linux.MREMAP_FIXED != 0
+
+	var opts mm.MRemapOpts
+	opts.NewAddr = newAddr
+	switch {
+	case fixed && !mayMove:
+		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
+		// specified." - mremap(2)
+		return 0, nil, syserror.EINVAL
+	case fixed:
+		opts.Move = mm.MRemapMustMove
+	case mayMove:
+		opts.Move = mm.MRemapMayMove
+	default:
+		opts.Move = mm.MRemapNoMove
+	}
+
+	remapped, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, opts)
+	return uintptr(remapped), nil, err
+}
+
+// Mprotect implements linux syscall mprotect(2).
+//
+// args[0] is the start address, args[1] the length and args[2] the protection
+// bits. PROT_READ, PROT_WRITE and PROT_EXEC select the new access type, and
+// PROT_GROWSDOWN is passed to the memory manager as a separate flag.
+func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Uint64()
+	prot := args[2].Int()
+
+	perms := usermem.AccessType{
+		Read:    linux.PROT_READ&prot != 0,
+		Write:   linux.PROT_WRITE&prot != 0,
+		Execute: linux.PROT_EXEC&prot != 0,
+	}
+	growsDown := linux.PROT_GROWSDOWN&prot != 0
+
+	return 0, nil, t.MemoryManager().MProtect(addr, length, perms, growsDown)
+}
+
+// Madvise implements linux syscall madvise(2).
+//
+// args[0] is the start address, args[1] the length and args[2] the advice.
+// Only MADV_DONTNEED has an effect (the range is decommitted). Purely
+// advisory values are accepted and ignored, values with application-visible
+// side effects yield ENOSYS, MADV_HWPOISON yields EPERM, and anything else
+// yields EINVAL.
+func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	rawLen := uint64(args[1].SizeT())
+	advice := args[2].Int()
+
+	// "The Linux implementation requires that the address addr be
+	// page-aligned, and allows length to be zero." - madvise(2)
+	if addr != addr.RoundDown() {
+		return 0, nil, syserror.EINVAL
+	}
+	if rawLen == 0 {
+		return 0, nil, nil
+	}
+	// Not explicitly stated: length need not be page-aligned.
+	rounded, ok := usermem.Addr(rawLen).RoundUp()
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	switch advice {
+	case linux.MADV_DONTNEED:
+		return 0, nil, t.MemoryManager().Decommit(addr, uint64(rounded))
+	case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE,
+		linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE,
+		linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
+		// Do nothing, we totally ignore the suggestions above.
+		return 0, nil, nil
+	case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK:
+		// These "suggestions" have application-visible side effects, so we
+		// have to indicate that we don't support them.
+		return 0, nil, syserror.ENOSYS
+	case linux.MADV_HWPOISON:
+		// Only privileged processes are allowed to poison pages.
+		return 0, nil, syserror.EPERM
+	default:
+		// If adv is not a valid value tell the caller.
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// copyOutIfNotNull copies val out to ptr via t.CopyOut, unless ptr is the
+// null address, in which case nothing is copied and (0, nil) is returned.
+func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) {
+	if ptr == 0 {
+		return 0, nil
+	}
+	return t.CopyOut(ptr, val)
+}
+
+// GetMempolicy implements the syscall get_mempolicy(2).
+//
+// Arguments are, in order: mode output pointer, nodemask output pointer,
+// maxnode, addr and flags. Both output pointers are optional; a null pointer
+// is skipped. Any failed copy is reported as EFAULT. A single NUMA node
+// (node 0) is modelled throughout.
+func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	modeAddr := args[0].Pointer()
+	nodemaskAddr := args[1].Pointer()
+	maxnode := args[2].Uint()
+	addr := args[3].Pointer()
+	flags := args[4].Uint()
+
+	wantMemsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
+	wantNode := flags&linux.MPOL_F_NODE != 0
+	wantAddr := flags&linux.MPOL_F_ADDR != 0
+
+	// TODO: Once sysfs is implemented, report a single numa node in
+	// /sys/devices/system/node.
+	if nodemaskAddr != 0 && maxnode < 1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// 'addr' provided iff MPOL_F_ADDR set.
+	if wantAddr == (addr == 0) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// No flags: report the thread's policy and nodemask.
+	if flags == 0 {
+		policy, mask := t.NumaPolicy()
+		if _, err := copyOutIfNotNull(t, modeAddr, policy); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err := copyOutIfNotNull(t, nodemaskAddr, mask); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		return 0, nil, nil
+	}
+
+	// Report all nodes available to caller.
+	if wantMemsAllowed {
+		// MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED.
+		if wantNode || wantAddr {
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Report a single numa node.
+		if _, err := copyOutIfNotNull(t, nodemaskAddr, uint32(0x1)); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		return 0, nil, nil
+	}
+
+	if wantAddr {
+		if !wantNode {
+			// Return the policy governing the memory referenced by 'addr'.
+			policy, _ := t.NumaPolicy()
+			if _, err := copyOutIfNotNull(t, modeAddr, int32(policy)); err != nil {
+				return 0, nil, syserror.EFAULT
+			}
+			return 0, nil, nil
+		}
+
+		// Return the id for the node where 'addr' resides, via 'mode'.
+		//
+		// The real get_mempolicy(2) allocates the page referenced by 'addr'
+		// by simulating a read, if it is unallocated before the call. It
+		// then returns the node the page is allocated on through the mode
+		// pointer.
+		scratch := t.CopyScratchBuffer(1)
+		if _, err := t.CopyInBytes(addr, scratch); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err := copyOutIfNotNull(t, modeAddr, int32(0)); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		return 0, nil, nil
+	}
+
+	policy, _ := t.NumaPolicy()
+	interleaved := policy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE
+	if !wantNode || !interleaved {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Policy for current thread is to interleave memory between
+	// nodes. Return the next node we'll allocate on. Since we only have a
+	// single node, this is always node 0.
+	if _, err := copyOutIfNotNull(t, modeAddr, int32(0)); err != nil {
+		return 0, nil, syserror.EFAULT
+	}
+	return 0, nil, nil
+}
+
+// allowedNodesMask returns a mask with a bit set for every node index at or
+// above the number of supported nodes (one). Despite the name, the set bits
+// are the ones a caller's nodemask must NOT contain: SetMempolicy rejects a
+// nodemask that intersects this value.
+func allowedNodesMask() uint32 {
+	const maxNodes = 1
+	const validBits = uint32(1<<maxNodes) - 1
+	return ^validBits
+}
+
+// SetMempolicy implements the syscall set_mempolicy(2).
+//
+// args[0] is the mode (optionally combined with mode flags), args[1] the
+// address of the nodemask and args[2] maxnode. The nodemask is read as a
+// single uint32; a failed read is reported as EFAULT. Invalid modes, an empty
+// nodemask with MPOL_INTERLEAVE, and nodemasks naming unsupported nodes yield
+// EINVAL. On success the mode (with flags) and nodemask are stored on the
+// task.
+func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	modeWithFlags := args[0].Int()
+	nodemaskAddr := args[1].Pointer()
+
+	if maxnode := args[2].Uint(); maxnode < 1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Can't specify multiple modes simultaneously. Must also contain a
+	// valid mode, which we check below.
+	if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS {
+		return 0, nil, syserror.EINVAL
+	}
+
+	mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
+	if mode < 0 || mode >= linux.MPOL_MAX {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var mask uint32
+	if _, err := t.CopyIn(nodemaskAddr, &mask); err != nil {
+		return 0, nil, syserror.EFAULT
+	}
+
+	switch {
+	case mode == linux.MPOL_INTERLEAVE && mask == 0:
+		// When setting MPOL_INTERLEAVE, nodemask must not be empty.
+		return 0, nil, syserror.EINVAL
+	case mask&allowedNodesMask() != 0:
+		// Invalid node specified.
+		return 0, nil, syserror.EINVAL
+	}
+
+	t.SetNumaPolicy(int32(modeWithFlags), mask)
+	return 0, nil, nil
+}
+
+// Mincore implements the syscall mincore(2).
+//
+// args[0] is the (page-aligned) start address, args[1] the length and args[2]
+// the address of the output vector. Every mapped page is reported as
+// resident: one byte with value 1 per page is written to the vector. An
+// unaligned address yields EINVAL; an overflowing or partially unmapped range
+// yields ENOMEM.
+func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	vecAddr := args[2].Pointer()
+
+	if addr.RoundDown() != addr {
+		return 0, nil, syserror.EINVAL
+	}
+	// "The length argument need not be a multiple of the page size, but since
+	// residency information is returned for whole pages, length is effectively
+	// rounded up to the next multiple of the page size." - mincore(2)
+	roundedLen, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return 0, nil, syserror.ENOMEM
+	}
+	rng, ok := addr.ToRange(uint64(roundedLen))
+	if !ok {
+		return 0, nil, syserror.ENOMEM
+	}
+
+	// Pretend that all mapped pages are "resident in core".
+	mappedBytes := t.MemoryManager().VirtualMemorySizeRange(rng)
+	// "ENOMEM: addr to addr + length contained unmapped memory."
+	if mappedBytes != uint64(roundedLen) {
+		return 0, nil, syserror.ENOMEM
+	}
+	pages := int(mappedBytes / usermem.PageSize)
+	_, err := t.CopyOut(vecAddr, bytes.Repeat([]byte{1}, pages))
+	return 0, nil, err
+}
+
+// Msync implements Linux syscall msync(2).
+//
+// args[0] is the (page-aligned) start address, args[1] the length and args[2]
+// the flags. A zero length succeeds immediately. Unknown flags, MS_SYNC
+// combined with MS_ASYNC, and MS_INVALIDATE all yield EINVAL. With MS_SYNC
+// the range is synced through the memory manager; otherwise the call only
+// verifies that the whole range is mapped, returning ENOMEM if it is not.
+func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	flags := args[2].Int()
+
+	if addr.RoundDown() != addr {
+		return 0, nil, syserror.EINVAL
+	}
+	if length == 0 {
+		return 0, nil, nil
+	}
+	roundedLen, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return 0, nil, syserror.ENOMEM
+	}
+	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
+	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
+	// permits a call to msync() that specifies neither of these flags, with
+	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
+	// msync(2)
+	if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	wantSync := flags&linux.MS_SYNC != 0
+	wantAsync := flags&linux.MS_ASYNC != 0
+	if wantSync && wantAsync {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// MS_INVALIDATE "asks to invalidate other mappings of the same file (so
+	// that they can be updated with the fresh values just written)". This is a
+	// no-op given that shared memory exists. However, MS_INVALIDATE can also
+	// be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags,
+	// and a memory lock exists for the specified address range." Given that
+	// mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since
+	// some user program could be using it for synchronization.
+	if flags&linux.MS_INVALIDATE != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if wantSync {
+		// MS_SYNC "requests an update and waits for it to complete."
+		//
+		// Sync calls fsync, the same interrupt conversion rules apply, see
+		// mm/msync.c, fsync POSIX.1-2008.
+		syncErr := t.MemoryManager().Sync(t, addr, uint64(roundedLen))
+		return 0, nil, syserror.ConvertIntr(syncErr, kernel.ERESTARTSYS)
+	}
+
+	// MS_ASYNC "specifies that an update be scheduled, but the call returns
+	// immediately". As long as dirty pages are tracked and eventually written
+	// back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC
+	// is in fact a no-op, since the kernel properly tracks dirty pages and
+	// flushes them to storage as necessary.")
+	//
+	// However: "ENOMEM: The indicated memory (or part of it) was not mapped."
+	// This applies even for MS_ASYNC.
+	rng, ok := addr.ToRange(uint64(roundedLen))
+	if !ok {
+		return 0, nil, syserror.ENOMEM
+	}
+	if t.MemoryManager().VirtualMemorySizeRange(rng) != uint64(roundedLen) {
+		return 0, nil, syserror.ENOMEM
+	}
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
new file mode 100644
index 000000000..d70b79e4f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -0,0 +1,140 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Mount implements Linux syscall mount(2).
+//
+// Arguments are, in order: source path, target path, filesystem type, flags
+// and data, all but flags being user addresses of strings. The caller needs
+// CAP_SYS_ADMIN in the mount namespace's user namespace (EPERM otherwise).
+// Unsupported operations and flags yield EINVAL, an unknown filesystem type
+// yields ENODEV, and a filesystem that disallows user mounts yields EPERM.
+// Any error from the filesystem's Mount is reported as EINVAL.
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sourceAddr := args[0].Pointer()
+	targetAddr := args[1].Pointer()
+	typeAddr := args[2].Pointer()
+	flags := args[3].Uint64()
+	dataAddr := args[4].Pointer()
+
+	fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	source, _, err := copyInPath(t, sourceAddr, true /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	target, _, err := copyInPath(t, targetAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// In Linux, a full page is always copied in regardless of null
+	// character placement, and the address is passed to each file system.
+	// Most file systems always treat this data as a string, though, and so
+	// do all of the ones we implement.
+	data, err := t.CopyInString(dataAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Ignore magic value that was required before Linux 2.4.
+	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+		flags &^= linux.MS_MGC_MSK
+	}
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+		linux.MS_UNBINDABLE | linux.MS_MOVE
+
+	// Silently allow MS_NOSUID, since we don't implement set-id bits
+	// anyway.
+	const unsupportedFlags = linux.MS_NODEV | linux.MS_NOEXEC |
+		linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+	// Linux just allows passing any flags to mount(2) - it won't fail when
+	// unknown or unsupported flags are passed. Since we don't implement
+	// everything, we fail explicitly on flags that are unimplemented.
+	if flags&(unsupportedOps|unsupportedFlags) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	filesystem, found := fs.FindFilesystem(fsType)
+	if !found {
+		return 0, nil, syserror.ENODEV
+	}
+	if !filesystem.AllowUserMount() {
+		return 0, nil, syserror.EPERM
+	}
+
+	superFlags := fs.MountSourceFlags{
+		NoAtime:  flags&linux.MS_NOATIME == linux.MS_NOATIME,
+		ReadOnly: flags&linux.MS_RDONLY == linux.MS_RDONLY,
+	}
+
+	rootInode, err := filesystem.Mount(t, source, superFlags, data)
+	if err != nil {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Attach the new root at the resolved target dirent.
+	attach := func(root *fs.Dirent, d *fs.Dirent) error {
+		return t.MountNamespace().Mount(t, d, rootInode)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, target, true /* resolve */, attach)
+}
+
+// Umount2 implements Linux syscall umount2(2).
+//
+// args[0] is the address of the target path and args[1] the flags. MNT_FORCE
+// and MNT_EXPIRE are not supported and yield EINVAL. The caller needs
+// CAP_SYS_ADMIN in the mount namespace's user namespace (EPERM otherwise).
+// UMOUNT_NOFOLLOW disables resolution of the final path component, and
+// MNT_DETACH is forwarded to the mount namespace's Unmount.
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	flags := args[1].Int()
+
+	const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, _, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	//
+	// Currently, this is always the init task's user namespace.
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+
+	followLinks := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW
+	detachOnly := flags&linux.MNT_DETACH == linux.MNT_DETACH
+
+	detach := func(root *fs.Dirent, d *fs.Dirent) error {
+		return t.MountNamespace().Unmount(t, d, detachOnly)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, followLinks, detach)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
new file mode 100644
index 000000000..3efc06a27
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -0,0 +1,78 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// pipe2 implements the actual system call with flags.
+func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
+	if flags&^(syscall.O_NONBLOCK|syscall.O_CLOEXEC) != 0 {
+		return 0, syscall.EINVAL
+	}
+	r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
+
+	r.SetFlags(linuxToFlags(flags).Settable())
+	defer r.DecRef()
+
+	w.SetFlags(linuxToFlags(flags).Settable())
+	defer w.DecRef()
+
+	rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{
+		CloseOnExec: flags&syscall.O_CLOEXEC != 0},
+		t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, err
+	}
+
+	wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{
+		CloseOnExec: flags&syscall.O_CLOEXEC != 0},
+		t.ThreadGroup().Limits())
+	if err != nil {
+		t.FDMap().Remove(rfd)
+		return 0, err
+	}
+
+	if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil {
+		t.FDMap().Remove(rfd)
+		t.FDMap().Remove(wfd)
+		return 0, syscall.EFAULT
+	}
+	return 0, nil
+}
+
+// Pipe implements linux syscall pipe(2).
+//
+// args[0] is the user address that receives the two new file descriptors.
+// The call is forwarded to pipe2 with no flags set.
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ret, err := pipe2(t, args[0].Pointer(), 0)
+	return ret, nil, err
+}
+
+// Pipe2 implements linux syscall pipe2(2).
+//
+// args[0] is the user address that receives the two new file descriptors and
+// args[1] carries the flags, which are validated by pipe2.
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ret, err := pipe2(t, args[0].Pointer(), uint(args[1].Uint()))
+	return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
new file mode 100644
index 000000000..d4dbfd285
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -0,0 +1,429 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/syscalls"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. It is passed to
+// Limits().GetCapped as the cap on the NumberOfFiles limit when doPoll and
+// doSelect validate nfds.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2). doSelect uses them both to build the event mask requested for
+// each descriptor and to decide which result bits stay set afterwards.
+const (
+	// selectReadEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLIN_SET.
+	selectReadEvents = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+
+	// selectWriteEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLOUT_SET.
+	selectWriteEvents = waiter.EventOut | waiter.EventErr
+
+	// selectExceptEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLEX_SET.
+	selectExceptEvents = waiter.EventPri
+)
+
+func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+		return timeout, 0, syserror.EINVAL
+	}
+
+	pfd := make([]syscalls.PollFD, nfds)
+	if nfds > 0 {
+		if _, err := t.CopyIn(pfdAddr, &pfd); err != nil {
+			return timeout, 0, err
+		}
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= waiter.EventHUp | waiter.EventErr
+	}
+	remainingTimeout, n, err := syscalls.Poll(t, pfd, timeout)
+	err = syserror.ConvertIntr(err, syserror.EINTR)
+
+	// The poll entries are copied out regardless of whether
+	// any are set or not. This aligns with the Linux behavior.
+	if nfds > 0 && err == nil {
+		if _, err := t.CopyOut(pfdAddr, pfd); err != nil {
+			return remainingTimeout, 0, err
+		}
+	}
+
+	return remainingTimeout, n, err
+}
+
+// doSelect implements the common core of select(2) and pselect(2).
+//
+// nfds is the number of descriptors covered by the three bitmaps at readFDs,
+// writeFDs and exceptFDs; a zero address means that set was not supplied.
+// The bitmaps are copied in, converted to a PollFD list, polled for up to
+// timeout via syscalls.Poll and then written back with only the bits whose
+// events occurred still set.
+//
+// It returns the total number of bits left set across the three sets. EINVAL
+// is returned for an nfds that is negative or above the NumberOfFiles limit
+// capped at fileCap, and EBADF if a requested descriptor is not in the FD map.
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+		return 0, syserror.EINVAL
+	}
+
+	// Capture all the provided input vectors.
+	//
+	// N.B. This only works on little-endian architectures.
+	//
+	// One bit per descriptor, rounded up to whole bytes.
+	byteCount := (nfds + 7) / 8
+	bitsInLastPartialByte := uint(nfds % 8)
+	r := make([]byte, byteCount)
+	w := make([]byte, byteCount)
+	e := make([]byte, byteCount)
+
+	if readFDs != 0 {
+		if _, err := t.CopyIn(readFDs, &r); err != nil {
+			return 0, err
+		}
+		// Mask out bits above nfds.
+		if bitsInLastPartialByte != 0 {
+			r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyIn(writeFDs, &w); err != nil {
+			return 0, err
+		}
+		// Mask out bits above nfds.
+		if bitsInLastPartialByte != 0 {
+			w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyIn(exceptFDs, &e); err != nil {
+			return 0, err
+		}
+		// Mask out bits above nfds.
+		if bitsInLastPartialByte != 0 {
+			e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
+		}
+	}
+
+	// Count how many FDs are actually being requested so that we can build
+	// a PollFD array.
+	fdCount := 0
+	for i := 0; i < byteCount; i++ {
+		v := r[i] | w[i] | e[i]
+		for v != 0 {
+			// Clear the lowest set bit; one iteration per requested FD.
+			v &= (v - 1)
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]syscalls.PollFD, 0, fdCount)
+	fd := kdefs.FD(0)
+	for i := 0; i < byteCount; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		v := rV | wV | eV
+		// m selects the bit for the current fd within byte i.
+		m := byte(1)
+		for j := 0; j < 8; j++ {
+			if (v & m) != 0 {
+				// Make sure the fd is valid and decrement the reference
+				// immediately to ensure we don't leak. Note, another thread
+				// might be about to close fd. This is racy, but that's
+				// OK. Linux is racy in the same way.
+				file := t.FDMap().GetFile(fd)
+				if file == nil {
+					return 0, syserror.EBADF
+				}
+				file.DecRef()
+
+				// Translate set membership into the events to poll for.
+				mask := waiter.EventMask(0)
+				if (rV & m) != 0 {
+					mask |= selectReadEvents
+				}
+
+				if (wV & m) != 0 {
+					mask |= selectWriteEvents
+				}
+
+				if (eV & m) != 0 {
+					mask |= selectExceptEvents
+				}
+
+				pfd = append(pfd, syscalls.PollFD{
+					FD:     fd,
+					Events: mask,
+				})
+			}
+
+			fd++
+			m <<= 1
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	_, _, err := syscalls.Poll(t, pfd, timeout)
+	if err != nil {
+		return 0, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	for idx := range pfd {
+		events := pfd[idx].REvents
+		// The descriptor number maps back to bit j of byte i.
+		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+		m := byte(1) << j
+		if r[i]&m != 0 {
+			if (events & selectReadEvents) != 0 {
+				bitSetCount++
+			} else {
+				r[i] &^= m
+			}
+		}
+		if w[i]&m != 0 {
+			if (events & selectWriteEvents) != 0 {
+				bitSetCount++
+			} else {
+				w[i] &^= m
+			}
+		}
+		if e[i]&m != 0 {
+			if (events & selectExceptEvents) != 0 {
+				bitSetCount++
+			} else {
+				e[i] &^= m
+			}
+		}
+	}
+
+	// Copy updated vectors back.
+	if readFDs != 0 {
+		if _, err := t.CopyOut(readFDs, r); err != nil {
+			return 0, err
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyOut(writeFDs, w); err != nil {
+			return 0, err
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining reports how much of timeout is left, measured from
+// startNs against the kernel's monotonic clock. The result is never negative:
+// once the timeout has elapsed it is 0.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	elapsed := t.Kernel().MonotonicClock().Now().Sub(startNs)
+	if left := timeout - elapsed; left > 0 {
+		return left
+	}
+	return 0
+}
+
+// copyOutTimespecRemaining writes the part of timeout that is still left,
+// as a timespec, to timespecAddr. Nothing is written and nil is returned when
+// timeout is zero or negative.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	ts := linux.NsecToTimespec(timeoutRemaining(t, startNs, timeout).Nanoseconds())
+	return copyTimespecOut(t, timespecAddr, &ts)
+}
+
+// copyOutTimevalRemaining writes the part of timeout that is still left, as
+// a timeval, to timevalAddr. Nothing is written and nil is returned when
+// timeout is zero or negative.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	tv := linux.NsecToTimeval(timeoutRemaining(t, startNs, timeout).Nanoseconds())
+	return copyTimevalOut(t, timevalAddr, &tv)
+}
+
+// pollRestartBlock holds the arguments needed to re-issue an interrupted
+// poll(2) through restart_syscall(2).
+type pollRestartBlock struct {
+	// pfdAddr is the user address of the pollfd array.
+	pfdAddr usermem.Addr
+	// nfds is the number of entries at pfdAddr.
+	nfds uint
+	// timeout is the timeout to use for the restarted call.
+	timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart by calling poll again
+// with the saved arguments.
+func (b *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return poll(t, b.pfdAddr, b.nfds, b.timeout)
+}
+
+// poll runs doPoll and arranges for a restart if it is interrupted.
+//
+// When doPoll returns EINTR, a pollRestartBlock carrying the remaining
+// timeout is installed on t and ERESTART_RESTARTBLOCK is returned. Otherwise
+// the count and error from doPoll are returned unchanged.
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remaining, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	if err != syserror.EINTR {
+		return n, err
+	}
+
+	// On an interrupt poll(2) is restarted with the remaining timeout.
+	t.SetSyscallRestartBlock(&pollRestartBlock{
+		pfdAddr: pfdAddr,
+		nfds:    nfds,
+		timeout: remaining,
+	})
+	return 0, kernel.ERESTART_RESTARTBLOCK
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timeout := time.Duration(args[2].Int()) * time.Millisecond
+	n, err := poll(t, pfdAddr, nfds, timeout)
+	return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+//
+// args[0] and args[1] describe the pollfd array, args[2] is the user address
+// of the timespec timeout, and args[3]/args[4] are the address and size of an
+// optional signal mask. If a mask is given it replaces the task's signal
+// mask, with the previous mask recorded as the saved signal mask.
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		pfdAddr      = args[0].Pointer()
+		nfds         = uint(args[1].Uint()) // poll(2) uses unsigned long.
+		timespecAddr = args[2].Pointer()
+		maskAddr     = args[3].Pointer()
+		maskSize     = uint(args[4].Uint())
+	)
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// A start time is only sampled when there is a positive timeout whose
+	// remainder will be reported back.
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskAddr != 0 {
+		newMask, err := copyInSigSet(t, maskAddr, maskSize)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		previous := t.SignalMask()
+		t.SetSignalMask(newMask)
+		t.SetSavedSignalMask(previous)
+	}
+
+	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
+	// if interrupted by something other than a signal handled by the
+	// application (i.e. returns ERESTARTNOHAND). However, if
+	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
+	// wrong timeout, so the error should be left as EINTR.
+	//
+	// Note that this means that if err is nil but copyErr is not, copyErr is
+	// ignored. This is consistent with Linux.
+	if copyErr == nil && err == syserror.EINTR {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timevalAddr := args[4].Pointer()
+
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timevalAddr != 0 {
+		timeval, err := copyTimevalIn(t, timevalAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if timeval.Sec < 0 || timeval.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(timeval.ToNsecCapped())
+	}
+	startNs := t.Kernel().MonotonicClock().Now()
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+//
+// args[0] is nfds, args[1..3] are the user addresses of the read, write and
+// except FD sets, args[4] is the user address of the timespec timeout and
+// args[5] is the user address of a (mask address, size) pair. If that pair
+// names a mask, it replaces the task's signal mask, with the previous mask
+// recorded as the saved signal mask.
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		nfds             = int(args[0].Int()) // select(2) uses an int.
+		readFDs          = args[1].Pointer()
+		writeFDs         = args[2].Pointer()
+		exceptFDs        = args[3].Pointer()
+		timespecAddr     = args[4].Pointer()
+		maskWithSizeAddr = args[5].Pointer()
+	)
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// A start time is only sampled when there is a positive timeout whose
+	// remainder will be reported back.
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		maskAddr, maskSize, err := copyInSigSetWithSize(t, maskWithSizeAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		if maskAddr != 0 {
+			newMask, err := copyInSigSet(t, maskAddr, maskSize)
+			if err != nil {
+				return 0, nil, err
+			}
+			previous := t.SignalMask()
+			t.SetSignalMask(newMask)
+			t.SetSavedSignalMask(previous)
+		}
+	}
+
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// As in Ppoll: an interrupted call is only made restartable
+	// (ERESTARTNOHAND) if the remaining timeout was written back
+	// successfully; otherwise the error stays EINTR.
+	if copyErr == nil && err == syserror.EINTR {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
new file mode 100644
index 000000000..2ca7471cf
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -0,0 +1,188 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/bpf"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. Prctl
+// copies one in from user memory when handling PR_SET_SECCOMP and then reads
+// Len instructions from the address in Filter.
+type userSockFprog struct {
+	// Len is the length of the filter in BPF instructions.
+	Len uint16
+
+	_ [6]byte // padding for alignment
+
+	// Filter is a user pointer to the struct sock_filter array that makes up
+	// the filter program. Filter is a uint64 rather than a usermem.Addr
+	// because usermem.Addr is actually uintptr, which is not a fixed-size
+	// type, and encoding/binary.Read objects to this.
+	Filter uint64
+}
+
+// Prctl implements linux syscall prctl(2).
+//
+// args[0] selects the operation; the remaining arguments are interpreted per
+// operation. The operations handled are PR_SET_PDEATHSIG, PR_GET_PDEATHSIG,
+// PR_GET_KEEPCAPS, PR_SET_KEEPCAPS, PR_SET_NAME, PR_GET_NAME, PR_SET_MM
+// (PR_SET_MM_EXE_FILE only), PR_SET_NO_NEW_PRIVS, PR_GET_NO_NEW_PRIVS,
+// PR_SET_SECCOMP (SECCOMP_MODE_FILTER only), PR_GET_SECCOMP, PR_CAPBSET_READ
+// and PR_CAPBSET_DROP. Any other option is logged as a warning and fails with
+// EINVAL.
+//
+// The returned uintptr is the syscall's result value: the queried value for
+// the "get"-style operations that return through the result, 0 otherwise.
+func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	option := args[0].Int()
+
+	switch option {
+	case linux.PR_SET_PDEATHSIG:
+		// Signal 0 is accepted in addition to any valid signal.
+		sig := linux.Signal(args[1].Int())
+		if sig != 0 && !sig.IsValid() {
+			return 0, nil, syscall.EINVAL
+		}
+		t.SetParentDeathSignal(sig)
+		return 0, nil, nil
+
+	case linux.PR_GET_PDEATHSIG:
+		_, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
+		return 0, nil, err
+
+	case linux.PR_GET_KEEPCAPS:
+		if t.Credentials().KeepCaps {
+			return 1, nil, nil
+		}
+		return 0, nil, nil
+
+	case linux.PR_SET_KEEPCAPS:
+		// prctl(2): arg2 must be either 0 (permitted capabilities are cleared)
+		// or 1 (permitted capabilities are kept).
+		switch args[1].Int() {
+		case 0:
+			t.SetKeepCaps(false)
+		case 1:
+			t.SetKeepCaps(true)
+		default:
+			return 0, nil, syscall.EINVAL
+		}
+		return 0, nil, nil
+
+	case linux.PR_SET_NAME:
+		addr := args[1].Pointer()
+		// ENAMETOOLONG is tolerated: whatever CopyInString returned as the
+		// name is used in that case.
+		name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1)
+		if err != nil && err != syscall.ENAMETOOLONG {
+			return 0, nil, err
+		}
+		t.SetName(name)
+
+	case linux.PR_GET_NAME:
+		addr := args[1].Pointer()
+		buf := make([]byte, linux.TASK_COMM_LEN)
+		n := copy(buf, t.Name())
+		// Append a NUL terminator when the name leaves room for one.
+		if n < linux.TASK_COMM_LEN {
+			buf[n] = 0
+			n++
+		}
+		if _, err := t.CopyOut(addr, buf[:n]); err != nil {
+			return 0, nil, err
+		}
+
+	case linux.PR_SET_MM:
+		switch args[1].Int() {
+		case linux.PR_SET_MM_EXE_FILE:
+			fd := kdefs.FD(args[2].Int())
+
+			file := t.FDMap().GetFile(fd)
+			if file == nil {
+				return 0, nil, syscall.EBADF
+			}
+			defer file.DecRef()
+
+			// Is the caller trying to set the executable to a non-file?
+			if !fs.IsFile(file.Dirent.Inode.StableAttr) {
+				return 0, nil, syscall.EBADF
+			}
+
+			// Set the underlying executable.
+			t.MemoryManager().SetExecutable(file.Dirent)
+		default:
+			return 0, nil, syscall.EINVAL
+		}
+
+	case linux.PR_SET_NO_NEW_PRIVS:
+		if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
+			return 0, nil, syscall.EINVAL
+		}
+		// no_new_privs is assumed to always be set. See
+		// auth.Credentials.UpdateForExec.
+		return 0, nil, nil
+
+	case linux.PR_GET_NO_NEW_PRIVS:
+		if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
+			return 0, nil, syscall.EINVAL
+		}
+		// Always reported as set, matching PR_SET_NO_NEW_PRIVS above.
+		return 1, nil, nil
+
+	case linux.PR_SET_SECCOMP:
+		if args[1].Int() != linux.SECCOMP_MODE_FILTER {
+			// Unsupported mode.
+			return 0, nil, syscall.EINVAL
+		}
+		var fprog userSockFprog
+		if _, err := t.CopyIn(args[2].Pointer(), &fprog); err != nil {
+			return 0, nil, err
+		}
+		// fprog.Len instructions are read from the user address in Filter.
+		filter := make([]linux.BPFInstruction, int(fprog.Len))
+		if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
+			return 0, nil, err
+		}
+		compiledFilter, err := bpf.Compile(filter)
+		if err != nil {
+			t.Debugf("Invalid seccomp-bpf filter: %v", err)
+			return 0, nil, syscall.EINVAL
+		}
+		return 0, nil, t.AppendSyscallFilter(compiledFilter)
+
+	case linux.PR_GET_SECCOMP:
+		return uintptr(t.SeccompMode()), nil, nil
+
+	case linux.PR_CAPBSET_READ:
+		cp := linux.Capability(args[1].Uint64())
+		if !cp.Ok() {
+			return 0, nil, syscall.EINVAL
+		}
+		// The result is 1 if cp is in the bounding set, 0 otherwise.
+		var rv uintptr
+		if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 {
+			rv = 1
+		}
+		return rv, nil, nil
+
+	case linux.PR_CAPBSET_DROP:
+		cp := linux.Capability(args[1].Uint64())
+		if !cp.Ok() {
+			return 0, nil, syscall.EINVAL
+		}
+		return 0, nil, t.DropBoundingCapability(cp)
+
+	default:
+		t.Warningf("Unsupported prctl %d", option)
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Reached by the cases above that succeed without returning directly.
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go
new file mode 100644
index 000000000..2dd59b1c3
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_random.go
@@ -0,0 +1,92 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"crypto/rand"
+	"io"
+	"math"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Flag bits accepted by GetRandom in its flags argument. GetRandom only
+// validates them; neither changes its behavior.
+const (
+	_GRND_NONBLOCK = 0x1
+	_GRND_RANDOM   = 0x2
+)
+
+// GetRandom implements the linux syscall getrandom(2).
+//
+// In a multi-tenant/shared environment, the only valid implementation is to
+// fetch data from the urandom pool, otherwise starvation attacks become
+// possible. The urandom pool is also expected to have plenty of entropy, thus
+// the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as
+// the pool will already be initialized.
+func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	flags := args[2].Int()
+
+	// Flags are checked for validity but otherwise ignored. See above.
+	if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if length > math.MaxInt32 {
+		length = math.MaxInt32
+	}
+	ar, ok := addr.ToRange(uint64(length))
+	if !ok {
+		return 0, nil, syserror.EFAULT
+	}
+
+	// "If the urandom source has been initialized, reads of up to 256 bytes
+	// will always return as many bytes as requested and will not be
+	// interrupted by signals. No such guarantees apply for larger buffer
+	// sizes." - getrandom(2)
+	min := int(length)
+	if min > 256 {
+		min = 256
+	}
+	n, err := t.MemoryManager().CopyOutFrom(t, usermem.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if n >= int64(min) {
+		return uintptr(n), nil, nil
+	}
+	return 0, nil, err
+}
+
+// randReader is an io.Reader over crypto/rand's rand.Reader that insists on
+// producing a minimum number of bytes before it allows short reads.
+type randReader struct {
+	// done is the number of bytes delivered so far while the minimum was
+	// still outstanding. It is only maintained until it reaches min.
+	done int
+
+	// min is the number of bytes that must be delivered before Read falls
+	// back to plain, possibly short, reads.
+	min int
+}
+
+// Read implements io.Reader.Read.
+//
+// While fewer than min bytes have been delivered, Read uses io.ReadAtLeast to
+// obtain the outstanding amount (capped at len(dst)) and records the progress
+// in done. Once the minimum has been met it is a plain rand.Reader.Read.
+func (r *randReader) Read(dst []byte) (int, error) {
+	if r.done >= r.min {
+		return rand.Reader.Read(dst)
+	}
+
+	need := r.min - r.done
+	if need > len(dst) {
+		need = len(dst)
+	}
+	n, err := io.ReadAtLeast(rand.Reader, dst, need)
+	// Account for what was delivered so that later calls only insist on the
+	// part of the minimum that is still missing.
+	r.done += n
+	return n, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
new file mode 100644
index 000000000..0be2d195a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -0,0 +1,274 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+const (
+	// EventMaskRead contains events that can be triggered on reads. readv
+	// and preadv register for these events while waiting to retry a read
+	// that would block.
+	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements linux syscall read(2).  Note that we try to get a buffer that
+// is exactly the size requested because some applications like qemu expect
+// they can do large reads all at once.  Bug for bug.  Same for other read
+// calls below.
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := readv(t, file, dst)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Pread64 implements linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is reading at an offset supported?
+	if !file.Flags().Pread {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := preadv(t, file, dst, offset)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Readv implements linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Read the iovecs that specify the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := readv(t, file, dst)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+// Preadv implements linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is reading at an offset supported?
+	if !file.Flags().Pread {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Read the iovecs that specify the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := preadv(t, file, dst, offset)
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// readv reads from f into dst, blocking if necessary.
+//
+// A first read is attempted directly. If it does not report
+// syserror.ErrWouldBlock, or the file is non-blocking, its result is returned
+// as is. Otherwise the task registers for EventMaskRead on f and retries,
+// blocking between attempts, until a read completes with something other
+// than ErrWouldBlock or the wait itself fails.
+//
+// It returns the total number of bytes read across all attempts and the last
+// error. An IN_ACCESS inotify event is queued whenever any bytes were read.
+func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) {
+	// notifyAccess queues an inotify event if anything was read.
+	notifyAccess := func(bytes int64) {
+		if bytes > 0 {
+			f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+		}
+	}
+
+	n, err := f.Readv(t, dst)
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		notifyAccess(n)
+		return n, err
+	}
+
+	// Register for notifications.
+	entry, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&entry, EventMaskRead)
+
+	total := n
+	for retry := true; retry; {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst64(n)
+
+		n, err = f.Readv(t, dst)
+		total += n
+
+		// Go around again only if the read would block and the wait for a
+		// notification was not cut short.
+		if retry = err == syserror.ErrWouldBlock; retry {
+			err = t.Block(ch)
+			retry = err == nil
+		}
+	}
+
+	f.EventUnregister(&entry)
+
+	notifyAccess(total)
+	return total, err
+}
+
+// preadv reads from f at offset into dst, blocking if necessary.
+//
+// A first read is attempted directly. If it does not report
+// syserror.ErrWouldBlock, or the file is non-blocking, its result is returned
+// as is. Otherwise the task registers for EventMaskRead on f and retries at
+// offset plus the bytes read so far, blocking between attempts, until a read
+// completes with something other than ErrWouldBlock or the wait itself fails.
+//
+// It returns the total number of bytes read across all attempts and the last
+// error. An IN_ACCESS inotify event is queued whenever any bytes were read.
+func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	// notifyAccess queues an inotify event if anything was read.
+	notifyAccess := func(bytes int64) {
+		if bytes > 0 {
+			f.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
+		}
+	}
+
+	n, err := f.Preadv(t, dst, offset)
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		notifyAccess(n)
+		return n, err
+	}
+
+	// Register for notifications.
+	entry, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&entry, EventMaskRead)
+
+	total := n
+	for retry := true; retry; {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst64(n)
+
+		// Continue from where the previous attempts stopped.
+		n, err = f.Preadv(t, dst, offset+total)
+		total += n
+
+		// Go around again only if the read would block and the wait for a
+		// notification was not cut short.
+		if retry = err == syserror.ErrWouldBlock; retry {
+			err = t.Block(ch)
+			retry = err == nil
+		}
+	}
+
+	f.EventUnregister(&entry)
+
+	notifyAccess(total)
+	return total, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
new file mode 100644
index 000000000..481e79eaa
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -0,0 +1,217 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// rlimit describes an implementation of 'struct rlimit', which may vary from
+// system-to-system. Values are obtained from newRlimit and translate between
+// the layout exchanged with user memory and limits.Limit.
+type rlimit interface {
+	// toLimit converts an rlimit to a newly allocated limits.Limit.
+	toLimit() *limits.Limit
+
+	// fromLimit converts a limits.Limit to an rlimit, overwriting the
+	// receiver's current contents.
+	fromLimit(lim limits.Limit)
+
+	// copyIn copies an rlimit from the untrusted app, at address addr, to
+	// the kernel.
+	copyIn(t *kernel.Task, addr usermem.Addr) error
+
+	// copyOut copies an rlimit from the kernel to the untrusted app, at
+	// address addr.
+	copyOut(t *kernel.Task, addr usermem.Addr) error
+}
+
+// newRlimit returns an empty rlimit of the layout that matches 'struct rlimit'
+// for the architecture t runs on. Only a width of 8 is supported; every other
+// width yields ENOSYS.
+func newRlimit(t *kernel.Task) (rlimit, error) {
+	if t.Arch().Width() == 8 {
+		// On 64-bit system, struct rlimit and struct rlimit64 are identical.
+		return &rlimit64{}, nil
+	}
+	return nil, syserror.ENOSYS
+}
+
+// rlimit64 is the rlimit implementation handed out by newRlimit for tasks with
+// an architecture width of 8. Its two values are what copyIn and copyOut
+// exchange with user memory; conversion to and from limits.Limit goes through
+// limits.FromLinux and limits.ToLinux.
+type rlimit64 struct {
+	Cur uint64
+	Max uint64
+}
+
+// toLimit returns a newly allocated limits.Limit whose Cur and Max are the
+// receiver's values passed through limits.FromLinux.
+func (r *rlimit64) toLimit() *limits.Limit {
+	lim := new(limits.Limit)
+	lim.Cur = limits.FromLinux(r.Cur)
+	lim.Max = limits.FromLinux(r.Max)
+	return lim
+}
+
+// fromLimit overwrites the receiver with lim, passing both values through
+// limits.ToLinux.
+func (r *rlimit64) fromLimit(lim limits.Limit) {
+	r.Cur = limits.ToLinux(lim.Cur)
+	r.Max = limits.ToLinux(lim.Max)
+}
+
+// copyIn fills the receiver from the task's memory at addr and reports any
+// error from the copy.
+func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error {
+	if _, err := t.CopyIn(addr, r); err != nil {
+		return err
+	}
+	return nil
+}
+
+// copyOut writes the receiver's value to the task's memory at addr and reports
+// any error from the copy.
+func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error {
+	if _, err := t.CopyOut(addr, *r); err != nil {
+		return err
+	}
+	return nil
+}
+
+// makeRlimit64 returns a newly allocated rlimit64 holding lim in its Linux
+// representation.
+//
+// The conversion is delegated to fromLimit so that both values pass through
+// limits.ToLinux, exactly as they do on the Getrlimit path, rather than being
+// copied verbatim.
+func makeRlimit64(lim limits.Limit) *rlimit64 {
+	r := &rlimit64{}
+	r.fromLimit(lim)
+	return r
+}
+
+// setableLimits is the set of supported setable limits. prlimit64 refuses to
+// change any resource that is not a key of this map and returns EPERM instead.
+// Only membership matters, so the values are empty structs.
+var setableLimits = map[limits.LimitType]struct{}{
+	limits.NumberOfFiles: {},
+	limits.AS:            {},
+	limits.CPU:           {},
+	limits.Data:          {},
+	limits.FileSize:      {},
+	limits.Stack:         {},
+	// These are not enforced, but we include them here to avoid returning
+	// EPERM, since some apps expect them to succeed.
+	limits.Core:         {},
+	limits.ProcessCount: {},
+}
+
+// prlimit64 reads or replaces the limit for resource on t's thread group.
+//
+// With a nil newLim it only returns the current limit. Otherwise the resource
+// must be listed in setableLimits (EPERM if not); the new limit is installed,
+// the thread group's CPU timer is updated when the resource is limits.CPU, and
+// the limit that was in force before the change is returned.
+func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) {
+	// Pure query.
+	if newLim == nil {
+		return t.ThreadGroup().Limits().Get(resource), nil
+	}
+
+	_, settable := setableLimits[resource]
+	if !settable {
+		return limits.Limit{}, syserror.EPERM
+	}
+
+	previous, err := t.ThreadGroup().Limits().Set(resource, *newLim)
+	if err != nil {
+		return limits.Limit{}, err
+	}
+
+	// A changed CPU limit has to be reflected in the thread group's timer.
+	if resource == limits.CPU {
+		t.ThreadGroup().SetCPUTimer(newLim)
+	}
+	return previous, nil
+}
+
+// Getrlimit implements linux syscall getrlimit(2).
+//
+// args[0] is the Linux resource number and args[1] the user address that
+// receives the limit. An unknown resource yields EINVAL.
+func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	resource, known := limits.FromLinuxResource[int(args[0].Int())]
+	if !known {
+		// Unknown limit.
+		return 0, nil, syserror.EINVAL
+	}
+	dst := args[1].Pointer()
+
+	out, err := newRlimit(t)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	current, err := prlimit64(t, resource, nil)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	out.fromLimit(current)
+	if err := out.copyOut(t, dst); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// Setrlimit implements linux syscall setrlimit(2).
+//
+// args[0] is the Linux resource number and args[1] the user address of the new
+// limit. An unknown resource yields EINVAL and a failed copy-in yields EFAULT.
+func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	resource, known := limits.FromLinuxResource[int(args[0].Int())]
+	if !known {
+		// Unknown limit.
+		return 0, nil, syserror.EINVAL
+	}
+	src := args[1].Pointer()
+
+	in, err := newRlimit(t)
+	if err != nil {
+		return 0, nil, err
+	}
+	if copyErr := in.copyIn(t, src); copyErr != nil {
+		return 0, nil, syserror.EFAULT
+	}
+
+	if _, err := prlimit64(t, resource, in.toLimit()); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// Prlimit64 implements linux syscall prlimit64(2).
+//
+// args[0] is the target thread ID (0 selects the calling task), args[1] the
+// Linux resource number, args[2] an optional user pointer to the new limit and
+// args[3] an optional user pointer that receives the previous limit.
+//
+// It fails with EINVAL for an unknown resource or a negative tid, EFAULT when
+// either user structure cannot be copied, ESRCH when tid names no task in the
+// caller's PID namespace, and EPERM when the caller may not act on the target
+// or the resource is not in setableLimits.
+func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := kernel.ThreadID(args[0].Int())
+	resource, ok := limits.FromLinuxResource[int(args[1].Int())]
+	if !ok {
+		// Return err; unknown limit.
+		return 0, nil, syserror.EINVAL
+	}
+	newRlimAddr := args[2].Pointer()
+	oldRlimAddr := args[3].Pointer()
+
+	// A NULL new-limit pointer turns the call into a pure query.
+	var newLim *limits.Limit
+	if newRlimAddr != 0 {
+		var nrl rlimit64
+		if err := nrl.copyIn(t, newRlimAddr); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+		newLim = nrl.toLimit()
+	}
+
+	if tid < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	ot := t
+	if tid > 0 {
+		if ot = t.PIDNamespace().TaskWithID(tid); ot == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	// "To set or get the resources of a process other than itself, the caller
+	// must have the CAP_SYS_RESOURCE capability, or the real, effective, and
+	// saved set user IDs of the target process must match the real user ID of
+	// the caller and the real, effective, and saved set group IDs of the
+	// target process must match the real group ID of the caller."
+	//
+	// The check applies only to *other* tasks: a task may always operate on
+	// its own limits, even when its real and effective IDs differ. The
+	// capability is evaluated in the target's user namespace, as mayKill
+	// does for CAP_KILL.
+	if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, ot.UserNamespace()) {
+		cred, tcred := t.Credentials(), ot.Credentials()
+		if cred.RealKUID != tcred.RealKUID ||
+			cred.RealKUID != tcred.EffectiveKUID ||
+			cred.RealKUID != tcred.SavedKUID ||
+			cred.RealKGID != tcred.RealKGID ||
+			cred.RealKGID != tcred.EffectiveKGID ||
+			cred.RealKGID != tcred.SavedKGID {
+			return 0, nil, syserror.EPERM
+		}
+	}
+
+	oldLim, err := prlimit64(ot, resource, newLim)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Report the previous limit only if the caller asked for it.
+	if oldRlimAddr != 0 {
+		if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil {
+			return 0, nil, syserror.EFAULT
+		}
+	}
+
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go
new file mode 100644
index 000000000..82e42b589
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rusage.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// getrusage builds the linux.Rusage reported for the scope selected by which.
+//
+// CPU statistics come from the calling task (RUSAGE_THREAD), its thread group
+// (RUSAGE_SELF), the thread group's joined children (RUSAGE_CHILDREN) or the
+// sum of the latter two (RUSAGE_BOTH); for any other value they stay zero.
+// Only UTime, STime, NVCSw and MaxRSS are filled in.
+func getrusage(t *kernel.Task, which int32) linux.Rusage {
+	var stats usage.CPUStats
+
+	switch which {
+	case linux.RUSAGE_THREAD:
+		stats = t.CPUStats()
+
+	case linux.RUSAGE_SELF:
+		stats = t.ThreadGroup().CPUStats()
+
+	case linux.RUSAGE_CHILDREN:
+		stats = t.ThreadGroup().JoinedChildCPUStats()
+
+	case linux.RUSAGE_BOTH:
+		group := t.ThreadGroup()
+		stats = group.CPUStats()
+		stats.Accumulate(group.JoinedChildCPUStats())
+	}
+
+	var ru linux.Rusage
+	ru.UTime = linux.NsecToTimeval(stats.UserTime.Nanoseconds())
+	ru.STime = linux.NsecToTimeval(stats.SysTime.Nanoseconds())
+	ru.NVCSw = int64(stats.VoluntarySwitches)
+	ru.MaxRSS = int64(t.MaxRSS(which) / 1024)
+	return ru
+}
+
+// Getrusage implements linux syscall getrusage(2).
+//
+// args[0] selects the scope and must be RUSAGE_SELF, RUSAGE_CHILDREN or
+// RUSAGE_THREAD (anything else is EINVAL); args[1] is the user address that
+// receives the resulting struct rusage.
+//
+// The fields of struct rusage are annotated below according to what the
+// getrusage helper in this file actually fills in:
+//	marked "y" are supported now
+//	marked "*" are not used on Linux
+//	marked "p" are pending for support
+//
+//	y    struct timeval ru_utime; /* user CPU time used */
+//	y    struct timeval ru_stime; /* system CPU time used */
+//	y    long   ru_maxrss;        /* maximum resident set size */
+//	*    long   ru_ixrss;         /* integral shared memory size */
+//	*    long   ru_idrss;         /* integral unshared data size */
+//	*    long   ru_isrss;         /* integral unshared stack size */
+//	p    long   ru_minflt;        /* page reclaims (soft page faults) */
+//	p    long   ru_majflt;        /* page faults (hard page faults) */
+//	*    long   ru_nswap;         /* swaps */
+//	p    long   ru_inblock;       /* block input operations */
+//	p    long   ru_oublock;       /* block output operations */
+//	*    long   ru_msgsnd;        /* IPC messages sent */
+//	*    long   ru_msgrcv;        /* IPC messages received */
+//	*    long   ru_nsignals;      /* signals received */
+//	y    long   ru_nvcsw;         /* voluntary context switches */
+//	p    long   ru_nivcsw;        /* involuntary context switches */
+func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := args[0].Int()
+	addr := args[1].Pointer()
+
+	// RUSAGE_BOTH is understood by the helper but is not accepted from
+	// user space.
+	if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ru := getrusage(t, which)
+	_, err := t.CopyOut(addr, &ru)
+	return 0, nil, err
+}
+
+// Times implements linux syscall times(2).
+func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	// Calculate the ticks first, and figure out if any additional work is
+	// necessary. Linux allows for a NULL addr, in which case only the
+	// return value is meaningful. We don't need to do anything else.
+	ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds())
+	if addr == 0 {
+		return ticks, nil, nil
+	}
+
+	cs1 := t.ThreadGroup().CPUStats()
+	cs2 := t.ThreadGroup().JoinedChildCPUStats()
+	r := linux.Tms{
+		UTime:  linux.ClockTFromDuration(cs1.UserTime),
+		STime:  linux.ClockTFromDuration(cs1.SysTime),
+		CUTime: linux.ClockTFromDuration(cs2.UserTime),
+		CSTime: linux.ClockTFromDuration(cs2.SysTime),
+	}
+	if _, err := t.CopyOut(addr, &r); err != nil {
+		return 0, nil, err
+	}
+
+	return ticks, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
new file mode 100644
index 000000000..ff9e46077
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -0,0 +1,100 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// The sched_* handlers in this file expose exactly one scheduling policy and
+// one priority value: SchedGetscheduler and SchedGetparam report them, the
+// priority min/max handlers return onlyPriority, and SchedSetscheduler rejects
+// every other combination with EINVAL.
+const (
+	onlyScheduler = linux.SCHED_NORMAL
+	onlyPriority  = 0
+)
+
+// SchedParam replicates struct sched_param in sched.h.
+//
+// The C structure holds a single 'int sched_priority', so the field is 32 bits
+// wide. A wider field would make copy-out write past the end of the caller's
+// structure and make copy-in compare unrelated adjacent bytes.
+type SchedParam struct {
+	schedPriority int32
+}
+
+// SchedGetparam implements linux syscall sched_getparam(2).
+//
+// args[0] is the pid (0 means the caller) and args[1] the user address that
+// receives a SchedParam. A NULL address or negative pid is EINVAL, a pid with
+// no task in the caller's PID namespace is ESRCH. The reported priority is
+// always onlyPriority.
+func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := args[0].Int()
+	out := args[1].Pointer()
+
+	if out == 0 || pid < 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	if pid != 0 {
+		if t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+			return 0, nil, syscall.ESRCH
+		}
+	}
+
+	_, err := t.CopyOut(out, SchedParam{schedPriority: onlyPriority})
+	if err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// SchedGetscheduler implements linux syscall sched_getscheduler(2).
+//
+// args[0] is the pid (0 means the caller). A negative pid is EINVAL and a pid
+// with no task in the caller's PID namespace is ESRCH; otherwise the result is
+// always onlyScheduler.
+func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := args[0].Int()
+	switch {
+	case pid < 0:
+		return 0, nil, syscall.EINVAL
+	case pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil:
+		return 0, nil, syscall.ESRCH
+	}
+	return onlyScheduler, nil, nil
+}
+
+// SchedSetscheduler implements linux syscall sched_setscheduler(2).
+//
+// args[0] is the pid (0 means the caller), args[1] the requested policy and
+// args[2] the user address of a SchedParam. Because only onlyScheduler with
+// onlyPriority is supported, a successful call changes nothing.
+//
+// Errors: EINVAL for a negative pid, a NULL param pointer, a policy other than
+// onlyScheduler or a priority other than onlyPriority; ESRCH when pid names no
+// task in the caller's PID namespace; EFAULT when param is non-NULL but cannot
+// be read.
+func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := args[0].Int()
+	policy := args[1].Int()
+	param := args[2].Pointer()
+	if pid < 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	if policy != onlyScheduler {
+		return 0, nil, syscall.EINVAL
+	}
+	if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
+		return 0, nil, syscall.ESRCH
+	}
+	// Linux distinguishes a missing parameter block (EINVAL) from one that
+	// is present but unreadable (EFAULT).
+	if param == 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	var r SchedParam
+	if _, err := t.CopyIn(param, &r); err != nil {
+		return 0, nil, syscall.EFAULT
+	}
+	if r.schedPriority != onlyPriority {
+		return 0, nil, syscall.EINVAL
+	}
+	return 0, nil, nil
+}
+
+// SchedGetPriorityMax implements linux syscall sched_get_priority_max(2).
+//
+// The arguments are not inspected: the result is onlyPriority no matter which
+// policy was asked about.
+func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return onlyPriority, nil, nil
+}
+
+// SchedGetPriorityMin implements linux syscall sched_get_priority_min(2).
+//
+// The arguments are not inspected: the result is onlyPriority no matter which
+// policy was asked about.
+func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return onlyPriority, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
new file mode 100644
index 000000000..a8983705b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -0,0 +1,166 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"math"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// opsMax is the largest number of operations Semop accepts in a single call;
+// a larger count is rejected with E2BIG.
+const opsMax = 500 // SEMOPM
+
+// Semget handles: semget(key_t key, int nsems, int semflg)
+func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	key := args[0].Int()
+	nsems := args[1].Int()
+	flag := args[2].Int()
+
+	private := key == linux.IPC_PRIVATE
+	create := flag&linux.IPC_CREAT == linux.IPC_CREAT
+	exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL
+	mode := linux.FileMode(flag & 0777)
+
+	r := t.IPCNamespace().SemaphoreRegistry()
+	set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(set.ID), nil, nil
+}
+
+// Semop handles: semop(int semid, struct sembuf *sops, size_t nsops)
+//
+// An unknown semid or an operation count of zero is EINVAL, and more than
+// opsMax operations is E2BIG. The operations are copied in and then executed
+// against the set; whenever the set hands back a channel instead of finishing,
+// the task blocks on it and tries again. If the block fails, the pending wait
+// is aborted and the block's error is returned.
+func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Int()
+	sembufAddr := args[1].Pointer()
+	nsops := args[2].SizeT()
+
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	switch {
+	case set == nil:
+		return 0, nil, syserror.EINVAL
+	case nsops <= 0:
+		return 0, nil, syserror.EINVAL
+	case nsops > opsMax:
+		return 0, nil, syserror.E2BIG
+	}
+
+	ops := make([]linux.Sembuf, nsops)
+	if _, err := t.CopyIn(sembufAddr, ops); err != nil {
+		return 0, nil, err
+	}
+
+	creds := auth.CredentialsFromContext(t)
+	for {
+		wake, waiter, err := set.ExecuteOps(t, ops, creds)
+		if wake == nil || err != nil {
+			// Finished, successfully or not.
+			return 0, nil, err
+		}
+		if blockErr := t.Block(wake); blockErr != nil {
+			set.AbortWait(waiter, wake)
+			return 0, nil, blockErr
+		}
+	}
+}
+
+// Semctl handles: semctl(int semid, int semnum, int cmd, ...)
+//
+// Supported commands:
+//   - SETVAL: args[3] is the new value; it must lie in [0, math.MaxInt16],
+//     otherwise ERANGE is returned before the set is looked up.
+//   - GETVAL: returns the current value of semaphore semnum.
+//   - IPC_RMID: removes the set.
+//   - IPC_SET: args[3] points to a linux.SemidDS from which the owner UID/GID
+//     and the low nine mode bits are applied.
+//
+// Every other command yields EINVAL.
+func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Int()
+	num := args[1].Int()
+	cmd := args[2].Int()
+
+	switch cmd {
+	case linux.SETVAL:
+		val := args[3].Int()
+		// Reject negative values here as well: the int16 conversion
+		// below would otherwise truncate them, turning e.g. -65535
+		// into 1 and silently storing it.
+		if val < 0 || val > math.MaxInt16 {
+			return 0, nil, syserror.ERANGE
+		}
+		return 0, nil, setVal(t, id, num, int16(val))
+
+	case linux.GETVAL:
+		v, err := getVal(t, id, num)
+		return uintptr(v), nil, err
+
+	case linux.IPC_RMID:
+		return 0, nil, remove(t, id)
+
+	case linux.IPC_SET:
+		arg := args[3].Pointer()
+		s := linux.SemidDS{}
+		if _, err := t.CopyIn(arg, &s); err != nil {
+			return 0, nil, err
+		}
+
+		// Only the permission bits of the supplied mode are honoured.
+		perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777))
+		return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms)
+
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// remove deletes the semaphore set id from the registry of t's IPC namespace,
+// using t's credentials.
+func remove(t *kernel.Task, id int32) error {
+	registry := t.IPCNamespace().SemaphoreRegistry()
+	return registry.RemoveID(id, auth.CredentialsFromContext(t))
+}
+
+// ipcSet changes the owner and permissions of semaphore set id.
+//
+// uid and gid are mapped to kernel IDs through the user namespace of t's
+// credentials. EINVAL is returned when the set does not exist or when either
+// ID has no valid mapping.
+func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return syserror.EINVAL
+	}
+
+	creds := auth.CredentialsFromContext(t)
+
+	ownerUID := creds.UserNamespace.MapToKUID(uid)
+	if !ownerUID.Ok() {
+		return syserror.EINVAL
+	}
+	ownerGID := creds.UserNamespace.MapToKGID(gid)
+	if !ownerGID.Ok() {
+		return syserror.EINVAL
+	}
+
+	return set.Change(t, creds, fs.FileOwner{UID: ownerUID, GID: ownerGID}, perms)
+}
+
+// setVal stores val in semaphore num of set id, using t's credentials. A
+// missing set is reported as EINVAL.
+func setVal(t *kernel.Task, id int32, num int32, val int16) error {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return syserror.EINVAL
+	}
+	return set.SetVal(t, num, val, auth.CredentialsFromContext(t))
+}
+
+// getVal returns the value of semaphore num of set id, using t's credentials.
+// A missing set is reported as EINVAL.
+func getVal(t *kernel.Task, id int32, num int32) (int16, error) {
+	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
+	if set == nil {
+		return 0, syserror.EINVAL
+	}
+	return set.GetVal(num, auth.CredentialsFromContext(t))
+}
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
new file mode 100644
index 000000000..93b3f531a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -0,0 +1,553 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"math"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// mayKill reports whether t is allowed to send sig to target.
+//
+// "For a process to have permission to send a signal it must
+// - either be privileged (CAP_KILL), or
+// - the real or effective user ID of the sending process must be equal to the
+// real or saved set-user-ID of the target process.
+//
+// In the case of SIGCONT it suffices when the sending and receiving processes
+// belong to the same session." - kill(2)
+//
+// Equivalent to kernel/signal.c:check_kill_permission.
+func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool {
+	// kernel/signal.c:check_kill_permission additionally lets tasks of one
+	// thread group signal each other. kill(2) does not mention this because
+	// kill cannot address individual tasks.
+	sameGroup := t.ThreadGroup() == target.ThreadGroup()
+	if sameGroup || t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) {
+		return true
+	}
+
+	sender := t.Credentials()
+	receiver := target.Credentials()
+
+	effectiveMatches := sender.EffectiveKUID == receiver.SavedKUID ||
+		sender.EffectiveKUID == receiver.RealKUID
+	realMatches := sender.RealKUID == receiver.SavedKUID ||
+		sender.RealKUID == receiver.RealKUID
+	if effectiveMatches || realMatches {
+		return true
+	}
+
+	// SIGCONT only needs the two tasks to share a session.
+	return sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session()
+}
+
+// Kill implements linux syscall kill(2).
+//
+// args[0] is the pid selector and args[1] the signal. The selector picks one
+// of three modes:
+//   - pid > 0: signal the thread group of the task with that ID.
+//   - pid == -1: signal every thread group in the caller's PID namespace that
+//     the caller may signal, except its own and the one whose ID is
+//     kernel.InitTID.
+//   - otherwise (pid == 0 or pid < -1): signal every thread group in the
+//     caller's own process group, or in the process group -pid.
+//
+// Permission for each target is decided by mayKill.
+func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	sig := linux.Signal(args[1].Int())
+
+	switch {
+	case pid > 0:
+		// "If pid is positive, then signal sig is sent to the process with the
+		// ID specified by pid." - kill(2)
+		// This loops to handle races with execve where target dies between
+		// TaskWithID and SendGroupSignal. Compare Linux's
+		// kernel/signal.c:kill_pid_info().
+		for {
+			target := t.PIDNamespace().TaskWithID(pid)
+			if target == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			if !mayKill(t, target, sig) {
+				return 0, nil, syserror.EPERM
+			}
+			info := &arch.SignalInfo{
+				Signo: int32(sig),
+				Code:  arch.SignalInfoUser,
+			}
+			// The sender's PID and UID are reported as seen from the
+			// target's namespaces.
+			info.SetPid(int32(target.PIDNamespace().IDOfTask(t)))
+			info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow()))
+			// ESRCH from the send means the lookup must be retried;
+			// any other result (including success) is final.
+			if err := target.SendGroupSignal(info); err != syserror.ESRCH {
+				return 0, nil, err
+			}
+		}
+	case pid == -1:
+		// "If pid equals -1, then sig is sent to every process for which the
+		// calling process has permission to send signals, except for process 1
+		// (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig)
+		// send sig to all processes that the calling process may send signals
+		// to, except possibly for some implementation-defined system
+		// processes. Linux allows a process to signal itself, but on Linux the
+		// call kill(-1,sig) does not signal the calling process."
+		var (
+			lastErr   error
+			delivered int
+		)
+		for _, tg := range t.PIDNamespace().ThreadGroups() {
+			if tg == t.ThreadGroup() {
+				continue
+			}
+			if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID {
+				continue
+			}
+
+			// If pid == -1, the returned error is the last non-EPERM error
+			// from any call to group_send_sig_info.
+			if !mayKill(t, tg.Leader(), sig) {
+				continue
+			}
+			// Here and below, whether or not kill returns an error may
+			// depend on the iteration order. We at least implement the
+			// semantics documented by the man page: "On success (at least
+			// one signal was sent), zero is returned."
+			info := &arch.SignalInfo{
+				Signo: int32(sig),
+				Code:  arch.SignalInfoUser,
+			}
+			info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
+			info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
+			err := tg.SendSignal(info)
+			if err == syserror.ESRCH {
+				// ESRCH is ignored because it means the task
+				// exited while we were iterating.  This is a
+				// race which would not normally exist on
+				// Linux, so we suppress it.
+				continue
+			}
+			// Every attempt that did not hit the ESRCH race counts as
+			// delivered, whether or not it returned an error.
+			delivered++
+			if err != nil {
+				lastErr = err
+			}
+		}
+		if delivered > 0 {
+			return 0, nil, lastErr
+		}
+		return 0, nil, syserror.ESRCH
+	default:
+		// "If pid equals 0, then sig is sent to every process in the process
+		// group of the calling process."
+		//
+		// "If pid is less than -1, then sig is sent to every process
+		// in the process group whose ID is -pid."
+		pgid := kernel.ProcessGroupID(-pid)
+		if pgid == 0 {
+			pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
+		}
+
+		// If pid != -1 (i.e. signalling a process group), the returned error
+		// is the last error from any call to group_send_sig_info.
+		// ESRCH is the result when no thread group matched at all.
+		lastErr := syserror.ESRCH
+		for _, tg := range t.PIDNamespace().ThreadGroups() {
+			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
+				if !mayKill(t, tg.Leader(), sig) {
+					lastErr = syserror.EPERM
+					continue
+				}
+
+				info := &arch.SignalInfo{
+					Signo: int32(sig),
+					Code:  arch.SignalInfoUser,
+				}
+				info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
+				info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
+				// See note above regarding ESRCH race above.
+				if err := tg.SendSignal(info); err != syserror.ESRCH {
+					lastErr = err
+				}
+			}
+		}
+
+		return 0, nil, lastErr
+	}
+}
+
+// tkillSigInfo builds the SignalInfo delivered by Tkill and Tgkill.
+//
+// The code is arch.SignalInfoTkill. The sender is identified by the ID of its
+// thread group in the receiver's PID namespace and by its real UID mapped into
+// the receiver's user namespace.
+func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo {
+	senderPID := receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup())
+	senderUID := sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()
+
+	si := new(arch.SignalInfo)
+	si.Signo = int32(sig)
+	si.Code = arch.SignalInfoTkill
+	si.SetPid(int32(senderPID))
+	si.SetUid(int32(senderUID))
+	return si
+}
+
+// Tkill implements linux syscall tkill(2).
+//
+// args[0] is the thread ID and args[1] the signal. The signal is sent to that
+// single task, subject to mayKill.
+func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := kernel.ThreadID(args[0].Int())
+	sig := linux.Signal(args[1].Int())
+
+	// N.B. Unlike what the man page says, Linux rejects tid <= 0 with
+	// EINVAL. Not every signal call behaves this way.
+	if tid <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	receiver := t.PIDNamespace().TaskWithID(tid)
+	switch {
+	case receiver == nil:
+		return 0, nil, syserror.ESRCH
+	case !mayKill(t, receiver, sig):
+		return 0, nil, syserror.EPERM
+	}
+	return 0, nil, receiver.SendSignal(tkillSigInfo(t, receiver, sig))
+}
+
+// Tgkill implements linux syscall tgkill(2).
+//
+// args[0] is the thread group ID, args[1] the thread ID and args[2] the
+// signal. The task must exist and belong to the named thread group, otherwise
+// ESRCH is returned; delivery is subject to mayKill.
+func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tgid := kernel.ThreadID(args[0].Int())
+	tid := kernel.ThreadID(args[1].Int())
+	sig := linux.Signal(args[2].Int())
+
+	// N.B. Unlike what the man page says, Linux rejects tgid/tid <= 0 with
+	// EINVAL. Not every signal call behaves this way.
+	if tgid <= 0 || tid <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	pidns := t.PIDNamespace()
+	group := pidns.ThreadGroupWithID(tgid)
+	receiver := t.PIDNamespace().TaskWithID(tid)
+	if group == nil || receiver == nil || receiver.ThreadGroup() != group {
+		return 0, nil, syserror.ESRCH
+	}
+
+	if mayKill(t, receiver, sig) {
+		return 0, nil, receiver.SendSignal(tkillSigInfo(t, receiver, sig))
+	}
+	return 0, nil, syserror.EPERM
+}
+
+// RtSigaction implements linux syscall rt_sigaction(2).
+//
+// args[0] is the signal, args[1] an optional user pointer to the new action
+// and args[2] an optional user pointer that receives the previous action. With
+// a NULL new action the thread group's action is queried without being
+// replaced.
+func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sig := linux.Signal(args[0].Int())
+	newAddr := args[1].Pointer()
+	oldAddr := args[2].Pointer()
+
+	// Stays nil unless the caller supplied a replacement action.
+	var replacement *arch.SignalAct
+	if newAddr != 0 {
+		act, err := t.CopyInSignalAct(newAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		replacement = &act
+	}
+
+	previous, err := t.ThreadGroup().SetSignalAct(sig, replacement)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if oldAddr == 0 {
+		return 0, nil, nil
+	}
+	if err := t.CopyOutSignalAct(oldAddr, &previous); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// Sigreturn implements linux syscall sigreturn(2).
+//
+// The work is done by t.SignalReturn(false); its SyscallControl and error are
+// passed straight back to the caller. The syscall arguments are not used.
+func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ctrl, err := t.SignalReturn(false)
+	return 0, ctrl, err
+}
+
+// RtSigreturn implements linux syscall rt_sigreturn(2).
+//
+// The work is done by t.SignalReturn(true); its SyscallControl and error are
+// passed straight back to the caller. The syscall arguments are not used.
+func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ctrl, err := t.SignalReturn(true)
+	return 0, ctrl, err
+}
+
+// RtSigprocmask implements linux syscall rt_sigprocmask(2).
+//
+// args[0] is the operation (SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK), args[1] an
+// optional user pointer to the set to apply, args[2] an optional user pointer
+// that receives the mask in force before the call, and args[3] the set size,
+// which must equal linux.SignalSetSize. The operation is validated only when a
+// set is supplied.
+func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	how := args[0].Int()
+	setaddr := args[1].Pointer()
+	oldaddr := args[2].Pointer()
+	sigsetsize := args[3].SizeT()
+
+	if sigsetsize != linux.SignalSetSize {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Captured before any modification so it can be reported afterwards.
+	before := t.SignalMask()
+
+	if setaddr != 0 {
+		requested, err := copyInSigSet(t, setaddr, sigsetsize)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		updated := requested
+		switch how {
+		case linux.SIG_SETMASK:
+			// The requested set replaces the mask as-is.
+		case linux.SIG_BLOCK:
+			updated = before | requested
+		case linux.SIG_UNBLOCK:
+			updated = before &^ requested
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+		t.SetSignalMask(updated)
+	}
+
+	if oldaddr == 0 {
+		return 0, nil, nil
+	}
+	return 0, nil, copyOutSigSet(t, oldaddr, before)
+}
+
+// Sigaltstack implements linux syscall sigaltstack(2).
+//
+// args[0] optionally points to the new alternate stack and args[1] optionally
+// receives the current one. The current stack is reported first, flagged with
+// SignalStackFlagOnStack when the task is executing on it. Installing a new
+// stack while running on the current one fails with EPERM.
+func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	newAddr := args[0].Pointer()
+	oldAddr := args[1].Pointer()
+
+	if oldAddr != 0 {
+		current := t.SignalStack()
+		if t.OnSignalStack(current) {
+			current.Flags |= arch.SignalStackFlagOnStack
+		}
+		if err := t.CopyOutSignalStack(oldAddr, &current); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if newAddr == 0 {
+		return 0, nil, nil
+	}
+
+	// The stack cannot be swapped out from under a running handler.
+	if t.OnSignalStack(t.SignalStack()) {
+		return 0, nil, syserror.EPERM
+	}
+	replacement, err := t.CopyInSignalStack(newAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	if err := t.SetSignalStack(replacement); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// Pause implements linux syscall pause(2).
+//
+// The task blocks on a nil channel, so no event can wake it; the error from
+// that block is passed through syserror.ConvertIntr together with
+// kernel.ERESTARTNOHAND before being returned.
+func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+}
+
+// sigtimedwait takes one signal from mask, waiting up to timeout for it.
+//
+// A signal that is already pending is returned immediately. With a zero
+// timeout and nothing pending the result is EAGAIN. Otherwise the signals in
+// mask are temporarily unblocked while the task sleeps, and the original mask
+// is restored afterwards. The outcome is then: the taken signal; EINTR if the
+// sleep was interrupted by a signal outside mask; or EAGAIN on timeout.
+func sigtimedwait(t *kernel.Task, mask linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
+	// Fast path: one of the wanted signals is pending already.
+	if pending := t.TakeSignal(^mask); pending != nil {
+		return pending, nil
+	}
+
+	// Nothing pending and the caller does not want to wait.
+	if timeout == 0 {
+		return nil, syserror.EAGAIN
+	}
+
+	// Let the wanted signals through for the duration of the sleep only.
+	saved := t.SignalMask()
+	t.SetSignalMask(saved &^ mask)
+	_, waitErr := t.BlockWithTimeout(nil, true, timeout)
+	t.SetSignalMask(saved)
+
+	if waitErr == syserror.ErrInterrupted {
+		if got := t.TakeSignal(^mask); got != nil {
+			// Woken by one of the signals we were waiting for.
+			return got, nil
+		}
+		// Woken by some other signal.
+		return nil, syserror.EINTR
+	}
+	if waitErr == syserror.ETIMEDOUT {
+		// Deadline passed without a signal arriving.
+		return nil, syserror.EAGAIN
+	}
+
+	// The event channel given to BlockWithTimeout was nil, so a timeout and
+	// an interrupt are the only ways the block can end.
+	panic("unreachable")
+}
+
+// RtSigpending implements linux syscall rt_sigpending(2).
+//
+// The task's pending signal set is copied out to the user address in args[0];
+// any error from the copy is returned.
+func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dst := args[0].Pointer()
+	if _, err := t.CopyOut(dst, t.PendingSignals()); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// RtSigtimedwait implements linux syscall rt_sigtimedwait(2).
+func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sigset := args[0].Pointer()
+	siginfo := args[1].Pointer()
+	timespec := args[2].Pointer()
+	sigsetsize := args[3].SizeT()
+
+	mask, err := copyInSigSet(t, sigset, sigsetsize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var timeout time.Duration
+	if timespec != 0 {
+		d, err := copyTimespecIn(t, timespec)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !d.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(d.ToNsecCapped())
+	} else {
+		timeout = time.Duration(math.MaxInt64)
+	}
+
+	si, err := sigtimedwait(t, mask, timeout)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if si != nil {
+		if siginfo != 0 {
+			si.FixSignalCodeForUser()
+			if _, err := t.CopyOut(siginfo, si); err != nil {
+				return 0, nil, err
+			}
+		}
+		return uintptr(si.Signo), nil, nil
+	}
+
+	// sigtimedwait's not supposed to return nil si and err...
+	return 0, nil, nil
+}
+
+// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2).
+//
+// The siginfo at args[2] is copied in, its Signo is overwritten with args[1],
+// and it is sent as a group signal to the task with ID args[0] in the caller's
+// PID namespace. ESRCH is returned if no such task is found, and EPERM if the
+// si_code check or mayKill rejects the sender.
+func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	sig := linux.Signal(args[1].Int())
+	infoAddr := args[2].Pointer()
+
+	// Copy in the info.
+	//
+	// We must ensure that the Signo is set (Linux overrides this in the
+	// same way), and that the code is in the allowed set. This same logic
+	// appears below in RtTgsigqueueinfo and should be kept in sync.
+	var info arch.SignalInfo
+	if _, err := t.CopyIn(infoAddr, &info); err != nil {
+		return 0, nil, err
+	}
+	info.Signo = int32(sig)
+
+	// This must loop to handle the race with execve described in Kill.
+	for {
+		// Deliver to the given task's thread group.
+		target := t.PIDNamespace().TaskWithID(pid)
+		if target == nil {
+			return 0, nil, syserror.ESRCH
+		}
+
+		// If the sender is not the receiver, it can't use si_codes used by the
+		// kernel or SI_TKILL.
+		if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t {
+			return 0, nil, syserror.EPERM
+		}
+
+		if !mayKill(t, target, sig) {
+			return 0, nil, syserror.EPERM
+		}
+
+		// ESRCH from SendGroupSignal restarts the lookup; any other result,
+		// including success, is returned to the caller.
+		if err := target.SendGroupSignal(&info); err != syserror.ESRCH {
+			return 0, nil, err
+		}
+	}
+}
+
+// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2).
+//
+// The siginfo at args[3] is copied in, its Signo is overwritten with args[2],
+// and it is sent to the task with ID args[1], which must belong to the thread
+// group with ID args[0].
+func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tgid := kernel.ThreadID(args[0].Int())
+	tid := kernel.ThreadID(args[1].Int())
+	sig := linux.Signal(args[2].Int())
+	infoAddr := args[3].Pointer()
+
+	// N.B. Inconsistent with man page, linux actually rejects calls with
+	// tgid/tid <=0 by EINVAL. This isn't the same for all signal calls.
+	if tgid <= 0 || tid <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in the info and force the signal number, as RtSigqueueinfo does.
+	var info arch.SignalInfo
+	if _, err := t.CopyIn(infoAddr, &info); err != nil {
+		return 0, nil, err
+	}
+	info.Signo = int32(sig)
+
+	// Both the thread group and the task must exist, and the task must be a
+	// member of that thread group.
+	targetTG := t.PIDNamespace().ThreadGroupWithID(tgid)
+	target := t.PIDNamespace().TaskWithID(tid)
+	switch {
+	case targetTG == nil, target == nil:
+		return 0, nil, syserror.ESRCH
+	case target.ThreadGroup() != targetTG:
+		return 0, nil, syserror.ESRCH
+	}
+
+	// A sender other than the receiver can't use si_codes used by the kernel
+	// or SI_TKILL.
+	if target != t && (info.Code >= 0 || info.Code == arch.SignalInfoTkill) {
+		return 0, nil, syserror.EPERM
+	}
+
+	if !mayKill(t, target, sig) {
+		return 0, nil, syserror.EPERM
+	}
+
+	sendErr := target.SendSignal(&info)
+	return 0, nil, sendErr
+}
+
+// RtSigsuspend implements linux syscall rt_sigsuspend(2).
+func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sigset := args[0].Pointer()
+
+	// Copy in the signal mask.
+	var mask linux.SignalSet
+	if _, err := t.CopyIn(sigset, &mask); err != nil {
+		return 0, nil, err
+	}
+	mask &^= kernel.UnblockableSignals
+
+	// Swap the mask.
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask)
+
+	// Perform the wait.
+	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+}
+
+// RestartSyscall implements the linux syscall restart_syscall(2).
+//
+// If the task has a restart block it is invoked and its result returned;
+// otherwise the call fails with EINTR.
+func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	block := t.SyscallRestartBlock()
+	if block == nil {
+		// The restart block should never be nil here, but it's possible
+		// ERESTART_RESTARTBLOCK was set by ptrace without the current syscall
+		// setting up a restart block. If ptrace didn't manipulate the return
+		// value, finding a nil restart block is a bug. Linux ensures that the
+		// restart function is never null by (re)initializing it with one that
+		// translates the restart into EINTR. We'll emulate that behaviour.
+		t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?")
+		return 0, nil, syserror.EINTR
+	}
+
+	ret, err := block.Restart(t)
+	return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
new file mode 100644
index 000000000..3797c0a5d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -0,0 +1,1059 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/binary"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+)
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers up to INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset from the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+//
+// The field order and the blank padding fields line up with the byte offsets
+// used by nameLenOffset, controlLenOffset and CopyInMessageHeader64.
+type MessageHeader64 struct {
+	// Name is the optional pointer to a network address buffer.
+	Name uint64
+
+	// NameLen is the length of the buffer pointed to by Name.
+	NameLen uint32
+	_       uint32 // Padding after the 32-bit NameLen.
+
+	// Iov is a pointer to an array of io vectors that describe the memory
+	// locations involved in the io operation.
+	Iov uint64
+
+	// IovLen is the length of the array pointed to by Iov.
+	IovLen uint64
+
+	// Control is the optional pointer to ancillary control data.
+	Control uint64
+
+	// ControlLen is the length of the data pointed to by Control.
+	ControlLen uint64
+
+	// Flags on the sent/received message.
+	Flags int32
+	_     int32 // Padding after the 32-bit Flags.
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+	// msgHdr is the embedded message header.
+	msgHdr MessageHeader64
+	// msgLen is the per-message length; RecvMMsg and SendMMsg write it to user
+	// memory at offset messageHeader64Len from the start of each entry.
+	msgLen uint32
+	_      int32 // Padding after the 32-bit msgLen.
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+//
+// The first 52 bytes at addr are read and decoded field by field into msg
+// using usermem.ByteOrder. An error from the copy is returned unchanged and
+// leaves msg untouched.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+	raw := t.CopyScratchBuffer(52)
+	_, err := t.CopyInBytes(addr, raw)
+	if err != nil {
+		return err
+	}
+
+	// 64-bit fields.
+	msg.Name = usermem.ByteOrder.Uint64(raw[0:])
+	msg.Iov = usermem.ByteOrder.Uint64(raw[16:])
+	msg.IovLen = usermem.ByteOrder.Uint64(raw[24:])
+	msg.Control = usermem.ByteOrder.Uint64(raw[32:])
+	msg.ControlLen = usermem.ByteOrder.Uint64(raw[40:])
+
+	// 32-bit fields.
+	msg.NameLen = usermem.ByteOrder.Uint32(raw[8:])
+	msg.Flags = int32(usermem.ByteOrder.Uint32(raw[48:]))
+
+	return nil
+}
+
+// CaptureAddress allocates memory for and copies a socket address structure
+// from the untrusted address space range.
+//
+// Addresses longer than maxAddrLen are rejected with EINVAL before anything is
+// allocated.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+	if addrlen > maxAddrLen {
+		return nil, syscall.EINVAL
+	}
+
+	captured := make([]byte, addrlen)
+	_, err := t.CopyInBytes(addr, captured)
+	if err != nil {
+		return nil, err
+	}
+	return captured, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the untrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+//
+// The caller-provided buffer length is read from addrLenPtr first; a value
+// that is negative when viewed as an int32 yields EINVAL. addrLen is then
+// always written back to addrLenPtr, and the address bytes are only written
+// when addr is non-nil.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+	// Find out how much room the caller gave us.
+	var userLen uint32
+	if _, err := t.CopyIn(addrLenPtr, &userLen); err != nil {
+		return err
+	}
+	if int32(userLen) < 0 {
+		return syscall.EINVAL
+	}
+
+	// The real address length is reported unconditionally.
+	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+		return err
+	}
+
+	if addr == nil {
+		return nil
+	}
+
+	// Write the smallest of: the caller's buffer, the reported address
+	// length, and the size of the encoded address.
+	encoded := binary.Marshal(nil, usermem.ByteOrder, addr)
+	n := userLen
+	if addrLen < n {
+		n = addrLen
+	}
+	if encLen := uint32(len(encoded)); encLen < n {
+		n = encLen
+	}
+
+	_, err := t.CopyOutBytes(addrPtr, encoded[:int(n)])
+	return err
+}
+
+// Socket implements the linux syscall socket(2).
+//
+// The low four bits of the type argument select the socket type; the only
+// other bits accepted are SOCK_NONBLOCK and SOCK_CLOEXEC. The new file
+// descriptor is returned on success.
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+
+	// Anything beyond the type bits and the two supported flags is invalid.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	nonBlocking := stype&linux.SOCK_NONBLOCK != 0
+	closeOnExec := stype&linux.SOCK_CLOEXEC != 0
+
+	// Create the new socket.
+	sock, serr := socket.New(t, domain, unix.SockType(stype&0xf), protocol)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+	sock.SetFlags(fs.SettableFileFlags{NonBlocking: nonBlocking})
+	// Drop the reference obtained from socket.New when this function returns.
+	defer sock.DecRef()
+
+	// Install it in the FD map.
+	fd, err := t.FDMap().NewFDFrom(0, sock, kernel.FDFlags{CloseOnExec: closeOnExec}, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+//
+// A connected pair of sockets is created, a file descriptor is allocated for
+// each, and the two descriptors are written as int32 values to the user
+// address in args[3]. Descriptors that were already allocated are removed
+// again if a later step fails.
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+	socks := args[3].Pointer()
+
+	// Anything beyond the type bits and the two supported flags is invalid.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	nonBlocking := stype&linux.SOCK_NONBLOCK != 0
+	closeOnExec := stype&linux.SOCK_CLOEXEC != 0
+
+	// Create the socket pair.
+	first, second, serr := socket.Pair(t, domain, unix.SockType(stype&0xf), protocol)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+	first.SetFlags(fs.SettableFileFlags{NonBlocking: nonBlocking})
+	second.SetFlags(fs.SettableFileFlags{NonBlocking: nonBlocking})
+	defer first.DecRef()
+	defer second.DecRef()
+
+	// Create the FDs for the sockets.
+	//
+	// NOTE(review): the results of FDMap().Remove are discarded on the error
+	// paths below; if Remove hands a file reference back to its caller it
+	// should be released here - confirm against FDMap.Remove.
+	fd1, err := t.FDMap().NewFDFrom(0, first, kernel.FDFlags{CloseOnExec: closeOnExec}, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+	fd2, err := t.FDMap().NewFDFrom(0, second, kernel.FDFlags{CloseOnExec: closeOnExec}, t.ThreadGroup().Limits())
+	if err != nil {
+		t.FDMap().Remove(fd1)
+		return 0, nil, err
+	}
+
+	// Copy the file descriptors out.
+	pair := []int32{int32(fd1), int32(fd2)}
+	if _, err := t.CopyOut(socks, pair); err != nil {
+		t.FDMap().Remove(fd1)
+		t.FDMap().Remove(fd2)
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+//
+// EBADF is returned if args[0] is not an open descriptor and ENOTSOCK if it
+// does not refer to a socket. The connect blocks unless the file is marked
+// non-blocking.
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Pull the destination address in from user memory.
+	sockAddr, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	connErr := sock.Connect(t, sockAddr, !file.Flags().NonBlocking).ToError()
+	return 0, nil, syserror.ConvertIntr(connErr, kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+//
+// flags may only contain SOCK_NONBLOCK and SOCK_CLOEXEC. The peer address is
+// requested from the socket, and written back to the caller, only when addrLen
+// is non-null. The new file descriptor is returned on success.
+func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+	// Refuse flags we don't support.
+	if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, syscall.ENOTSOCK
+	}
+
+	blocking := !file.Flags().NonBlocking
+	wantPeer := addrLen != 0
+
+	newFD, peer, peerLen, aerr := sock.Accept(t, wantPeer, flags, blocking)
+	if aerr != nil {
+		return 0, syserror.ConvertIntr(aerr.ToError(), kernel.ERESTARTSYS)
+	}
+	if !wantPeer {
+		return uintptr(newFD), nil
+	}
+
+	// NOTE: Linux does not give you an error if it can't
+	// write the data back out so neither do we. Only EINVAL from
+	// writeAddress is reported.
+	//
+	// NOTE(review): on the EINVAL path the descriptor returned by Accept is
+	// presumably still installed - confirm whether it should be released.
+	if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL {
+		return 0, err
+	}
+	return uintptr(newFD), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+//
+// Arguments are the listening descriptor, the user address for the peer
+// address, the user address of its length, and the flags.
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	newFD, err := accept(
+		t,
+		kdefs.FD(args[0].Int()),
+		args[1].Pointer(),
+		args[2].Pointer(),
+		int(args[3].Int()),
+	)
+	return newFD, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+//
+// It is accept with no flags set.
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	newFD, err := accept(
+		t,
+		kdefs.FD(args[0].Int()),
+		args[1].Pointer(),
+		args[2].Pointer(),
+		0,
+	)
+	return newFD, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+//
+// EBADF is returned if args[0] is not an open descriptor and ENOTSOCK if it
+// does not refer to a socket.
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Pull the local address in from user memory and bind to it.
+	sockAddr, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	bindErr := sock.Bind(t, sockAddr).ToError()
+	return 0, nil, bindErr
+}
+
+// Listen implements the linux syscall listen(2).
+//
+// A backlog that is zero or negative is replaced by minListenBacklog and one
+// above maxListenBacklog is reduced to maxListenBacklog.
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	backlog := args[1].Int()
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Per Linux, the backlog is silently capped to reasonable values.
+	switch {
+	case backlog <= 0:
+		backlog = minListenBacklog
+	case backlog > maxListenBacklog:
+		backlog = maxListenBacklog
+	}
+
+	listenErr := sock.Listen(t, int(backlog)).ToError()
+	return 0, nil, listenErr
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+//
+// The how argument must be one of SHUT_RD, SHUT_WR or SHUT_RDWR; any other
+// value yields EINVAL. The descriptor is checked before how is.
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	how := args[1].Int()
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Validate how before handing it to the socket.
+	if how != linux.SHUT_RD && how != linux.SHUT_WR && how != linux.SHUT_RDWR {
+		return 0, nil, syscall.EINVAL
+	}
+
+	shutErr := sock.Shutdown(t, int(how)).ToError()
+	return 0, nil, shutErr
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLenAddr := args[4].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Read the length if present. Reject negative values.
+	optLen := int32(0)
+	if optLenAddr != 0 {
+		if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+			return 0, nil, err
+		}
+
+		if optLen < 0 {
+			return 0, nil, syscall.EINVAL
+		}
+	}
+
+	// Call syscall implementation then copy both value and value len out.
+	v, e := s.GetSockOpt(t, int(level), int(name), int(optLen))
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+
+	if optLenAddr != 0 {
+		vLen := int32(binary.Size(v))
+		if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if v != nil {
+		if _, err := t.CopyOut(optValAddr, v); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return 0, nil, nil
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+//
+// The option length (args[4]) must be in the range [1, maxOptLen]; anything
+// else yields EINVAL. That many bytes are read from the user address in
+// args[3] and passed to the socket as the option value.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLen := args[4].Int()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	if optLen <= 0 || optLen > maxOptLen {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// The option value is an opaque byte string, so copy it in as raw bytes
+	// (as CaptureAddress does) instead of going through the generic decoding
+	// path of CopyIn.
+	buf := make([]byte, optLen)
+	if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
+		return 0, nil, err
+	}
+
+	// Call syscall implementation.
+	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+//
+// The socket's own address is obtained and handed to writeAddress together
+// with the user address buffer (args[1]) and length pointer (args[2]).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Fetch the local name, then copy it to the caller.
+	name, nameLen, serr := sock.GetSockName(t)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+
+	writeErr := writeAddress(t, name, nameLen, addr, addrlen)
+	return 0, nil, writeErr
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+//
+// The address of the socket's peer is obtained and handed to writeAddress
+// together with the user address buffer (args[1]) and length pointer
+// (args[2]).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Fetch the peer name, then copy it to the caller.
+	name, nameLen, serr := sock.GetPeerName(t)
+	if serr != nil {
+		return 0, nil, serr.ToError()
+	}
+
+	writeErr := writeAddress(t, name, nameLen, addr, addrlen)
+	return 0, nil, writeErr
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := recvSingleMsg(t, s, msgPtr, flags, false, ktime.Time{})
+	return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+//
+// Up to vlen (args[2]) entries of the user array at args[1] are filled in
+// turn. The first failure stops the loop; it is reported to the caller only if
+// no message had been received before it, otherwise the number of received
+// messages is returned. A timespec at args[4], if given, is turned into a
+// deadline on the monotonic clock; otherwise the socket's receive timeout is
+// used when it is non-zero.
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+	toPtr := args[4].Pointer()
+
+	// We only handle 64-bit for now.
+	if t.Arch().Width() != 8 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// A non-blocking file behaves as if MSG_DONTWAIT had been passed.
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Work out the deadline: an explicit timeout wins over the socket's own
+	// receive timeout.
+	var (
+		deadline     ktime.Time
+		haveDeadline bool
+	)
+	if toPtr != 0 {
+		ts, err := copyTimespecIn(t, toPtr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !ts.Valid() {
+			return 0, nil, syscall.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+		haveDeadline = true
+	} else if dl := sock.RecvTimeout(); dl != 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	}
+
+	var (
+		received uint32
+		lastErr  error
+	)
+	for i := uint64(0); i < uint64(vlen); i++ {
+		// Address of the i-th entry of the user array.
+		entry, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		n, err := recvSingleMsg(t, sock, entry, flags, haveDeadline, deadline)
+		if err != nil {
+			lastErr = err
+			break
+		}
+
+		// Copy the received length to the caller; it lives right after the
+		// embedded message header.
+		lenAddr, ok := entry.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		if _, err := t.CopyOut(lenAddr, uint32(n)); err != nil {
+			lastErr = err
+			break
+		}
+		received++
+	}
+
+	if received == 0 {
+		return 0, nil, lastErr
+	}
+	return uintptr(received), nil, nil
+}
+
+// recvSingleMsg receives one message on s into the buffers described by the
+// msghdr at msgPtr and returns the number of bytes received.
+//
+// When the header asks for neither a sender address nor control data, only
+// the payload is received. Otherwise the sender address is written back via
+// writeAddress, and the packed control data and its length are written to the
+// header's control buffer and ControlLen field.
+func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+	// Read the user's header and build the destination from its io vectors.
+	var hdr MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &hdr); err != nil {
+		return 0, err
+	}
+	if hdr.IovLen > linux.UIO_MAXIOV {
+		return 0, syscall.EMSGSIZE
+	}
+	dst, err := t.IovecsIOSequence(usermem.Addr(hdr.Iov), int(hdr.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// FIXME: Pretend we have an empty error queue.
+	if flags&linux.MSG_ERRQUEUE != 0 {
+		return 0, syscall.EAGAIN
+	}
+
+	wantSender := hdr.NameLen != 0
+
+	// Fast path when no control message nor name buffers are provided.
+	if hdr.ControlLen == 0 && !wantSender {
+		n, _, _, _, rerr := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+		if rerr != nil {
+			return 0, syserror.ConvertIntr(rerr.ToError(), kernel.ERESTARTSYS)
+		}
+		return uintptr(n), nil
+	}
+
+	// Bound the control buffer before allocating for it.
+	if hdr.ControlLen > maxControlLen {
+		return 0, syscall.ENOBUFS
+	}
+	n, sender, senderLen, cms, rerr := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, wantSender, hdr.ControlLen)
+	if rerr != nil {
+		return 0, syserror.ConvertIntr(rerr.ToError(), kernel.ERESTARTSYS)
+	}
+	defer cms.Release()
+
+	// Pack the control messages: credentials first (only for sockets that
+	// have Passcred enabled), then rights.
+	packed := make([]byte, 0, hdr.ControlLen)
+	if cr, ok := s.(unix.Credentialer); ok && cr.Passcred() {
+		creds, _ := cms.Credentials.(control.SCMCredentials)
+		packed = control.PackCredentials(t, creds, packed)
+	}
+	if cms.Rights != nil {
+		closeOnExec := flags&linux.MSG_CMSG_CLOEXEC != 0
+		packed = control.PackRights(t, cms.Rights.(control.SCMRights), closeOnExec, packed)
+	}
+
+	// Copy the address to the caller.
+	if wantSender {
+		if err := writeAddress(t, sender, senderLen, usermem.Addr(hdr.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the control data length, and then the data itself if there is
+	// any, to the caller.
+	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(packed))); err != nil {
+		return 0, err
+	}
+	if len(packed) == 0 {
+		return uintptr(n), nil
+	}
+	if _, err := t.CopyOut(usermem.Addr(hdr.Control), packed); err != nil {
+		return 0, err
+	}
+	return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+//
+// The buffer length and flags are validated before the descriptor is looked
+// up. The sender's address is requested and written back only when nameLenPtr
+// is non-null. The socket's receive timeout, when non-zero, becomes a
+// deadline on the monotonic clock.
+func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+	// A length that is negative as an int is invalid.
+	if int(bufLen) < 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC) != 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, syscall.ENOTSOCK
+	}
+
+	// A non-blocking file behaves as if MSG_DONTWAIT had been passed.
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	var (
+		deadline     ktime.Time
+		haveDeadline bool
+	)
+	if dl := sock.RecvTimeout(); dl != 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	}
+
+	wantSender := nameLenPtr != 0
+	n, sender, senderLen, cm, rerr := sock.RecvMsg(t, dst, int(flags), haveDeadline, deadline, wantSender, 0)
+	// Control messages are never passed on by recvfrom; release them before
+	// looking at the result.
+	cm.Release()
+	if rerr != nil {
+		return 0, syserror.ConvertIntr(rerr.ToError(), kernel.ERESTARTSYS)
+	}
+
+	// Copy the address to the caller.
+	if wantSender {
+		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+			return 0, err
+		}
+	}
+	return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+//
+// Arguments are the descriptor, the buffer address and length, the flags, and
+// the user addresses for the sender's address and its length.
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	received, err := recvFrom(
+		t,
+		kdefs.FD(args[0].Int()),
+		args[1].Pointer(),
+		args[2].Uint64(),
+		args[3].Int(),
+		args[4].Pointer(),
+		args[5].Pointer(),
+	)
+	return received, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+//
+// Checks are made in this order: architecture width, descriptor validity,
+// socket type, then flags. MSG_DONTWAIT is added to the flags when the file is
+// non-blocking. The send itself is done by sendSingleMsg.
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	// We only handle 64-bit for now.
+	if t.Arch().Width() != 8 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// A non-blocking file behaves as if MSG_DONTWAIT had been passed.
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	sent, err := sendSingleMsg(t, sock, file, msgPtr, flags)
+	return sent, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+//
+// Up to vlen (args[2]) entries of the user array at args[1] are sent in turn.
+// The first failure stops the loop; it is reported to the caller only if no
+// message had been sent before it, otherwise the number of sent messages is
+// returned.
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+
+	// We only handle 64-bit for now.
+	if t.Arch().Width() != 8 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// A non-blocking file behaves as if MSG_DONTWAIT had been passed.
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var (
+		sent    uint32
+		lastErr error
+	)
+	for i := uint64(0); i < uint64(vlen); i++ {
+		// Address of the i-th entry of the user array.
+		entry, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		n, err := sendSingleMsg(t, sock, file, entry, flags)
+		if err != nil {
+			lastErr = err
+			break
+		}
+
+		// Copy the sent length to the caller; it lives right after the
+		// embedded message header.
+		lenAddr, ok := entry.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		if _, err := t.CopyOut(lenAddr, uint32(n)); err != nil {
+			lastErr = err
+			break
+		}
+		sent++
+	}
+
+	if sent == 0 {
+		return 0, nil, lastErr
+	}
+	return uintptr(sent), nil, nil
+}
+
+// sendSingleMsg sends the message described by the msghdr at msgPtr on s and
+// returns the number of bytes sent.
+//
+// The header's control buffer (at most maxControlLen bytes, else ENOBUFS),
+// optional destination address and io vectors (at most UIO_MAXIOV, else
+// EMSGSIZE) are captured from user memory before the socket is called. The
+// parsed control messages are released here only when the send reports an
+// error.
+func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+	// Capture the message header.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	var controlData []byte
+	if msg.ControlLen > 0 {
+		// Put an upper bound to prevent large allocations.
+		if msg.ControlLen > maxControlLen {
+			return 0, syscall.ENOBUFS
+		}
+		// The control buffer is an opaque byte string that can be several
+		// megabytes long, so copy it in as raw bytes (as CaptureAddress
+		// does) instead of going through the generic decoding path of
+		// CopyIn.
+		controlData = make([]byte, msg.ControlLen)
+		if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	if msg.NameLen != 0 {
+		var err error
+		to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read data then call the sendmsg implementation.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syscall.EMSGSIZE
+	}
+	src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	controlMessages, err := control.Parse(t, s, controlData)
+	if err != nil {
+		return 0, err
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), controlMessages)
+	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	if err != nil {
+		// The send failed, so the control messages are released here.
+		controlMessages.Release()
+	}
+	return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+//
+// A destination address is captured only when namePtr is non-null. The
+// message is sent with a fresh set of control messages from control.New, and
+// the result is passed through handleIOError.
+func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+	// A length that is negative as an int is invalid.
+	length := int(bufLen)
+	if length < 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Look up the file behind the descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// It has to be a socket.
+	sock, isSocket := file.FileOperations.(socket.Socket)
+	if !isSocket {
+		return 0, syscall.ENOTSOCK
+	}
+
+	// A non-blocking file behaves as if MSG_DONTWAIT had been passed.
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Read the destination address if one is specified.
+	var dest []byte
+	if namePtr != 0 {
+		captured, err := CaptureAddress(t, namePtr, nameLen)
+		if err != nil {
+			return 0, err
+		}
+		dest = captured
+	}
+
+	src, err := t.SingleIOSequence(bufPtr, length, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// Call the syscall implementation.
+	n, serr := sock.SendMsg(t, src, dest, int(flags), control.New(t, sock, nil))
+	ioErr := handleIOError(t, n != 0, serr.ToError(), kernel.ERESTARTSYS, "sendto", file)
+	return uintptr(n), ioErr
+}
+
+// SendTo implements the linux syscall sendto(2).
+//
+// Arguments are the descriptor, the buffer address and length, the flags, and
+// the destination address and its length.
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sent, err := sendTo(
+		t,
+		kdefs.FD(args[0].Int()),
+		args[1].Pointer(),
+		args[2].Uint64(),
+		args[3].Int(),
+		args[4].Pointer(),
+		args[5].Uint(),
+	)
+	return sent, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
new file mode 100644
index 000000000..6e21b34fd
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -0,0 +1,209 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Stat implements linux syscall stat(2).
+//
+// args[0] is the user address of the path and args[1] the user address that
+// receives the stat structure. The path is looked up relative to AT_FDCWD
+// with resolve set to true, and the result is written by stat.
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, statAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	writeStat := func(root *fs.Dirent, d *fs.Dirent) error {
+		return stat(t, d, dirPath, statAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, writeStat)
+}
+
+// Fstatat implements linux syscall newfstatat, i.e. fstatat(2).
+//
+// args[0] is the directory file descriptor, args[1] the user address of the
+// path, args[2] the user address that receives the stat structure and args[3]
+// the flags. AT_EMPTY_PATH permits an empty path, in which case the file
+// behind the descriptor itself is examined; AT_SYMLINK_NOFOLLOW turns off the
+// resolve argument passed to fileOpOn.
+func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	pathAddr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	allowEmpty := flags&linux.AT_EMPTY_PATH != 0
+	path, dirPath, err := copyInPath(t, pathAddr, allowEmpty)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Common case: a real path, looked up relative to dirFD.
+	if path != "" {
+		resolve := flags&linux.AT_SYMLINK_NOFOLLOW == 0
+		return 0, nil, fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
+			return stat(t, d, dirPath, statAddr)
+		})
+	}
+
+	// Empty path: behave like fstat on the descriptor.
+	f := t.FDMap().GetFile(dirFD)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, stat(t, f.Dirent, false /* dirPath */, statAddr)
+}
+
+// Lstat implements linux syscall lstat(2).
+//
+// It takes the same arguments as Stat, but calls fileOpOn with resolve set to
+// false.
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, statAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	writeStat := func(root *fs.Dirent, d *fs.Dirent) error {
+		return stat(t, d, dirPath, statAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, false /* resolve */, writeStat)
+}
+
+// Fstat implements linux syscall fstat(2).
+//
+// args[0] is the file descriptor and args[1] the user address that receives
+// the stat structure. EBADF is returned when the descriptor is not open.
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	statAddr := args[1].Pointer()
+
+	f := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	err := stat(t, f.Dirent, false /* dirPath */, statAddr)
+	return 0, nil, err
+}
+
+// stat fills a linux.Stat from the attributes of d's inode and copies it out
+// to statAddr in t's address space.
+//
+// When dirPath is true and d is not a directory, ENOTDIR is returned. Errors
+// from reading the unstable attributes or from the copy-out are returned
+// unchanged.
+func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) error {
+	inode := d.Inode
+	if dirPath && !fs.IsDir(inode.StableAttr) {
+		return syserror.ENOTDIR
+	}
+	uattr, err := inode.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+
+	// Translate the inode type into the file-type bits of st_mode. Types not
+	// listed here contribute no type bits.
+	var typeBits uint32
+	switch inode.StableAttr.Type {
+	case fs.RegularFile, fs.SpecialFile:
+		typeBits = linux.ModeRegular
+	case fs.Symlink:
+		typeBits = linux.ModeSymlink
+	case fs.Directory, fs.SpecialDirectory:
+		typeBits = linux.ModeDirectory
+	case fs.Pipe:
+		typeBits = linux.ModeNamedPipe
+	case fs.CharacterDevice:
+		typeBits = linux.ModeCharacterDevice
+	case fs.BlockDevice:
+		typeBits = linux.ModeBlockDevice
+	case fs.Socket:
+		typeBits = linux.ModeSocket
+	}
+
+	// Owner IDs are reported as seen from t's user namespace.
+	st := linux.Stat{
+		Dev:     uint64(inode.StableAttr.DeviceID),
+		Rdev:    uint64(linux.MakeDeviceID(inode.StableAttr.DeviceFileMajor, inode.StableAttr.DeviceFileMinor)),
+		Ino:     uint64(inode.StableAttr.InodeID),
+		Nlink:   uattr.Links,
+		Mode:    typeBits | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Size:    uattr.Size,
+		Blksize: inode.StableAttr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+	_, err = t.CopyOut(statAddr, st)
+	return err
+}
+
+// Statfs implements linux syscall statfs(2).
+//
+// args[0] is the user address of the path and args[1] the user address that
+// receives the statfs structure. The path is looked up relative to AT_FDCWD
+// with resolve set to true.
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, statfsAddr := args[0].Pointer(), args[1].Pointer()
+
+	path, _, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	writeStatfs := func(root *fs.Dirent, d *fs.Dirent) error {
+		return statfsImpl(t, d, statfsAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, writeStatfs)
+}
+
+// Fstatfs implements linux syscall fstatfs(2).
+//
+// args[0] is the file descriptor and args[1] the user address that receives
+// the statfs structure. EBADF is returned when the descriptor is not open.
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	statfsAddr := args[1].Pointer()
+
+	f := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	err := statfsImpl(t, f.Dirent, statfsAddr)
+	return 0, nil, err
+}
+
+// statfsImpl is the common body of statfs and fstatfs. It queries the
+// filesystem information of d's inode, builds a linux.Statfs from it and
+// copies that structure out to addr. Any error from the query or from the
+// copy-out is returned.
+func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
+	info, err := d.Inode.StatFS(t)
+	if err != nil {
+		return err
+	}
+
+	// Block size and fragment size are reported as the same value, since
+	// most consumers of this structure expect one or the other to be set.
+	blockSize := d.Inode.StableAttr.BlockSize
+
+	out := linux.Statfs{
+		Type:      info.Type,
+		BlockSize: blockSize,
+		Blocks:    info.TotalBlocks,
+		// There is no concept of reserved blocks here, so free and
+		// available blocks are identical. Other filesystems (udf,
+		// hugetlbfs, tmpfs, ...) do the same.
+		BlocksFree:      info.FreeBlocks,
+		BlocksAvailable: info.FreeBlocks,
+		Files:           info.TotalFiles,
+		FilesFree:       info.FreeFiles,
+		// Same as Linux for simple_statfs, see fs/libfs.c.
+		NameLength:   syscall.PathMax,
+		FragmentSize: blockSize,
+		// The remaining fields stay 0, like simple_statfs.
+	}
+	_, err = t.CopyOut(addr, &out)
+	return err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
new file mode 100644
index 000000000..902d210db
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -0,0 +1,75 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Sync implements linux system call sync(2).
+//
+// It asks t's mount namespace to sync everything and always reports success.
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mns := t.MountNamespace()
+	mns.SyncAll(t)
+	return 0, nil, nil
+}
+
+// Syncfs implements linux system call syncfs(2).
+//
+// The descriptor in args[0] is only validated (EBADF when it is not open);
+// the actual work is a full Sync.
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	f := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// "Sync-the-world" is used for now; fd is guaranteed to be at least on
+	// the root filesystem.
+	return Sync(t, args)
+}
+
+// Fsync implements linux syscall fsync(2).
+//
+// The file behind args[0] is synced over the range [0, fs.FileMaxOffset] with
+// fs.SyncAll. The resulting error is passed through syserror.ConvertIntr with
+// kernel.ERESTARTSYS.
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	f := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	syncErr := f.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll)
+	return 0, nil, syserror.ConvertIntr(syncErr, kernel.ERESTARTSYS)
+}
+
+// Fdatasync implements linux syscall fdatasync(2).
+//
+// The file behind args[0] is synced over the range [0, fs.FileMaxOffset] with
+// fs.SyncData. The resulting error is passed through syserror.ConvertIntr
+// with kernel.ERESTARTSYS.
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	f := t.FDMap().GetFile(kdefs.FD(args[0].Int()))
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	syncErr := f.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData)
+	return 0, nil, syserror.ConvertIntr(syncErr, kernel.ERESTARTSYS)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
new file mode 100644
index 000000000..bd0ffcd5c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -0,0 +1,42 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo.
+func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	mem := t.Kernel().Platform.Memory()
+	mem.UpdateUsage()
+	_, totalUsage := usage.MemoryAccounting.Copy()
+	totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage)
+
+	// Only a subset of the fields in sysinfo_t make sense to return.
+	si := linux.Sysinfo{
+		Procs:    uint16(len(t.PIDNamespace().Tasks())),
+		Uptime:   t.Kernel().MonotonicClock().Now().Seconds(),
+		TotalRAM: totalSize,
+		FreeRAM:  totalSize - totalUsage,
+	}
+	_, err := t.CopyOut(addr, si)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go
new file mode 100644
index 000000000..792040c81
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_syslog.go
@@ -0,0 +1,61 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+	_SYSLOG_ACTION_READ_ALL    = 3  // Syslog copies the log out to the caller's buffer.
+	_SYSLOG_ACTION_SIZE_BUFFER = 10 // Syslog returns logBufLen.
+)
+
+// logBufLen is the default syslog buffer size on Linux; Syslog reports it as the buffer size and caps reads at it.
+const logBufLen = 1 << 17
+
+// Syslog implements part of Linux syscall syslog.
+//
+// Only the unprivileged commands are implemented, allowing applications to
+// read a fun dmesg:
+//
+//   - _SYSLOG_ACTION_READ_ALL copies at most min(size, logBufLen) bytes of the
+//     log to buf and returns the number of bytes copied; a negative size
+//     yields EINVAL.
+//   - _SYSLOG_ACTION_SIZE_BUFFER returns logBufLen.
+//
+// Any other command yields ENOSYS.
+func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	command := args[0].Int()
+	buf := args[1].Pointer()
+	size := int(args[2].Int())
+
+	if command == _SYSLOG_ACTION_SIZE_BUFFER {
+		return logBufLen, nil, nil
+	}
+	if command != _SYSLOG_ACTION_READ_ALL {
+		return 0, nil, syserror.ENOSYS
+	}
+
+	// _SYSLOG_ACTION_READ_ALL from here on.
+	if size < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	limit := size
+	if limit > logBufLen {
+		limit = logBufLen
+	}
+
+	contents := t.Kernel().Syslog().Log()
+	if len(contents) > limit {
+		contents = contents[:limit]
+	}
+
+	copied, err := t.CopyOutBytes(buf, contents)
+	return uintptr(copied), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
new file mode 100644
index 000000000..0adbf160f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -0,0 +1,704 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+	// ExecMaxTotalSize is the maximum total length of the entries of an argv or envv vector; Execve applies it to each vector separately.
+	//
+	// N.B. The behavior here differs from that of Linux. Linux provides a limit on
+	// individual arguments of 32 pages, and an aggregate limit of at least 32 pages
+	// but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
+	// any behavior based on the stack size, and instead provide a fixed hard limit of
+	// 2 MB (which should work well given that 8 MB stack limits are common).
+	ExecMaxTotalSize = 2 * 1024 * 1024
+
+	// ExecMaxElemSize is the maximum length of a single argv or envv entry.
+	ExecMaxElemSize = 32 * usermem.PageSize
+
+	// exitSignalMask selects the bits of the clone flags that hold the signal to be sent at exit. Same as CSIGNAL in Linux.
+	exitSignalMask = 0xff
+)
+
+// Possible values for the idtype argument to waitid(2), defined in Linux's
+// include/uapi/linux/wait.h.
+const (
+	_P_ALL  = 0 // Waitid applies no ID filter.
+	_P_PID  = 1 // Waitid treats id as a thread ID.
+	_P_PGID = 2 // Waitid treats id as a process group ID.
+)
+
+// Getppid implements linux syscall getppid(2).
+//
+// The result is the ID, in t's PID namespace, of the parent's thread group,
+// or 0 when t has no parent.
+func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	if parent := t.Parent(); parent != nil {
+		ppid := t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())
+		return uintptr(ppid), nil, nil
+	}
+	return 0, nil, nil
+}
+
+// Getpid implements linux syscall getpid(2).
+//
+// The result is the ID of t's thread group.
+func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tgid := t.ThreadGroup().ID()
+	return uintptr(tgid), nil, nil
+}
+
+// Gettid implements linux syscall gettid(2).
+//
+// The result is t's own thread ID.
+func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := t.ThreadID()
+	return uintptr(tid), nil, nil
+}
+
+// Execve implements linux syscall execve(2).
+//
+// args[0] is the user address of the filename, args[1] that of the argv
+// vector and args[2] that of the envv vector. A zero vector address yields an
+// empty (nil) vector. After the arguments have been copied in, a new task
+// image is loaded and installed via t.Execve.
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	filenameAddr, argvAddr, envvAddr := args[0].Pointer(), args[1].Pointer(), args[2].Pointer()
+
+	// Copy in the filename, then each vector that was supplied.
+	filename, err := t.CopyInString(filenameAddr, syscall.PathMax)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var argv []string
+	if argvAddr != 0 {
+		if argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize); err != nil {
+			return 0, nil, err
+		}
+	}
+	var envv []string
+	if envvAddr != 0 {
+		if envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// References on the root and working directories are held until return.
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+	wd := t.FSContext().WorkingDirectory()
+	defer wd.DecRef()
+
+	// Load the new TaskContext.
+	image, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, linux.MaxSymlinkTraversals, filename, argv, envv, t.Arch().FeatureSet())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	ctrl, err := t.Execve(image)
+	return 0, ctrl, err
+}
+
+// Exit implements linux syscall exit(2).
+//
+// args[0] becomes the exit code passed to t.PrepareExit; the returned
+// control value is kernel.CtrlDoExit.
+func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	es := kernel.ExitStatus{Code: int(args[0].Int())}
+	t.PrepareExit(es)
+	return 0, kernel.CtrlDoExit, nil
+}
+
+// ExitGroup implements linux syscall exit_group(2).
+//
+// args[0] becomes the exit code passed to t.PrepareGroupExit; the returned
+// control value is kernel.CtrlDoExit.
+func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	es := kernel.ExitStatus{Code: int(args[0].Int())}
+	t.PrepareGroupExit(es)
+	return 0, kernel.CtrlDoExit, nil
+}
+
+// clone translates clone(2)-style flags and pointers into
+// kernel.CloneOptions and calls t.Clone. It is used by Clone, Fork, and
+// Vfork.
+//
+// The CLONE_VM, CLONE_SIGHAND, CLONE_THREAD, CLONE_FILES and CLONE_FS flags
+// request sharing, so the corresponding "New..." option is set when the flag
+// is absent. The CLONE_NEW* flags request a new namespace directly. The
+// termination signal is taken from the bits of flags selected by
+// exitSignalMask.
+func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) {
+	// isSet reports whether the given single-bit flag is present in flags.
+	isSet := func(bit int) bool { return flags&bit != 0 }
+
+	opts := kernel.CloneOptions{
+		SharingOptions: kernel.SharingOptions{
+			NewAddressSpace:     !isSet(syscall.CLONE_VM),
+			NewSignalHandlers:   !isSet(syscall.CLONE_SIGHAND),
+			NewThreadGroup:      !isSet(syscall.CLONE_THREAD),
+			TerminationSignal:   linux.Signal(flags & exitSignalMask),
+			NewPIDNamespace:     isSet(syscall.CLONE_NEWPID),
+			NewUserNamespace:    isSet(syscall.CLONE_NEWUSER),
+			NewNetworkNamespace: isSet(syscall.CLONE_NEWNET),
+			NewFiles:            !isSet(syscall.CLONE_FILES),
+			NewFSContext:        !isSet(syscall.CLONE_FS),
+			NewUTSNamespace:     isSet(syscall.CLONE_NEWUTS),
+			NewIPCNamespace:     isSet(syscall.CLONE_NEWIPC),
+		},
+		Stack:         stack,
+		SetTLS:        isSet(syscall.CLONE_SETTLS),
+		TLS:           tls,
+		ChildClearTID: isSet(syscall.CLONE_CHILD_CLEARTID),
+		ChildSetTID:   isSet(syscall.CLONE_CHILD_SETTID),
+		ChildTID:      childTID,
+		ParentSetTID:  isSet(syscall.CLONE_PARENT_SETTID),
+		ParentTID:     parentTID,
+		Vfork:         isSet(syscall.CLONE_VFORK),
+		Untraced:      isSet(syscall.CLONE_UNTRACED),
+		InheritTracer: isSet(syscall.CLONE_PTRACE),
+	}
+
+	newTID, ctrl, err := t.Clone(&opts)
+	return uintptr(newTID), ctrl, err
+}
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors. We implement the default one in the
+// current linux 3.11 x86_64:
+//    sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := int(args[0].Int())
+	stack := args[1].Pointer()
+	parentTID := args[2].Pointer()
+	childTID := args[3].Pointer()
+	tls := args[4].Pointer()
+	return clone(t, flags, stack, parentTID, childTID, tls)
+}
+
+// Fork implements Linux syscall fork(2).
+func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// "A call to fork() is equivalent to a call to clone(2) specifying flags
+	// as just SIGCHLD." - fork(2)
+	return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0)
+}
+
+// Vfork implements Linux syscall vfork(2).
+func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// """
+	// A call to vfork() is equivalent to calling clone(2) with flags specified as:
+	//
+	//     CLONE_VM | CLONE_VFORK | SIGCHLD
+	// """ - vfork(2)
+	return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0)
+}
+
+// wait4 waits for the given child process to exit.
+func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) {
+	if options&^(syscall.WNOHANG|syscall.WUNTRACED|syscall.WCONTINUED|syscall.WALL|syscall.WCLONE) != 0 {
+		return 0, syscall.EINVAL
+	}
+	wopts := kernel.WaitOptions{
+		Events:       kernel.EventExit | kernel.EventTraceeStop,
+		ConsumeEvent: true,
+	}
+	// There are four cases to consider:
+	//
+	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
+	// pid == -1   any child process
+	// pid == 0    any child process whose process group ID is equal to that of the calling process
+	// pid > 0     the child whose process ID is equal to the value of pid
+	switch {
+	case pid < -1:
+		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
+	case pid == -1:
+		// Any process is the default.
+	case pid == 0:
+		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
+	default:
+		wopts.SpecificTID = kernel.ThreadID(pid)
+	}
+
+	switch options & (syscall.WCLONE | syscall.WALL) {
+	case 0:
+		wopts.NonCloneTasks = true
+	case syscall.WCLONE:
+		wopts.CloneTasks = true
+	case syscall.WALL:
+		wopts.NonCloneTasks = true
+		wopts.CloneTasks = true
+	default:
+		return 0, syscall.EINVAL
+	}
+	if options&syscall.WUNTRACED != 0 {
+		wopts.Events |= kernel.EventChildGroupStop
+	}
+	if options&syscall.WCONTINUED != 0 {
+		wopts.Events |= kernel.EventGroupContinue
+	}
+	if options&syscall.WNOHANG == 0 {
+		wopts.BlockInterruptErr = kernel.ERESTARTSYS
+	}
+
+	wr, err := t.Wait(&wopts)
+	if err != nil {
+		if err == kernel.ErrNoWaitableEvent {
+			return 0, nil
+		}
+		return 0, err
+	}
+	if statusAddr != 0 {
+		if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
+			return 0, err
+		}
+	}
+	if rusageAddr != 0 {
+		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
+		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+			return 0, err
+		}
+	}
+	return uintptr(wr.TID), nil
+}
+
+// Wait4 implements linux syscall wait4(2).
+func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := int(args[0].Int())
+	statusAddr := args[1].Pointer()
+	options := int(args[2].Uint())
+	rusageAddr := args[3].Pointer()
+
+	n, err := wait4(t, pid, statusAddr, options, rusageAddr)
+	return n, nil, err
+}
+
+// WaitPid implements linux syscall waitpid(2).
+func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := int(args[0].Int())
+	statusAddr := args[1].Pointer()
+	options := int(args[2].Uint())
+
+	n, err := wait4(t, pid, statusAddr, options, 0)
+	return n, nil, err
+}
+
+// Waitid implements linux syscall waitid(2).
+//
+// The arguments are, in order: idtype (_P_ALL, _P_PID or _P_PGID), id, the
+// user address of the siginfo structure (infop), the options and the user
+// address for the resource usage.
+//
+// EINVAL is returned for unknown option bits, when none of WEXITED, WSTOPPED
+// and WCONTINUED is given, or for an unknown idtype. When t.Wait reports
+// kernel.ErrNoWaitableEvent, a zeroed siginfo is written to infop (if
+// non-zero) and the call succeeds. Otherwise the resource usage (if
+// requested) and a SIGCHLD siginfo describing the event are copied out.
+func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	idtype := args[0].Int()
+	id := args[1].Int()
+	infop := args[2].Pointer()
+	options := int(args[3].Uint())
+	rusageAddr := args[4].Pointer()
+
+	if options&^(syscall.WNOHANG|syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED|syscall.WNOWAIT) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	// At least one kind of event must be requested.
+	if options&(syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED) == 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	wopts := kernel.WaitOptions{
+		NonCloneTasks: true,
+		Events:        kernel.EventTraceeStop,
+		// WNOWAIT leaves the event in place for a later wait call.
+		ConsumeEvent: options&syscall.WNOWAIT == 0,
+	}
+	switch idtype {
+	case _P_ALL:
+	case _P_PID:
+		wopts.SpecificTID = kernel.ThreadID(id)
+	case _P_PGID:
+		wopts.SpecificPGID = kernel.ProcessGroupID(id)
+	default:
+		return 0, nil, syscall.EINVAL
+	}
+	if options&syscall.WEXITED != 0 {
+		wopts.Events |= kernel.EventExit
+	}
+	if options&syscall.WSTOPPED != 0 {
+		wopts.Events |= kernel.EventChildGroupStop
+	}
+	if options&syscall.WCONTINUED != 0 {
+		wopts.Events |= kernel.EventGroupContinue
+	}
+	// Without WNOHANG the wait blocks and may be interrupted.
+	if options&syscall.WNOHANG == 0 {
+		wopts.BlockInterruptErr = kernel.ERESTARTSYS
+	}
+
+	wr, err := t.Wait(&wopts)
+	if err != nil {
+		if err == kernel.ErrNoWaitableEvent {
+			err = nil
+			// "If WNOHANG was specified in options and there were no children
+			// in a waitable state, then waitid() returns 0 immediately and the
+			// state of the siginfo_t structure pointed to by infop is
+			// unspecified." - waitid(2). But Linux's waitid actually zeroes
+			// out the fields it would set for a successful waitid in this case
+			// as well.
+			if infop != 0 {
+				var si arch.SignalInfo
+				_, err = t.CopyOut(infop, &si)
+			}
+		}
+		return 0, nil, err
+	}
+	if rusageAddr != 0 {
+		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
+		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+			return 0, nil, err
+		}
+	}
+	if infop == 0 {
+		return 0, nil, nil
+	}
+	si := arch.SignalInfo{
+		Signo: int32(syscall.SIGCHLD),
+	}
+	si.SetPid(int32(wr.TID))
+	si.SetUid(int32(wr.UID))
+	// TODO: convert kernel.ExitStatus to functions and make
+	// WaitResult.Status a linux.WaitStatus
+	s := syscall.WaitStatus(wr.Status)
+	switch {
+	case s.Exited():
+		si.Code = arch.CLD_EXITED
+		si.SetStatus(int32(s.ExitStatus()))
+	case s.CoreDump():
+		// CoreDump() implies Signaled(), so it must be tested first;
+		// otherwise CLD_DUMPED could never be reported.
+		si.Code = arch.CLD_DUMPED
+		si.SetStatus(int32(s.Signal()))
+	case s.Signaled():
+		si.Code = arch.CLD_KILLED
+		si.SetStatus(int32(s.Signal()))
+	case s.Stopped():
+		if wr.Event == kernel.EventTraceeStop {
+			si.Code = arch.CLD_TRAPPED
+			si.SetStatus(int32(s.TrapCause()))
+		} else {
+			si.Code = arch.CLD_STOPPED
+			si.SetStatus(int32(s.StopSignal()))
+		}
+	case s.Continued():
+		si.Code = arch.CLD_CONTINUED
+		si.SetStatus(int32(syscall.SIGCONT))
+	default:
+		t.Warningf("waitid got incomprehensible wait status %d", s)
+	}
+	_, err = t.CopyOut(infop, &si)
+	return 0, nil, err
+}
+
+// SetTidAddress implements linux syscall set_tid_address(2).
+//
+// args[0] is passed to t.SetClearTID. The call always succeeds and returns
+// the caller's thread ID.
+func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	t.SetClearTID(args[0].Pointer())
+
+	tid := t.ThreadID()
+	return uintptr(tid), nil, nil
+}
+
+// Unshare implements linux syscall unshare(2).
+//
+// Each CLONE_* flag present in args[0] sets the matching "New..." field of
+// kernel.SharingOptions, which is then handed to t.Unshare. Two flags imply
+// others:
+//
+// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
+//
+// "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
+// Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
+func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	// has reports whether the given flag is present in flags.
+	has := func(bit int32) bool { return flags&bit == bit }
+
+	newPIDNS := has(syscall.CLONE_NEWPID)
+	newUserNS := has(syscall.CLONE_NEWUSER)
+
+	opts := kernel.SharingOptions{
+		NewAddressSpace:     has(syscall.CLONE_VM),
+		NewSignalHandlers:   has(syscall.CLONE_SIGHAND),
+		NewThreadGroup:      has(syscall.CLONE_THREAD) || newPIDNS || newUserNS,
+		NewPIDNamespace:     newPIDNS,
+		NewUserNamespace:    newUserNS,
+		NewNetworkNamespace: has(syscall.CLONE_NEWNET),
+		NewFiles:            has(syscall.CLONE_FILES),
+		NewFSContext:        has(syscall.CLONE_FS) || newUserNS,
+		NewUTSNamespace:     has(syscall.CLONE_NEWUTS),
+		NewIPCNamespace:     has(syscall.CLONE_NEWIPC),
+	}
+	return 0, nil, t.Unshare(&opts)
+}
+
+// SchedYield implements linux syscall sched_yield(2). It calls t.Yield and always reports success; the arguments are ignored.
+func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	t.Yield()
+	return 0, nil, nil
+}
+
+// SchedSetaffinity implements linux syscall sched_setaffinity(2).
+func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := args[0].Int()
+	size := args[1].SizeT()
+	maskAddr := args[2].Pointer()
+
+	var task *kernel.Task
+	if tid == 0 {
+		task = t
+	} else {
+		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
+		if task == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
+	if size > mask.Size() {
+		size = mask.Size()
+	}
+	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, task.SetCPUMask(mask)
+}
+
+// SchedGetaffinity implements linux syscall sched_getaffinity(2).
+func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tid := args[0].Int()
+	size := args[1].SizeT()
+	maskAddr := args[2].Pointer()
+
+	// This limitation is because linux stores the cpumask
+	// in an array of "unsigned long" so the buffer needs to
+	// be a multiple of the word size.
+	if size&(t.Arch().Width()-1) > 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var task *kernel.Task
+	if tid == 0 {
+		task = t
+	} else {
+		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
+		if task == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	mask := task.CPUMask()
+	// The buffer needs to be big enough to hold a cpumask with
+	// all possible cpus.
+	if size < mask.Size() {
+		return 0, nil, syserror.EINVAL
+	}
+	_, err := t.CopyOutBytes(maskAddr, mask)
+
+	// NOTE: The syscall interface is slightly different than the glibc
+	// interface. The raw sched_getaffinity syscall returns the number of
+	// bytes used to represent a cpu mask.
+	return uintptr(mask.Size()), nil, err
+}
+
+// Getcpu implements linux syscall getcpu(2).
+//
+// args[0] and args[1] are optional user addresses for the CPU number and the
+// node number; a zero address skips that output. The CPU number is written as
+// a 4-byte value and the node is always reported as 0. The third syscall
+// argument is unused nowadays and is ignored.
+func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	cpuAddr, nodeAddr := args[0].Pointer(), args[1].Pointer()
+
+	if cpuAddr != 0 {
+		scratch := t.CopyScratchBuffer(4)
+		usermem.ByteOrder.PutUint32(scratch, uint32(t.CPU()))
+		if _, err := t.CopyOutBytes(cpuAddr, scratch); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Node 0 is always returned, so zeroing 4 bytes is sufficient.
+	if nodeAddr != 0 {
+		opts := usermem.IOOpts{
+			AddressSpaceActive: true,
+		}
+		if _, err := t.MemoryManager().ZeroOut(t, nodeAddr, 4, opts); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// Setpgid implements the linux syscall setpgid(2).
+//
+// args[0] is the pid of the target process (0 selects the caller) and args[1]
+// the process group ID to move it to (0 selects the target's own ID). Both
+// are interpreted in t's PID namespace, not in the target's namespace, which
+// may be different.
+//
+// ESRCH is returned when the pid does not exist or is neither the caller's
+// thread group nor one of its children; EINVAL when the pid is not a thread
+// group leader or the pgid is negative.
+func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	pgid := kernel.ProcessGroupID(args[1].Int())
+
+	ns := t.PIDNamespace()
+	callerTG := t.ThreadGroup()
+
+	// "If pid is zero, then the process ID of the calling process is used."
+	tg := callerTG
+	if pid != 0 {
+		target := ns.TaskWithID(pid)
+		if target == nil {
+			return 0, nil, syserror.ESRCH
+		}
+		tg = target.ThreadGroup()
+		if tg.Leader() != target {
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Setpgid only operates on the caller's own threadgroup or on
+		// child threadgroups.
+		if tg != callerTG {
+			if tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != callerTG {
+				return 0, nil, syserror.ESRCH
+			}
+		}
+	}
+
+	// "If pgid is zero, then the PGID of the process specified by pid is made
+	// the same as its process ID."
+	defaultPGID := kernel.ProcessGroupID(ns.IDOfThreadGroup(tg))
+	switch {
+	case pgid == 0:
+		pgid = defaultPGID
+	case pgid < 0:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// currentPGID reports the process group tg is in right now.
+	currentPGID := func() kernel.ProcessGroupID {
+		return ns.IDOfProcessGroup(tg.ProcessGroup())
+	}
+
+	// A pgid equal to the group's own ID means "create a new process group";
+	// anything else is an attempt to join an existing one. In both cases a
+	// failure is forgiven when the group is already where it was asked to
+	// be, which is the same behavior as Linux.
+	if pgid == defaultPGID {
+		// For convenience, errors line up with Linux syscall API.
+		if err := tg.CreateProcessGroup(); err != nil {
+			if currentPGID() == defaultPGID {
+				return 0, nil, nil
+			}
+			return 0, nil, err
+		}
+		return 0, nil, nil
+	}
+
+	if err := tg.JoinProcessGroup(ns, pgid, tg != callerTG); err != nil {
+		if currentPGID() == pgid {
+			return 0, nil, nil
+		}
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// Getpgrp implements the linux syscall getpgrp(2).
+//
+// It returns the ID of the caller's process group as seen from the caller's
+// PID namespace.
+func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ns := t.PIDNamespace()
+	group := t.ThreadGroup().ProcessGroup()
+	id := ns.IDOfProcessGroup(group)
+	return uintptr(id), nil, nil
+}
+
+// Getpgid implements the linux syscall getpgid(2).
+//
+// A pid of zero is equivalent to getpgrp(2). Otherwise the process group ID
+// of the named task is returned, or ESRCH if no such task is visible in the
+// caller's PID namespace.
+func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	if pid == 0 {
+		return Getpgrp(t, args)
+	}
+
+	ns := t.PIDNamespace()
+	other := ns.TaskWithID(pid)
+	if other == nil {
+		return 0, nil, syserror.ESRCH
+	}
+
+	group := other.ThreadGroup().ProcessGroup()
+	return uintptr(ns.IDOfProcessGroup(group)), nil, nil
+}
+
+// Setsid implements the linux syscall setsid(2).
+//
+// The syscall's result is always 0; any failure from creating the session is
+// reported through the returned error.
+func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	err := t.ThreadGroup().CreateSession()
+	return 0, nil, err
+}
+
+// Getsid implements the linux syscall getsid(2).
+//
+// A pid of zero selects the caller. Otherwise the session ID of the named
+// task is returned, or ESRCH if no such task is visible in the caller's PID
+// namespace.
+func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pid := kernel.ThreadID(args[0].Int())
+	ns := t.PIDNamespace()
+
+	subject := t
+	if pid != 0 {
+		subject = ns.TaskWithID(pid)
+		if subject == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	session := subject.ThreadGroup().Session()
+	return uintptr(ns.IDOfSession(session)), nil, nil
+}
+
+// Getpriority pretends to implement the linux syscall getpriority(2).
+//
+// This is a stub; real priorities require a full scheduler.
+func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := args[0].Int()
+	who := kernel.ThreadID(args[1].Int())
+
+	switch which {
+	case syscall.PRIO_PROCESS:
+		// Look for who, return ESRCH if not found.
+		var task *kernel.Task
+		if who == 0 {
+			task = t
+		} else {
+			task = t.PIDNamespace().TaskWithID(who)
+		}
+
+		if task == nil {
+			return 0, nil, syscall.ESRCH
+		}
+
+		// From kernel/sys.c:getpriority:
+		// "To avoid negative return values, 'getpriority()'
+		// will not return the normal nice-value, but a negated
+		// value that has been offset by 20"
+		return uintptr(20 - task.Niceness()), nil, nil
+	case syscall.PRIO_USER:
+		fallthrough
+	case syscall.PRIO_PGRP:
+		// PRIO_USER and PRIO_PGRP have no further implementation yet.
+		return 0, nil, nil
+	default:
+		return 0, nil, syscall.EINVAL
+	}
+}
+
+// Setpriority pretends to implement the linux syscall setpriority(2).
+//
+// This is a stub; real priorities require a full scheduler.
+func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := args[0].Int()
+	who := kernel.ThreadID(args[1].Int())
+	niceval := int(args[2].Int())
+
+	// In the kernel's implementation, values outside the range
+	// of [-20, 19] are truncated to these minimum and maximum
+	// values.
+	if niceval < -20 /* min niceval */ {
+		niceval = -20
+	} else if niceval > 19 /* max niceval */ {
+		niceval = 19
+	}
+
+	switch which {
+	case syscall.PRIO_PROCESS:
+		// Look for who, return ESRCH if not found.
+		var task *kernel.Task
+		if who == 0 {
+			task = t
+		} else {
+			task = t.PIDNamespace().TaskWithID(who)
+		}
+
+		if task == nil {
+			return 0, nil, syscall.ESRCH
+		}
+
+		task.SetNiceness(niceval)
+	case syscall.PRIO_USER:
+		fallthrough
+	case syscall.PRIO_PGRP:
+		// PRIO_USER and PRIO_PGRP have no further implementation yet.
+		return 0, nil, nil
+	default:
+		return 0, nil, syscall.EINVAL
+	}
+
+	return 0, nil, nil
+}
+
+// Ptrace implements linux system call ptrace(2).
+func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	req := args[0].Int64()
+	pid := kernel.ThreadID(args[1].Int())
+	addr := args[2].Pointer()
+	data := args[3].Pointer()
+
+	return 0, nil, t.Ptrace(req, pid, addr, data)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
new file mode 100644
index 000000000..dcee694b2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -0,0 +1,338 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// pidOfClockID extracts the pid (or file descriptor) encoded in a dynamic
+// clock id: the most significant 29 bits hold its bitwise complement.
+func pidOfClockID(c int32) kernel.ThreadID {
+	encoded := c >> 3
+	return kernel.ThreadID(^encoded)
+}
+
+// whichCPUClock masks out the clock-type bits of a clock id, yielding one of
+// CPUCLOCK_PERF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or CLOCK_FD.
+func whichCPUClock(c int32) int32 {
+	kind := c & linux.CPUCLOCK_CLOCK_MASK
+	return kind
+}
+
+// isCPUClockPerThread reports whether the CPUCLOCK_PERTHREAD bit is set in
+// the clock id.
+func isCPUClockPerThread(c int32) bool {
+	perThreadBit := c & linux.CPUCLOCK_PERTHREAD_MASK
+	return perThreadBit != 0
+}
+
+// isValidCPUClock reports whether the cpu clock id is valid.
+//
+// An id is rejected when bits 0, 1 and 2 are all set, or when its clock type
+// is not below CPUCLOCK_MAX.
+func isValidCPUClock(c int32) bool {
+	return c&7 != 7 && whichCPUClock(c) < linux.CPUCLOCK_MAX
+}
+
+// targetTask resolves the task a clock id refers to.
+//
+// A pid of zero in the clock id means the calling task t; any other pid is
+// looked up in t's PID namespace, and the result of that lookup is returned
+// as-is.
+func targetTask(t *kernel.Task, c int32) *kernel.Task {
+	if pid := pidOfClockID(c); pid != 0 {
+		return t.PIDNamespace().TaskWithID(pid)
+	}
+	return t
+}
+
+// ClockGetres implements linux syscall clock_getres(2).
+//
+// Every clock accepted by getClock reports a resolution of one nanosecond.
+// Any getClock failure is reported as EINVAL. A NULL result pointer is
+// allowed and simply skips the copy-out.
+func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := int32(args[0].Int())
+	resAddr := args[1].Pointer()
+
+	// The clock itself is not needed, only confirmation that it exists.
+	if _, err := getClock(t, clockID); err != nil {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if resAddr == 0 {
+		// Don't need to copy out.
+		return 0, nil, nil
+	}
+
+	resolution := linux.Timespec{Nsec: 1}
+	return 0, nil, copyTimespecOut(t, resAddr, &resolution)
+}
+
+// cpuClocker is the pair of CPU clock accessors getClock needs from the
+// target of a dynamic CPU clock id. getClock assigns either a task or its
+// thread group to a value of this type.
+type cpuClocker interface {
+	// UserCPUClock is the clock getClock returns for CPUCLOCK_VIRT.
+	UserCPUClock() ktime.Clock
+	// CPUClock is the clock getClock returns for CPUCLOCK_PROF and
+	// CPUCLOCK_SCHED.
+	CPUClock() ktime.Clock
+}
+
+// getClock maps a clock id to the clock that backs it.
+//
+// Non-negative ids name the fixed system clocks. Negative ids are dynamic CPU
+// clock ids that encode a target pid, a per-thread flag and a clock type.
+// Any id that cannot be resolved yields EINVAL.
+func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) {
+	if clockID >= 0 {
+		switch clockID {
+		case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE:
+			return t.Kernel().RealtimeClock(), nil
+		case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE, linux.CLOCK_MONOTONIC_RAW:
+			// CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW.
+			return t.Kernel().MonotonicClock(), nil
+		case linux.CLOCK_PROCESS_CPUTIME_ID:
+			return t.ThreadGroup().CPUClock(), nil
+		case linux.CLOCK_THREAD_CPUTIME_ID:
+			return t.CPUClock(), nil
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+
+	// Dynamic CPU clock id.
+	if !isValidCPUClock(clockID) {
+		return nil, syserror.EINVAL
+	}
+
+	task := targetTask(t, clockID)
+	if task == nil {
+		return nil, syserror.EINVAL
+	}
+
+	// Per-thread ids measure the task itself; otherwise its thread group.
+	var source cpuClocker
+	if isCPUClockPerThread(clockID) {
+		source = task
+	} else {
+		source = task.ThreadGroup()
+	}
+
+	kind := whichCPUClock(clockID)
+	if kind == linux.CPUCLOCK_VIRT {
+		return source.UserCPUClock(), nil
+	}
+	if kind == linux.CPUCLOCK_PROF || kind == linux.CPUCLOCK_SCHED {
+		// CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF.
+		return source.CPUClock(), nil
+	}
+	return nil, syserror.EINVAL
+}
+
+// ClockGettime implements linux syscall clock_gettime(2).
+//
+// The clock named by the first argument is read and its current time is
+// copied out as a timespec to the address in the second argument.
+func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := int32(args[0].Int())
+	dst := args[1].Pointer()
+
+	clock, err := getClock(t, id)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	now := clock.Now().Timespec()
+	err = copyTimespecOut(t, dst, &now)
+	return 0, nil, err
+}
+
+// ClockSettime implements linux syscall clock_settime(2).
+//
+// Both parameters are ignored and the call always fails with EPERM.
+func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.EPERM
+}
+
+// Time implements linux syscall time(2).
+//
+// The current realtime clock value is returned as the syscall result and, if
+// the pointer argument is non-NULL, also copied out to it.
+func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dst := args[0].Pointer()
+	now := t.Kernel().RealtimeClock().Now().TimeT()
+
+	if dst != usermem.Addr(0) {
+		if _, err := t.CopyOut(dst, now); err != nil {
+			return 0, nil, err
+		}
+	}
+	return uintptr(now), nil, nil
+}
+
+// clockNanosleepRestartBlock encapsulates the state required to restart
+// clock_nanosleep(2) via restart_syscall(2).
+//
+// clockNanosleepFor installs one of these when a relative sleep is
+// interrupted; Restart then resumes the sleep with the saved values.
+type clockNanosleepRestartBlock struct {
+	// c is the clock the sleep is measured against.
+	c        ktime.Clock
+	// duration is the time that was still left to sleep when the block was
+	// created.
+	duration time.Duration
+	// rem is the user address that receives the remaining time, or 0.
+	rem      usermem.Addr
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+//
+// It resumes the sleep for the saved remaining duration on the saved clock.
+func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	err := clockNanosleepFor(t, n.c, n.duration, n.rem)
+	return 0, err
+}
+
+// clockNanosleepUntil blocks until clock c reaches the absolute time ts.
+//
+// If blocking is interrupted, the syscall is restarted with the original
+// arguments.
+func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error {
+	notifier, expired := ktime.NewChannelNotifier()
+	timer := ktime.NewTimer(c, notifier)
+
+	// Arm a one-shot timer that fires at the requested time.
+	deadline := ktime.Setting{
+		Enabled: true,
+		Next:    ktime.FromTimespec(ts),
+		Period:  0,
+	}
+	timer.Swap(deadline)
+
+	err := t.BlockWithTimer(nil, expired)
+
+	timer.Destroy()
+
+	// Anything other than the timer firing is converted so that an
+	// interruption restarts the syscall.
+	if err != syserror.ETIMEDOUT {
+		return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND)
+	}
+
+	// The deadline was reached.
+	return nil
+}
+
+// clockNanosleepFor blocks for dur as measured by clock c.
+//
+// If blocking ends with an error and rem is non-NULL, the time still left to
+// sleep (zero after a full sleep) is copied out to rem. If blocking is
+// interrupted, a restart block is installed so the syscall is restarted with
+// the remaining duration.
+func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem usermem.Addr) error {
+	timer, start, expired := ktime.After(c, dur)
+
+	err := t.BlockWithTimer(nil, expired)
+
+	end := c.Now()
+
+	timer.Destroy()
+
+	// ETIMEDOUT means the whole duration elapsed.
+	sleptFully := err == syserror.ETIMEDOUT
+
+	// Work out how much of the requested sleep is still outstanding,
+	// never reporting a negative amount.
+	var left time.Duration
+	if !sleptFully {
+		if left = dur - end.Sub(start); left < 0 {
+			left = time.Duration(0)
+		}
+	}
+
+	// Copy out remaining time.
+	if err != nil && rem != usermem.Addr(0) {
+		leftTS := linux.NsecToTimespec(left.Nanoseconds())
+		if copyErr := copyTimespecOut(t, rem, &leftTS); copyErr != nil {
+			return copyErr
+		}
+	}
+
+	switch {
+	case sleptFully:
+		return nil
+	case err == syserror.ErrInterrupted:
+		// Arrange for a restart with the remaining duration.
+		t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{
+			c:        c,
+			duration: left,
+			rem:      rem,
+		})
+		return kernel.ERESTART_RESTARTBLOCK
+	default:
+		return err
+	}
+}
+
+// Nanosleep implements linux syscall nanosleep(2).
+//
+// The requested timespec is read from the first argument and must be valid
+// (EINVAL otherwise). The sleep is measured on the monotonic clock, and the
+// second argument is the address for any remaining time.
+func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	reqAddr := args[0].Pointer()
+	remAddr := args[1].Pointer()
+
+	req, err := copyTimespecIn(t, reqAddr)
+	switch {
+	case err != nil:
+		return 0, nil, err
+	case !req.Valid():
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Just like linux, we cap the timeout with the max number that int64 can
+	// represent which is roughly 292 years.
+	sleep := time.Duration(req.ToNsecCapped()) * time.Nanosecond
+	clock := t.Kernel().MonotonicClock()
+	return 0, nil, clockNanosleepFor(t, clock, sleep, remAddr)
+}
+
+// ClockNanosleep implements linux syscall clock_nanosleep(2).
+//
+// Arguments are (clock id, flags, request address, remainder address). With
+// TIMER_ABSTIME set the request is an absolute time on the chosen clock;
+// otherwise it is a relative duration.
+func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := int32(args[0].Int())
+	flags := args[1].Int()
+	reqAddr := args[2].Pointer()
+	remAddr := args[3].Pointer()
+
+	req, err := copyTimespecIn(t, reqAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	if !req.Valid() {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Only allow clock constants also allowed by Linux.
+	if clockID > 0 &&
+		clockID != linux.CLOCK_REALTIME &&
+		clockID != linux.CLOCK_MONOTONIC &&
+		clockID != linux.CLOCK_PROCESS_CPUTIME_ID {
+		return 0, nil, syserror.EINVAL
+	}
+
+	clock, err := getClock(t, clockID)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	absolute := flags&linux.TIMER_ABSTIME != 0
+	if !absolute {
+		sleep := time.Duration(req.ToNsecCapped()) * time.Nanosecond
+		return 0, nil, clockNanosleepFor(t, clock, sleep, remAddr)
+	}
+	return 0, nil, clockNanosleepUntil(t, clock, req)
+}
+
+// Gettimeofday implements linux syscall gettimeofday(2).
+//
+// Either pointer may be NULL. A non-NULL tv receives the current realtime
+// clock value; a non-NULL tz receives a two-int32 timezone record.
+func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	tvAddr := args[0].Pointer()
+	tzAddr := args[1].Pointer()
+
+	if tvAddr != usermem.Addr(0) {
+		now := t.Kernel().RealtimeClock().Now().Timeval()
+		err := copyTimevalOut(t, tvAddr, &now)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if tzAddr == usermem.Addr(0) {
+		return 0, nil, nil
+	}
+
+	// Ask the time package for the timezone.
+	_, offset := time.Now().Zone()
+	// This int32 array mimics linux's struct timezone: the first element
+	// is the negated zone offset divided by 60, the second is always 0.
+	tzRecord := [2]int32{-int32(offset) / 60, 0}
+	_, err := t.CopyOut(tzAddr, tzRecord)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
new file mode 100644
index 000000000..4ed077626
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -0,0 +1,168 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// ItimerType denotes the type of interval timer, i.e. the "which" argument of
+// getitimer(2) and setitimer(2).
+type ItimerType int
+
+// Interval timer types from <sys/time.h>. The values are assigned by iota,
+// starting at 0.
+const (
+	// ItimerReal corresponds to ITIMER_REAL.
+	ItimerReal ItimerType = iota
+	// ItimerVirtual corresponds to ITIMER_VIRTUAL.
+	ItimerVirtual
+	// ItimerProf corresponds to ITIMER_PROF.
+	ItimerProf
+)
+
+// nsecPerSec is the number of nanoseconds in one second.
+const nsecPerSec = int64(time.Second)
+
+// copyItimerValIn copies an ItimerVal from the untrusted app range to the
+// kernel.
+//
+// A NULL address is allowed because Linux allows
+// setitimer(which, NULL, &old_value), which disables the timer; in that case
+// a zero ItimerVal is returned. There is a KERN_WARN message saying this
+// misfeature will be removed. However, that hasn't happened as of 3.19, so we
+// continue to support it.
+//
+// Only 8-byte-wide architectures are supported; any other width yields
+// ENOSYS.
+func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) {
+	var itv linux.ItimerVal
+
+	if addr == usermem.Addr(0) {
+		return itv, nil
+	}
+
+	if t.Arch().Width() != 8 {
+		return linux.ItimerVal{}, syscall.ENOSYS
+	}
+
+	// Native size, just copy directly.
+	if _, err := t.CopyIn(addr, &itv); err != nil {
+		return linux.ItimerVal{}, err
+	}
+	return itv, nil
+}
+
+// copyItimerValOut copies an ItimerVal to the untrusted app range.
+//
+// A NULL address is allowed, in which case no copy takes place. Only
+// 8-byte-wide architectures are supported; any other width yields ENOSYS.
+func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error {
+	if addr == usermem.Addr(0) {
+		return nil
+	}
+
+	if t.Arch().Width() != 8 {
+		return syscall.ENOSYS
+	}
+
+	// Native size, just copy directly.
+	_, err := t.CopyOut(addr, itv)
+	return err
+}
+
+// findTimer returns the calling thread group's interval timer of type w, or
+// EINVAL if w is not one of the known ItimerType values.
+func findTimer(t *kernel.Task, w ItimerType) (*ktime.Timer, error) {
+	if w == ItimerReal {
+		return t.ThreadGroup().Timer().RealTimer, nil
+	}
+	if w == ItimerVirtual {
+		return t.ThreadGroup().Timer().VirtualTimer, nil
+	}
+	if w == ItimerProf {
+		return t.ThreadGroup().Timer().ProfTimer, nil
+	}
+	return nil, syscall.EINVAL
+}
+
+// Getitimer implements linux syscall getitimer(2).
+//
+// The current setting of the selected interval timer is converted to an
+// ItimerVal and copied out to the address in the second argument.
+func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := ItimerType(args[0].Int())
+	dst := args[1].Pointer()
+
+	timer, err := findTimer(t, which)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	remaining, period := ktime.SpecFromSetting(timer.Get())
+	current := linux.ItimerVal{
+		Value:    linux.DurationToTimeval(remaining),
+		Interval: linux.DurationToTimeval(period),
+	}
+
+	err = copyItimerValOut(t, dst, &current)
+	return 0, nil, err
+}
+
+// Setitimer implements linux syscall setitimer(2).
+//
+// The selected interval timer is re-armed from the ItimerVal at the second
+// argument, and its previous setting is copied out to the third argument.
+func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	which := ItimerType(args[0].Int())
+	newAddr := args[1].Pointer()
+	oldAddr := args[2].Pointer()
+
+	timer, err := findTimer(t, which)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	requested, err := copyItimerValIn(t, newAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Just like linux, we cap the timer value and interval with the max
+	// number that int64 can represent which is roughly 292 years.
+	initial := requested.Value.ToDuration()
+	period := requested.Interval.ToDuration()
+	setting, err := ktime.SettingFromSpec(initial, period, timer.Clock())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Swap installs the new setting and hands back the one it replaced.
+	prevValue, prevInterval := ktime.SpecFromSetting(timer.Swap(setting))
+	previous := linux.ItimerVal{
+		Value:    linux.DurationToTimeval(prevValue),
+		Interval: linux.DurationToTimeval(prevInterval),
+	}
+
+	err = copyItimerValOut(t, oldAddr, &previous)
+	return 0, nil, err
+}
+
+// Alarm implements linux syscall alarm(2).
+//
+// The thread group's real interval timer is re-armed as a one-shot timer for
+// the requested number of seconds. The return value is the time that was left
+// on the previous alarm, rounded to whole seconds.
+func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	requested := time.Duration(args[0].Uint()) * time.Second
+
+	realTimer := t.ThreadGroup().Timer().RealTimer
+	setting, err := ktime.SettingFromSpec(requested, 0, realTimer.Clock())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	previous, _ := ktime.SpecFromSetting(realTimer.Swap(setting))
+	wholeSecs := int64(previous) / nsecPerSec
+	fracNsecs := int64(previous) % nsecPerSec
+
+	// Round to the nearest second, but never report 0 while an alarm was
+	// still pending.
+	pendingUnderOneSec := wholeSecs == 0 && fracNsecs > 0
+	atLeastHalf := fracNsecs >= nsecPerSec/2
+	if pendingUnderOneSec || atLeastHalf {
+		wholeSecs++
+	}
+
+	return uintptr(wholeSecs), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
new file mode 100644
index 000000000..cb81d42b9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+//
+// Only CLOCK_REALTIME and CLOCK_MONOTONIC are accepted, and only the
+// TFD_CLOEXEC and TFD_NONBLOCK flags; anything else yields EINVAL. On success
+// the new file descriptor is returned.
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := args[0].Int()
+	flags := args[1].Int()
+
+	if unknown := flags &^ (linux.TFD_CLOEXEC | linux.TFD_NONBLOCK); unknown != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var clock ktime.Clock
+	if clockID == linux.CLOCK_REALTIME {
+		clock = t.Kernel().RealtimeClock()
+	} else if clockID == linux.CLOCK_MONOTONIC {
+		clock = t.Kernel().MonotonicClock()
+	} else {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := timerfd.NewFile(t, clock)
+	// Drop this function's reference on return, on success and failure
+	// alike.
+	defer file.DecRef()
+	file.SetFlags(fs.SettableFileFlags{
+		NonBlocking: flags&linux.TFD_NONBLOCK != 0,
+	})
+
+	fdFlags := kernel.FDFlags{
+		CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+	}
+	fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+//
+// Arguments are (fd, flags, new value address, old value address). The only
+// accepted flag is TFD_TIMER_ABSTIME. The fd must refer to a timerfd file:
+// EBADF is returned if it is not open and EINVAL if it is not a timerfd. A
+// NULL old value address skips the copy-out of the previous setting.
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	flags := args[1].Int()
+	newAddr := args[2].Pointer()
+	oldAddr := args[3].Pointer()
+
+	if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	timerOps, isTimerfd := file.FileOperations.(*timerfd.TimerOperations)
+	if !isTimerfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var requested linux.Itimerspec
+	if _, err := t.CopyIn(newAddr, &requested); err != nil {
+		return 0, nil, err
+	}
+
+	// An absolute request carries a point in time; a relative one carries
+	// a duration measured on the timer's own clock.
+	var (
+		setting ktime.Setting
+		err     error
+	)
+	if absolute := flags&linux.TFD_TIMER_ABSTIME != 0; absolute {
+		setting, err = ktime.SettingFromAbsSpec(ktime.FromTimespec(requested.Value),
+			requested.Interval.ToDuration())
+	} else {
+		setting, err = ktime.SettingFromSpec(requested.Value.ToDuration(),
+			requested.Interval.ToDuration(), timerOps.Clock())
+	}
+	if err != nil {
+		return 0, nil, err
+	}
+
+	prevValue, prevInterval := ktime.SpecFromSetting(timerOps.SetTime(setting))
+	if oldAddr == 0 {
+		return 0, nil, nil
+	}
+
+	previous := linux.Itimerspec{
+		Interval: linux.DurationToTimespec(prevInterval),
+		Value:    linux.DurationToTimespec(prevValue),
+	}
+	_, err = t.CopyOut(oldAddr, &previous)
+	return 0, nil, err
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+//
+// The fd must refer to a timerfd file: EBADF is returned if it is not open
+// and EINVAL if it is not a timerfd. The timer's current setting is copied
+// out as an Itimerspec to the address in the second argument.
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	dst := args[1].Pointer()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	timerOps, isTimerfd := file.FileOperations.(*timerfd.TimerOperations)
+	if !isTimerfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	remaining, period := ktime.SpecFromSetting(timerOps.GetTime())
+	current := linux.Itimerspec{
+		Interval: linux.DurationToTimespec(period),
+		Value:    linux.DurationToTimespec(remaining),
+	}
+
+	_, err := t.CopyOut(dst, &current)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go
new file mode 100644
index 000000000..1047364b3
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_tls.go
@@ -0,0 +1,48 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build amd64
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// ArchPrctl implements linux syscall arch_prctl(2).
+// It sets or reads architecture-specific thread state for t.
+//
+// Supported codes are ARCH_GET_FS, which copies the FS base out to the
+// address in the second argument, and ARCH_SET_FS, which stores the second
+// argument as the new FS base and clears the FS selector. Any other code
+// yields EINVAL.
+func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	code := args[0].Int()
+
+	if code == linux.ARCH_SET_FS {
+		regs := &t.Arch().StateData().Regs
+		regs.Fs = 0
+		regs.Fs_base = args[1].Uint64()
+		return 0, nil, nil
+	}
+
+	if code != linux.ARCH_GET_FS {
+		return 0, nil, syscall.EINVAL
+	}
+
+	dst := args[1].Pointer()
+	if _, err := t.CopyOut(dst, &t.Arch().StateData().Regs.Fs_base); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
new file mode 100644
index 000000000..899116374
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -0,0 +1,89 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Uname implements linux syscall uname.
+//
+// The reply combines the syscall table's version strings with the host and
+// domain names of the task's UTS namespace, and is copied out to the address
+// in the first argument.
+func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dst := args[0].Pointer()
+	ver := t.SyscallTable().Version
+	utsns := t.UTSNamespace()
+
+	// Fill in structure fields. copy truncates anything longer than the
+	// destination array.
+	var reply linux.UtsName
+	copy(reply.Sysname[:], ver.Sysname)
+	copy(reply.Nodename[:], utsns.HostName())
+	copy(reply.Release[:], ver.Release)
+	copy(reply.Version[:], ver.Version)
+	copy(reply.Machine[:], "x86_64") // +build tag above.
+	copy(reply.Domainname[:], utsns.DomainName())
+
+	// Copy out the result.
+	_, err := t.CopyOut(dst, reply)
+	return 0, nil, err
+}
+
+// Setdomainname implements Linux syscall setdomainname.
+//
+// args[0] is the address of the new domain name and args[1] its length in
+// bytes. The caller needs CAP_SYS_ADMIN in the user namespace that owns its
+// UTS namespace (EPERM otherwise), and the length must lie within
+// [0, linux.UTSLen] (EINVAL otherwise).
+//
+// Exactly that many bytes are read from user memory. As in Linux, the name
+// is described by its length and does not need a terminating NUL.
+func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nameAddr := args[0].Pointer()
+	size := args[1].Int()
+
+	utsns := t.UTSNamespace()
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+	if size < 0 || size > linux.UTSLen {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Read a fixed-length byte range rather than a NUL-terminated string:
+	// callers commonly pass strlen(name), so there may be no terminator
+	// within the first size bytes.
+	name := make([]byte, size)
+	if _, err := t.CopyInBytes(nameAddr, name); err != nil {
+		return 0, nil, err
+	}
+
+	utsns.SetDomainName(string(name))
+	return 0, nil, nil
+}
+
+// Sethostname implements Linux syscall sethostname.
+//
+// args[0] is the address of the new host name and args[1] its length in
+// bytes. The caller needs CAP_SYS_ADMIN in the user namespace that owns its
+// UTS namespace (EPERM otherwise), and the length must lie within
+// [0, linux.UTSLen] (EINVAL otherwise).
+//
+// Exactly that many bytes are read from user memory. As in Linux, the name
+// is described by its length and does not need a terminating NUL.
+func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nameAddr := args[0].Pointer()
+	size := args[1].Int()
+
+	utsns := t.UTSNamespace()
+	if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) {
+		return 0, nil, syserror.EPERM
+	}
+	if size < 0 || size > linux.UTSLen {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Read a fixed-length byte range rather than a NUL-terminated string:
+	// callers commonly pass strlen(name), so there may be no terminator
+	// within the first size bytes.
+	name := make([]byte, size)
+	if _, err := t.CopyInBytes(nameAddr, name); err != nil {
+		return 0, nil, err
+	}
+
+	utsns.SetHostName(string(name))
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
new file mode 100644
index 000000000..caa7b01ea
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -0,0 +1,274 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+const (
+	// EventMaskWrite contains events that can be triggered on writes.
+	//
+	// It is the mask that the blocking write helpers in this file (writev
+	// and pwritev) register for while waiting to retry a write that would
+	// block.
+	//
+	// Note that EventHUp is not going to happen for pipes but may for
+	// implementations of poll on some sockets, see net/core/datagram.c.
+	EventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Write implements the Linux write(2) syscall.
+//
+// Arguments are (fd, buffer address, byte count). Errors are checked in this
+// order: EBADF if fd does not name an open file, EBADF if the file was not
+// opened for writing, EINVAL if the count is negative once converted to int,
+// then any error from building the I/O sequence. The number of bytes written
+// is accounted to the task's I/O usage and returned; the final error is
+// produced by handleIOError.
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	buf := args[1].Pointer()
+	count := int(args[2].SizeT())
+
+	f := t.FDMap().GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	switch {
+	case !f.Flags().Write:
+		// The descriptor must have been opened for writing.
+		return 0, nil, syserror.EBADF
+	case count < 0:
+		// The requested size does not fit in a non-negative int.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user buffer that is the source of the write.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.SingleIOSequence(buf, count, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	written, err := writev(t, f, src)
+	t.IOUsage().AccountWriteSyscall(written)
+	return uintptr(written), nil, handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "write", f)
+}
+
+// Pwrite64 implements the Linux pwrite64(2) syscall.
+//
+// Arguments are (fd, buffer address, byte count, file offset). Errors are
+// checked in this order: EBADF if fd does not name an open file, EINVAL for a
+// negative offset, ESPIPE if the file does not support writing at an offset,
+// EBADF if the file was not opened for writing, EINVAL if the count is
+// negative once converted to int, then any error from building the I/O
+// sequence. The number of bytes written is accounted to the task's I/O usage
+// and returned; the final error is produced by handleIOError.
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	buf := args[1].Pointer()
+	count := int(args[2].SizeT())
+	off := args[3].Int64()
+
+	f := t.FDMap().GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// The order of these cases fixes which error wins when several apply.
+	switch {
+	case off < 0:
+		return 0, nil, syserror.EINVAL
+	case !f.Flags().Pwrite:
+		// Writing at an offset is not supported by this file.
+		return 0, nil, syserror.ESPIPE
+	case !f.Flags().Write:
+		return 0, nil, syserror.EBADF
+	case count < 0:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Describe the user buffer that is the source of the write.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.SingleIOSequence(buf, count, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	written, err := pwritev(t, f, src, off)
+	t.IOUsage().AccountWriteSyscall(written)
+	return uintptr(written), nil, handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "pwrite64", f)
+}
+
+// Writev implements the Linux writev(2) syscall.
+//
+// Arguments are (fd, address of the iovec array, iovec count). Returns EBADF
+// if fd does not name an open file or the file was not opened for writing,
+// and otherwise any error from reading the iovecs. The number of bytes
+// written is accounted to the task's I/O usage and returned; the final error
+// is produced by handleIOError.
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+
+	f := t.FDMap().GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// The descriptor must have been opened for writing.
+	if writable := f.Flags().Write; !writable {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Build the source sequence from the user-supplied iovec array.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.IovecsIOSequence(iovAddr, iovCount, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	written, err := writev(t, f, src)
+	t.IOUsage().AccountWriteSyscall(written)
+	return uintptr(written), nil, handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "writev", f)
+}
+
+// Pwritev implements the Linux pwritev(2) syscall.
+//
+// Arguments are (fd, address of the iovec array, iovec count, file offset).
+// Errors are checked in this order: EBADF if fd does not name an open file,
+// EINVAL for a negative offset, ESPIPE if the file does not support writing
+// at an offset, EBADF if the file was not opened for writing, then any error
+// from reading the iovecs. The number of bytes written is accounted to the
+// task's I/O usage and returned; the final error is produced by
+// handleIOError.
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	iovAddr := args[1].Pointer()
+	iovCount := int(args[2].Int())
+	off := args[3].Int64()
+
+	f := t.FDMap().GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	// The order of these cases fixes which error wins when several apply.
+	switch {
+	case off < 0:
+		return 0, nil, syserror.EINVAL
+	case !f.Flags().Pwrite:
+		// Writing at an offset is not supported by this file.
+		return 0, nil, syserror.ESPIPE
+	case !f.Flags().Write:
+		return 0, nil, syserror.EBADF
+	}
+
+	// Build the source sequence from the user-supplied iovec array.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	src, err := t.IovecsIOSequence(iovAddr, iovCount, opts)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	written, err := pwritev(t, f, src, off)
+	t.IOUsage().AccountWriteSyscall(written)
+	return uintptr(written), nil, handleIOError(t, written != 0, err, kernel.ERESTARTSYS, "pwritev", f)
+}
+
+// writev writes src to f through f.Writev, waiting for the file to become
+// writable when the first attempt would block and f is not non-blocking.
+//
+// It returns the total number of bytes written across all attempts together
+// with the error of the last step performed (the last Writev call, or the
+// wait if that failed). Retrying stops as soon as a Writev call completes with
+// anything other than syserror.ErrWouldBlock, including success, so a nil
+// error does not by itself mean that all of src was written. An IN_MODIFY
+// inotify event is queued on f.Dirent whenever at least one byte was written.
+func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) {
+	n, err := f.Writev(t, src)
+	// Fast path: the first attempt finished (successfully or with a real
+	// error), or the file is non-blocking and the result, ErrWouldBlock
+	// included, goes straight back to the caller.
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		if n > 0 {
+			// Queue notification if we wrote anything.
+			f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&w, EventMaskWrite)
+
+	// total accumulates bytes written by the first attempt and every retry.
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst64(n)
+
+		// Issue the request and break out if it completes with
+		// anything other than "would block".
+		// The first retry is issued before blocking; presumably so that
+		// readiness arriving between the initial attempt and the
+		// registration above is not missed.
+		n, err = f.Writev(t, src)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		// If the wait itself fails, its error is what gets returned.
+		if err = t.Block(ch); err != nil {
+			break
+		}
+	}
+
+	f.EventUnregister(&w)
+
+	if total > 0 {
+		// Queue notification if we wrote anything.
+		f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
+
+	return total, err
+}
+
+// pwritev writes src to f at the given offset through f.Pwritev, waiting for
+// the file to become writable when the first attempt would block and f is not
+// non-blocking.
+//
+// It returns the total number of bytes written across all attempts together
+// with the error of the last step performed (the last Pwritev call, or the
+// wait if that failed). Retrying stops as soon as a Pwritev call completes
+// with anything other than syserror.ErrWouldBlock, including success, so a nil
+// error does not by itself mean that all of src was written. An IN_MODIFY
+// inotify event is queued on f.Dirent whenever at least one byte was written.
+func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	n, err := f.Pwritev(t, src, offset)
+	// Fast path: the first attempt finished (successfully or with a real
+	// error), or the file is non-blocking and the result, ErrWouldBlock
+	// included, goes straight back to the caller.
+	if err != syserror.ErrWouldBlock || f.Flags().NonBlocking {
+		if n > 0 {
+			// Queue notification if we wrote anything.
+			f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	f.EventRegister(&w, EventMaskWrite)
+
+	// total accumulates bytes written by the first attempt and every retry;
+	// it also advances the file offset used by each retry.
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst64(n)
+
+		// Issue the request and break out if it completes with
+		// anything other than "would block".
+		// The write resumes at offset+total, i.e. just past the bytes
+		// already written.
+		n, err = f.Pwritev(t, src, offset+total)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		// If the wait itself fails, its error is what gets returned.
+		if err = t.Block(ch); err != nil {
+			break
+		}
+	}
+
+	f.EventUnregister(&w)
+
+	if total > 0 {
+		// Queue notification if we wrote anything.
+		f.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
+
+	return total, err
+}
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go
new file mode 100644
index 000000000..e865c6fc0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/timespec.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// copyTimespecIn copies a Timespec from the untrusted app range to the kernel.
+//
+// Only architectures with a width of 8 are handled; any other width yields
+// ENOSYS. The 16-byte user structure is read into a buffer obtained from
+// t.CopyScratchBuffer and decoded as two 64-bit words in usermem.ByteOrder:
+// Sec at offset 0 and Nsec at offset 8. If the copy fails, the zero Timespec
+// is returned with the error.
+func copyTimespecIn(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) {
+	if t.Arch().Width() != 8 {
+		return linux.Timespec{}, syserror.ENOSYS
+	}
+
+	buf := t.CopyScratchBuffer(16)
+	if _, err := t.CopyInBytes(addr, buf); err != nil {
+		return linux.Timespec{}, err
+	}
+	return linux.Timespec{
+		Sec:  int64(usermem.ByteOrder.Uint64(buf[0:])),
+		Nsec: int64(usermem.ByteOrder.Uint64(buf[8:])),
+	}, nil
+}
+
+// copyTimespecOut copies a Timespec to the untrusted app range.
+//
+// Only architectures with a width of 8 are handled; any other width yields
+// ENOSYS. Sec and Nsec are encoded as two consecutive 64-bit words in
+// usermem.ByteOrder into a buffer obtained from t.CopyScratchBuffer, which is
+// then copied out to addr. The copy-out error, if any, is returned.
+func copyTimespecOut(t *kernel.Task, addr usermem.Addr, ts *linux.Timespec) error {
+	if t.Arch().Width() != 8 {
+		return syserror.ENOSYS
+	}
+
+	buf := t.CopyScratchBuffer(16)
+	usermem.ByteOrder.PutUint64(buf[0:], uint64(ts.Sec))
+	usermem.ByteOrder.PutUint64(buf[8:], uint64(ts.Nsec))
+	_, err := t.CopyOutBytes(addr, buf)
+	return err
+}
+
+// copyTimevalIn copies a Timeval from the untrusted app range to the kernel.
+//
+// Only architectures with a width of 8 are handled; any other width yields
+// ENOSYS. The 16-byte user structure is read into a buffer obtained from
+// t.CopyScratchBuffer and decoded as two 64-bit words in usermem.ByteOrder:
+// Sec at offset 0 and Usec at offset 8. If the copy fails, the zero Timeval is
+// returned with the error.
+func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) {
+	switch t.Arch().Width() {
+	case 8:
+		in := t.CopyScratchBuffer(16)
+		if _, err := t.CopyInBytes(addr, in); err != nil {
+			return linux.Timeval{}, err
+		}
+		return linux.Timeval{
+			Sec:  int64(usermem.ByteOrder.Uint64(in[0:])),
+			Usec: int64(usermem.ByteOrder.Uint64(in[8:])),
+		}, nil
+	default:
+		// Use the syserror value, as copyTimespecIn and the syscall
+		// handlers in this package do, rather than the raw syscall errno.
+		return linux.Timeval{}, syserror.ENOSYS
+	}
+}
+
+// copyTimevalOut copies a Timeval to the untrusted app range.
+//
+// Only architectures with a width of 8 are handled; any other width yields
+// ENOSYS. Sec and Usec are encoded as two consecutive 64-bit words in
+// usermem.ByteOrder into a buffer obtained from t.CopyScratchBuffer, which is
+// then copied out to addr. The copy-out error, if any, is returned.
+func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error {
+	switch t.Arch().Width() {
+	case 8:
+		out := t.CopyScratchBuffer(16)
+		usermem.ByteOrder.PutUint64(out[0:], uint64(tv.Sec))
+		usermem.ByteOrder.PutUint64(out[8:], uint64(tv.Usec))
+		_, err := t.CopyOutBytes(addr, out)
+		return err
+	default:
+		// Use the syserror value, as copyTimespecOut and the syscall
+		// handlers in this package do, rather than the raw syscall errno.
+		return syserror.ENOSYS
+	}
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// A NULL timespecAddr means "no timeout" and is reported as a negative
+// Duration with a nil error. Otherwise the Timespec is read with
+// copyTimespecIn; a copy failure is returned as-is and a Timespec that fails
+// Valid() yields EINVAL, both with a zero Duration. A valid Timespec is
+// converted through ToNsecCapped, so a value too large for a Duration is
+// clamped rather than overflowing.
+//
+// NOTE(review): EINVAL here is the raw syscall errno, whereas copyTimespecIn
+// returns syserror values; confirm whether this should be syserror.EINVAL.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	if timespecAddr == 0 {
+		// Use a negative Duration to indicate "no timeout".
+		return time.Duration(-1), nil
+	}
+
+	ts, err := copyTimespecIn(t, timespecAddr)
+	if err != nil {
+		return 0, err
+	}
+	if !ts.Valid() {
+		return 0, syscall.EINVAL
+	}
+	return time.Duration(ts.ToNsecCapped()), nil
+}
author	Googler <noreply@google.com>	2018-04-27 10:37:02 -0700
committer	Adin Scannell <ascannell@google.com>	2018-04-28 01:44:26 -0400
commit	d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree	54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/syscalls/linux
parent	f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)