Diffstat (limited to 'pkg/sentry/kernel')
112 files changed, 25856 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD new file mode 100644 index 000000000..25fe1921b --- /dev/null +++ b/pkg/sentry/kernel/BUILD @@ -0,0 +1,241 @@ +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "pending_signals_list", + out = "pending_signals_list.go", + package = "kernel", + prefix = "pendingSignal", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*pendingSignal", + "Linker": "*pendingSignal", + }, +) + +go_template_instance( + name = "process_group_list", + out = "process_group_list.go", + package = "kernel", + prefix = "processGroup", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*ProcessGroup", + "Linker": "*ProcessGroup", + }, +) + +go_template_instance( + name = "seqatomic_taskgoroutineschedinfo", + out = "seqatomic_taskgoroutineschedinfo_unsafe.go", + package = "kernel", + suffix = "TaskGoroutineSchedInfo", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "TaskGoroutineSchedInfo", + }, +) + +go_template_instance( + name = "session_list", + out = "session_list.go", + package = "kernel", + prefix = "session", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Session", + "Linker": "*Session", + }, +) + +go_template_instance( + name = "task_list", + out = "task_list.go", + package = "kernel", + prefix = "task", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Task", + "Linker": "*Task", + }, +) + +go_template_instance( + name = "socket_list", + out = "socket_list.go", + package = "kernel", + prefix = "socket", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*SocketEntry", + "Linker": "*SocketEntry", + }, +) + +proto_library( + name = "uncaught_signal", + srcs = ["uncaught_signal.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_library( + name = "kernel", + srcs = [ + "abstract_socket_namespace.go", + "aio.go", + "context.go", + "fd_table.go", + "fd_table_unsafe.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "kernel_opts.go", + "kernel_state.go", + "pending_signals.go", + "pending_signals_list.go", + "pending_signals_state.go", + "posixtimer.go", + "process_group_list.go", + "ptrace.go", + "ptrace_amd64.go", + "ptrace_arm64.go", + "rseq.go", + "seccomp.go", + "seqatomic_taskgoroutineschedinfo_unsafe.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "socket_list.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_acct.go", + "task_block.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_futex.go", + "task_identity.go", + "task_list.go", + "task_log.go", + "task_net.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_stop.go", + "task_syscall.go", + "task_usermem.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "tty.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + imports = [ + "gvisor.dev/gvisor/pkg/bpf", + "gvisor.dev/gvisor/pkg/sentry/device", + "gvisor.dev/gvisor/pkg/tcpip", + ], + visibility = ["//:sandbox"], + deps = [ + ":uncaught_signal_go_proto", + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/binary", + "//pkg/bits", + "//pkg/bpf", + "//pkg/context", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/fspath", + 
"//pkg/log", + "//pkg/metric", + "//pkg/refs", + "//pkg/safemem", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/pipefs", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/hostcpu", + "//pkg/sentry/inet", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/shm", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/time", + "//pkg/sentry/unimpl", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/state/wire", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/stack", + "//pkg/usermem", + "//pkg/waiter", + "//tools/go_marshal/marshal", + ], +) + +go_test( + name = "kernel_test", + size = "small", + srcs = [ + "fd_table_test.go", + "table_test.go", + "task_test.go", + "timekeeper_test.go", + ], + library = ":kernel", + deps = [ + "//pkg/abi", + "//pkg/context", + "//pkg/sentry/arch", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/filetest", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/pgalloc", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md new file mode 100644 index 000000000..427311be8 --- /dev/null +++ b/pkg/sentry/kernel/README.md @@ -0,0 +1,108 @@ +This package contains: + +- A (partial) emulation of the "core Linux kernel", which governs task + execution and scheduling, system call dispatch, and signal handling. See + below for details. + +- The top-level interface for the sentry's Linux kernel emulation in general, + used by the `main` function of all versions of the sentry. This interface + revolves around the `Env` type (defined in `kernel.go`). + +# Background + +In Linux, each schedulable context is referred to interchangeably as a "task" or +"thread". Tasks can be divided into userspace and kernel tasks. In the sentry, +scheduling is managed by the Go runtime, so each schedulable context is a +goroutine; only "userspace" (application) contexts are referred to as tasks, and +represented by Task objects. (From this point forward, "task" refers to the +sentry's notion of a task unless otherwise specified.) + +At a high level, Linux application threads can be thought of as repeating a "run +loop": + +- Some amount of application code is executed in userspace. + +- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) + causes control flow to switch to the kernel. + +- Some amount of kernel code is executed in kernelspace, e.g. to handle the + cause of the trap. + +- The kernel "returns from the trap" into application code. + +Analogously, each task in the sentry is associated with a *task goroutine* that +executes that task's run loop (`Task.run` in `task_run.go`). 
However, the +sentry's task run loop differs in structure in order to support saving execution +state to, and resuming execution from, checkpoints. + +While in kernelspace, a Linux thread can be descheduled (cease execution) in a +variety of ways: + +- It can yield or be preempted, becoming temporarily descheduled but still + runnable. At present, the sentry delegates scheduling of runnable threads to + the Go runtime. + +- It can exit, becoming permanently descheduled. The sentry's equivalent is + returning from `Task.run`, terminating the task goroutine. + +- It can enter interruptible sleep, a state in which it can be woken by a + caller-defined wakeup or the receipt of a signal. In the sentry, + interruptible sleep (which is ambiguously referred to as *blocking*) is + implemented by making all events that can end blocking (including signal + notifications) communicated via Go channels and using `select` to multiplex + wakeup sources; see `task_block.go`. + +- It can enter uninterruptible sleep, a state in which it can only be woken by + a caller-defined wakeup. Killable sleep is a closely related variant in + which the task can also be woken by SIGKILL. (These definitions also include + Linux's "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" + (`TASK_TRACED`) states.) + +To maximize compatibility with Linux, sentry checkpointing appears as a spurious +signal-delivery interrupt on all tasks; interrupted system calls return `EINTR` +or are automatically restarted as usual. However, these semantics require that +uninterruptible and killable sleeps do not appear to be interrupted. In other +words, the state of the task, including its progress through the interrupted +operation, must be preserved by checkpointing. For many such sleeps, the wakeup +condition is application-controlled, making it infeasible to wait for the sleep +to end before checkpointing. Instead, we must support checkpointing progress +through sleeping operations. + +# Implementation + +We break the task's control flow graph into *states*, delimited by: + +1. Points where uninterruptible and killable sleeps may occur. For example, + there exists a state boundary between signal dequeueing and signal delivery + because there may be an intervening ptrace signal-delivery-stop. + +2. Points where sleep-induced branches may "rejoin" normal execution. For + example, the syscall exit state exists because it can be reached immediately + following a synchronous syscall, or after a task that is sleeping in + `execve()` or `vfork()` resumes execution. + +3. Points containing large branches. This is strictly for organizational + purposes. For example, the state that processes interrupt-signaled + conditions is kept separate from the main "app" state to reduce the size of + the latter. + +4. `SyscallReinvoke`, which does not correspond to anything in Linux, and + exists solely to serve the autosave feature. + + + +States before which a stop may occur are represented as implementations of the +`taskRunState` interface named `run(state)`, allowing them to be saved and +restored. States that cannot be immediately preceded by a stop are simply `Task` +methods named `do(state)`. + +Conditions that can require task goroutines to cease execution for unknown +lengths of time are called *stops*. Stops are divided into *internal stops*, +which are stops whose start and end conditions are implemented within the +sentry, and *external stops*, which are stops whose start and end conditions are +not known to the sentry. 
Hence all uninterruptible and killable sleeps are +internal stops, and the existence of a pending checkpoint operation is an +external stop. Internal stops are reified into instances of the `TaskStop` type, +while external stops are merely counted. The task run loop alternates between +checking for stops and advancing the task's state. This allows checkpointing to +hold tasks in a stopped state while waiting for all tasks in the system to stop. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go new file mode 100644 index 000000000..920fe4329 --- /dev/null +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" +) + +// +stateify savable +type abstractEndpoint struct { + ep transport.BoundEndpoint + wr *refs.WeakRef + name string + ns *AbstractSocketNamespace +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (e *abstractEndpoint) WeakRefGone() { + e.ns.mu.Lock() + if e.ns.endpoints[e.name].ep == e.ep { + delete(e.ns.endpoints, e.name) + } + e.ns.mu.Unlock() +} + +// AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +// +// +stateify savable +type AbstractSocketNamespace struct { + mu sync.Mutex `state:"nosave"` + + // Keeps mapping from name to endpoint. + endpoints map[string]abstractEndpoint +} + +// NewAbstractSocketNamespace returns a new AbstractSocketNamespace. +func NewAbstractSocketNamespace() *AbstractSocketNamespace { + return &AbstractSocketNamespace{ + endpoints: make(map[string]abstractEndpoint), + } +} + +// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on +// its backing object. +type boundEndpoint struct { + transport.BoundEndpoint + rc refs.RefCounter +} + +// Release implements transport.BoundEndpoint.Release. +func (e *boundEndpoint) Release() { + e.rc.DecRef() + e.BoundEndpoint.Release() +} + +// BoundEndpoint retrieves the endpoint bound to the given name. The return +// value is nil if no endpoint was bound. +func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndpoint { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + return nil + } + + rc := ep.wr.Get() + if rc == nil { + delete(a.endpoints, name) + return nil + } + + return &boundEndpoint{ep.ep, rc} +} + +// Bind binds the given socket. +// +// When the last reference managed by rc is dropped, ep may be removed from the +// namespace. 
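//
// A minimal usage sketch (the caller-side names here are assumed for
// illustration, not defined in this file): a UNIX socket bound to the
// abstract name "\0foo" would do roughly
//
//	if err := abstractNS.Bind("foo", boundEP, sockRef); err != nil {
//		return err // EADDRINUSE if a live endpoint already holds the name
//	}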
+func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { + a.mu.Lock() + defer a.mu.Unlock() + + if ep, ok := a.endpoints[name]; ok { + if rc := ep.wr.Get(); rc != nil { + rc.DecRef() + return syscall.EADDRINUSE + } + } + + ae := abstractEndpoint{ep: ep, name: name, ns: a} + ae.wr = refs.NewWeakRef(rc, &ae) + a.endpoints[name] = ae + return nil +} diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go new file mode 100644 index 000000000..0ac78c0b8 --- /dev/null +++ b/pkg/sentry/kernel/aio.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" +) + +// AIOCallback is an function that does asynchronous I/O on behalf of a task. +type AIOCallback func(context.Context) + +// QueueAIO queues an AIOCallback which will be run asynchronously. +func (t *Task) QueueAIO(cb AIOCallback) { + ctx := taskAsyncContext{t: t} + wg := &t.TaskSet().aioGoroutines + wg.Add(1) + go func() { + cb(ctx) + wg.Done() + }() +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + +// Value implements context.Context.Value. 
+func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD new file mode 100644 index 000000000..2bc49483a --- /dev/null +++ b/pkg/sentry/kernel/auth/BUILD @@ -0,0 +1,69 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_credentials", + out = "atomicptr_credentials_unsafe.go", + package = "auth", + suffix = "Credentials", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "Credentials", + }, +) + +go_template_instance( + name = "id_map_range", + out = "id_map_range.go", + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_range", + types = { + "T": "uint32", + }, +) + +go_template_instance( + name = "id_map_set", + out = "id_map_set.go", + consts = { + "minDegree": "3", + }, + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint32", + "Range": "idMapRange", + "Value": "uint32", + "Functions": "idMapFunctions", + }, +) + +go_library( + name = "auth", + srcs = [ + "atomicptr_credentials_unsafe.go", + "auth.go", + "capability_set.go", + "context.go", + "credentials.go", + "id.go", + "id_map.go", + "id_map_functions.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/context", + "//pkg/log", + "//pkg/sync", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go new file mode 100644 index 000000000..847d121aa --- /dev/null +++ b/pkg/sentry/kernel/auth/auth.go @@ -0,0 +1,22 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package auth implements an access control model that is a subset of Linux's. +// +// The auth package supports two kinds of access controls: user/group IDs and +// capabilities. Each resource in the security model is associated with a user +// namespace; "privileged" operations check that the operator's credentials +// have the required user/group IDs or capabilities within the user namespace +// of accessed resources. +package auth diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go new file mode 100644 index 000000000..fc8c6745c --- /dev/null +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -0,0 +1,61 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" +) + +// A CapabilitySet is a set of capabilities implemented as a bitset. The zero +// value of CapabilitySet is a set containing no capabilities. +type CapabilitySet uint64 + +// AllCapabilities is a CapabilitySet containing all valid capabilities. +var AllCapabilities = CapabilitySetOf(linux.CAP_LAST_CAP+1) - 1 + +// CapabilitySetOf returns a CapabilitySet containing only the given +// capability. +func CapabilitySetOf(cp linux.Capability) CapabilitySet { + return CapabilitySet(bits.MaskOf64(int(cp))) +} + +// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. +func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { + var cs uint64 + for _, cp := range cps { + cs |= bits.MaskOf64(int(cp)) + } + return CapabilitySet(cs) +} + +// TaskCapabilities represents all the capability sets for a task. Each of these +// sets is explained in greater detail in capabilities(7). +type TaskCapabilities struct { + // Permitted is a limiting superset for the effective capabilities that + // the thread may assume. + PermittedCaps CapabilitySet + // Inheritable is a set of capabilities preserved across an execve(2). + InheritableCaps CapabilitySet + // Effective is the set of capabilities used by the kernel to perform + // permission checks for the thread. + EffectiveCaps CapabilitySet + // Bounding is a limiting superset for the capabilities that a thread + // can add to its inheritable set using capset(2). + BoundingCaps CapabilitySet + // Ambient is a set of capabilities that are preserved across an + // execve(2) of a program that is not privileged. + AmbientCaps CapabilitySet +} diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go new file mode 100644 index 000000000..ef5723127 --- /dev/null +++ b/pkg/sentry/kernel/auth/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCredentials is a Context.Value key for Credentials. + CtxCredentials contextID = iota +) + +// CredentialsFromContext returns a copy of the Credentials used by ctx, or a +// set of Credentials with no capabilities if ctx does not have Credentials. 
+func CredentialsFromContext(ctx context.Context) *Credentials { + if v := ctx.Value(CtxCredentials); v != nil { + return v.(*Credentials) + } + return NewAnonymousCredentials() +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go new file mode 100644 index 000000000..6862f2ef5 --- /dev/null +++ b/pkg/sentry/kernel/auth/credentials.go @@ -0,0 +1,262 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Credentials contains information required to authorize privileged operations +// in a user namespace. +// +// +stateify savable +type Credentials struct { + // Real/effective/saved user/group IDs in the root user namespace. None of + // these should ever be NoID. + RealKUID KUID + EffectiveKUID KUID + SavedKUID KUID + RealKGID KGID + EffectiveKGID KGID + SavedKGID KGID + + // Filesystem user/group IDs are not implemented. "... setfsuid() is + // nowadays unneeded and should be avoided in new applications (likewise + // for setfsgid(2))." - setfsuid(2) + + // Supplementary groups used by set/getgroups. + // + // ExtraKGIDs slices are immutable, allowing multiple Credentials with the + // same ExtraKGIDs to share the same slice. + ExtraKGIDs []KGID + + // The capability sets applicable to this set of credentials. + PermittedCaps CapabilitySet + InheritableCaps CapabilitySet + EffectiveCaps CapabilitySet + BoundingCaps CapabilitySet + // Ambient capabilities are not introduced until Linux 4.3. + + // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be + // maintained after a switch from root user to non-root user via setuid(). + KeepCaps bool + + // The user namespace associated with the owner of the credentials. + UserNamespace *UserNamespace +} + +// NewAnonymousCredentials returns a set of credentials with no capabilities in +// any user namespace. +func NewAnonymousCredentials() *Credentials { + // Create a new root user namespace. Since the new namespace's owner is + // KUID 0 and the returned credentials have non-zero KUID/KGID, the + // returned credentials do not have any capabilities in the new namespace. + // Since the new namespace is not part of any existing user namespace + // hierarchy, the returned credentials do not have any capabilities in any + // other namespace. + return &Credentials{ + RealKUID: NobodyKUID, + EffectiveKUID: NobodyKUID, + SavedKUID: NobodyKUID, + RealKGID: NobodyKGID, + EffectiveKGID: NobodyKGID, + SavedKGID: NobodyKGID, + UserNamespace: NewRootUserNamespace(), + } +} + +// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. +// global root) in user namespace ns. +func NewRootCredentials(ns *UserNamespace) *Credentials { + // I can't find documentation for this anywhere, but it's correct for the + // inheritable capability set to be initially empty (the capabilities test + // checks for this property). 
+ return &Credentials{ + RealKUID: RootKUID, + EffectiveKUID: RootKUID, + SavedKUID: RootKUID, + RealKGID: RootKGID, + EffectiveKGID: RootKGID, + SavedKGID: RootKGID, + PermittedCaps: AllCapabilities, + EffectiveCaps: AllCapabilities, + BoundingCaps: AllCapabilities, + UserNamespace: ns, + } +} + +// NewUserCredentials returns a set of credentials based on the given UID, GIDs, +// and capabilities in a given namespace. If all arguments are their zero +// values, this returns the same credentials as NewRootCredentials. +func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { + creds := NewRootCredentials(ns) + + // Set the UID. + uid := kuid + creds.RealKUID = uid + creds.EffectiveKUID = uid + creds.SavedKUID = uid + + // Set GID. + gid := kgid + creds.RealKGID = gid + creds.EffectiveKGID = gid + creds.SavedKGID = gid + + // Set additional GIDs. + creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) + + // Set capabilities. + if capabilities != nil { + creds.PermittedCaps = capabilities.PermittedCaps + creds.EffectiveCaps = capabilities.EffectiveCaps + creds.BoundingCaps = capabilities.BoundingCaps + creds.InheritableCaps = capabilities.InheritableCaps + // TODO(nlacasse): Support ambient capabilities. + } else { + // If no capabilities are specified, grant capabilities consistent with + // setresuid + setresgid from NewRootCredentials to the given uid and + // gid. + if kuid == RootKUID { + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + } else { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 + } + creds.BoundingCaps = AllCapabilities + } + + return creds +} + +// Fork generates an identical copy of a set of credentials. +func (c *Credentials) Fork() *Credentials { + nc := new(Credentials) + *nc = *c // Copy-by-value; this is legal for all fields. + return nc +} + +// InGroup returns true if c is in group kgid. Compare Linux's +// kernel/groups.c:in_group_p(). +func (c *Credentials) InGroup(kgid KGID) bool { + if c.EffectiveKGID == kgid { + return true + } + for _, extraKGID := range c.ExtraKGIDs { + if extraKGID == kgid { + return true + } + } + return false +} + +// HasCapabilityIn returns true if c has capability cp in ns. +func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { + for { + // "1. A process has a capability inside a user namespace if it is a member + // of that namespace and it has the capability in its effective capability + // set." - user_namespaces(7) + if c.UserNamespace == ns { + return CapabilitySetOf(cp)&c.EffectiveCaps != 0 + } + // "3. ... A process that resides in the parent of the user namespace and + // whose effective user ID matches the owner of the namespace has all + // capabilities in the namespace." + if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { + return true + } + // "2. If a process has a capability in a user namespace, then it has that + // capability in all child (and further removed descendant) namespaces as + // well." + if ns.parent == nil { + return false + } + ns = ns.parent + } +} + +// HasCapability returns true if c has capability cp in its user namespace. +func (c *Credentials) HasCapability(cp linux.Capability) bool { + return c.HasCapabilityIn(cp, c.UserNamespace) +} + +// UseUID checks that c can use uid in its user namespace, then translates it +// to the root user namespace. 
+// +// The checks UseUID does are common, but you should verify that it's doing +// exactly what you want. +func (c *Credentials) UseUID(uid UID) (KUID, error) { + // uid must be mapped. + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return NoID, syserror.EINVAL + } + // If c has CAP_SETUID, then it can use any UID in its user namespace. + if c.HasCapability(linux.CAP_SETUID) { + return kuid, nil + } + // Otherwise, c must already have the UID as its real, effective, or saved + // set-user-ID. + if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { + return kuid, nil + } + return NoID, syserror.EPERM +} + +// UseGID checks that c can use gid in its user namespace, then translates it +// to the root user namespace. +func (c *Credentials) UseGID(gid GID) (KGID, error) { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return NoID, syserror.EINVAL + } + if c.HasCapability(linux.CAP_SETGID) { + return kgid, nil + } + if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { + return kgid, nil + } + return NoID, syserror.EPERM +} + +// SetUID translates the provided uid to the root user namespace and updates c's +// uids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetUID(uid UID) error { + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + c.RealKUID = kuid + c.EffectiveKUID = kuid + c.SavedKUID = kuid + return nil +} + +// SetGID translates the provided gid to the root user namespace and updates c's +// gids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetGID(gid GID) error { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + c.RealKGID = kgid + c.EffectiveKGID = kgid + c.SavedKGID = kgid + return nil +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go new file mode 100644 index 000000000..0a58ba17c --- /dev/null +++ b/pkg/sentry/kernel/auth/id.go @@ -0,0 +1,121 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" +) + +// UID is a user ID in an unspecified user namespace. +type UID uint32 + +// GID is a group ID in an unspecified user namespace. +type GID uint32 + +// In the root user namespace, user/group IDs have a 1-to-1 relationship with +// the users/groups they represent. In other user namespaces, this is not the +// case; for example, two different unmapped users may both "have" the overflow +// UID. This means that it is generally only valid to compare user and group +// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such +// IDs to emphasize this distinction. ("k" is for "key", as in "unique key". +// Linux also uses the prefix "k", but I think they mean "kernel".) 
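//
// For example (mappings assumed for illustration): in a child namespace that
// maps host UID 1000 to in-namespace UID 0, that user is KUID(1000)
// everywhere, but UID(0) inside the child namespace and UID(1000) in the
// root namespace; a KUID with no mapping in the child has no valid UID there
// and is usually presented as the overflow UID 65534.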
+ +// KUID is a user ID in the root user namespace. +type KUID uint32 + +// KGID is a group ID in the root user namespace. +type KGID uint32 + +const ( + // NoID is uint32(-1). -1 is consistently used as a special value, in Linux + // and by extension in the auth package, to mean "no ID": + // + // - ID mapping returns -1 if the ID is not mapped. + // + // - Most set*id() syscalls accept -1 to mean "do not change this ID". + NoID = math.MaxUint32 + + // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The + // "overflow UID" is usually [1] used when translating a user ID between + // namespaces fails because the ID is not mapped. (We don't implement this + // file, so the overflow UID is constant.) + // + // [1] "There is one notable case where unmapped user and group IDs are not + // converted to the corresponding overflow ID value. When viewing a uid_map + // or gid_map file in which there is no mapping for the second field, that + // field is displayed as 4294967295 (-1 as an unsigned integer);" - + // user_namespaces(7) + OverflowUID = UID(65534) + OverflowGID = GID(65534) + + // NobodyKUID is the user ID usually reserved for the least privileged user + // "nobody". + NobodyKUID = KUID(65534) + NobodyKGID = KGID(65534) + + // RootKUID is the user ID usually used for the most privileged user "root". + RootKUID = KUID(0) + RootKGID = KGID(0) + RootUID = UID(0) + RootGID = GID(0) +) + +// Ok returns true if uid is not -1. +func (uid UID) Ok() bool { + return uid != NoID +} + +// Ok returns true if gid is not -1. +func (gid GID) Ok() bool { + return gid != NoID +} + +// Ok returns true if kuid is not -1. +func (kuid KUID) Ok() bool { + return kuid != NoID +} + +// Ok returns true if kgid is not -1. +func (kgid KGID) Ok() bool { + return kgid != NoID +} + +// OrOverflow returns uid if it is valid and the overflow UID otherwise. +func (uid UID) OrOverflow() UID { + if uid.Ok() { + return uid + } + return OverflowUID +} + +// OrOverflow returns gid if it is valid and the overflow GID otherwise. +func (gid GID) OrOverflow() GID { + if gid.Ok() { + return gid + } + return OverflowGID +} + +// In translates kuid into user namespace ns. If kuid is not mapped in ns, In +// returns NoID. +func (kuid KUID) In(ns *UserNamespace) UID { + return ns.MapFromKUID(kuid) +} + +// In translates kgid into user namespace ns. If kgid is not mapped in ns, In +// returns NoID. +func (kgid KGID) In(ns *UserNamespace) GID { + return ns.MapFromKGID(kgid) +} diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go new file mode 100644 index 000000000..28cbe159d --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map.go @@ -0,0 +1,285 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. 
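//
// Translation composes through the namespace chain: the root-namespace ID is
// first mapped by the parent namespace and the result is then mapped by ns
// itself. For example (mappings assumed), KUID 1000 might map to UID 100 in
// the parent and then to UID 0 in ns; if any step has no mapping, the result
// is NoID.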
+func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { + if ns.parent == nil { + return UID(kuid) + } + return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) +} + +// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. +func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { + if ns.parent == nil { + return GID(kgid) + } + return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) +} + +// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. +func (ns *UserNamespace) MapToKUID(uid UID) KUID { + if ns.parent == nil { + return KUID(uid) + } + return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) +} + +// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. +func (ns *UserNamespace) MapToKGID(gid GID) KGID { + if ns.parent == nil { + return KGID(gid) + } + return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) +} + +func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { + if id == NoID { + return NoID + } + ns.mu.Lock() + defer ns.mu.Unlock() + if it := m.FindSegment(id); it.Ok() { + return it.Value() + (id - it.Start()) + } + return NoID +} + +// allIDsMapped returns true if all IDs in the range [start, end) are mapped in +// m. +// +// Preconditions: end >= start. +func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { + ns.mu.Lock() + defer ns.mu.Unlock() + return m.SpanRange(idMapRange{start, end}) == end-start +} + +// An IDMapEntry represents a mapping from a range of contiguous IDs in a user +// namespace to an equally-sized range of contiguous IDs in the namespace's +// parent. +// +// +stateify savable +type IDMapEntry struct { + // FirstID is the first ID in the range in the namespace. + FirstID uint32 + + // FirstParentID is the first ID in the range in the parent namespace. + FirstParentID uint32 + + // Length is the number of IDs in the range. + Length uint32 +} + +// SetUIDMap instructs ns to translate UIDs as specified by entries. +// +// Note: SetUIDMap does not place an upper bound on the number of entries, but +// Linux does. This restriction is implemented in SetUIDMap's caller, the +// implementation of /proc/[pid]/uid_map. +func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + // "After the creation of a new user namespace, the uid_map file of *one* + // of the processes in the namespace may be written to *once* to define the + // mapping of user IDs in the new user namespace. An attempt to write more + // than once to a uid_map file in a user namespace fails with the error + // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) + if !ns.uidMapFromParent.IsEmpty() { + return syserror.EPERM + } + // "At least one line must be written to the file." + if len(entries) == 0 { + return syserror.EINVAL + } + // """ + // In order for a process to write to the /proc/[pid]/uid_map + // (/proc/[pid]/gid_map) file, all of the following requirements must be + // met: + // + // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability + // in the user namespace of the process pid. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { + return syserror.EPERM + } + // "2. The writing process must either be in the user namespace of the process + // pid or be in the parent user namespace of the process pid." 
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + // """ + // 3. (see trySetUIDMap) + // + // 4. One of the following two cases applies: + // + // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability + // in the parent user namespace. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { + // """ + // * Or otherwise all of the following restrictions apply: + // + // + The data written to uid_map (gid_map) must consist of a single line + // that maps the writing process' effective user ID (group ID) in the + // parent user namespace to a user ID (group ID) in the user namespace. + // """ + if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { + return syserror.EPERM + } + // """ + // + The writing process must have the same effective user ID as the + // process that created the user namespace. + // """ + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + } + // trySetUIDMap leaves data in maps if it fails. + if err := ns.trySetUIDMap(entries); err != nil { + ns.uidMapFromParent.RemoveAll() + ns.uidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { + for _, e := range entries { + // Determine upper bounds and check for overflow. This implicitly + // checks for NoID. + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + // "3. The mapped user IDs (group IDs) must in turn have a mapping in + // the parent user namespace." + // Only the root namespace has a nil parent, and root is assigned + // mappings when it's created, so SetUIDMap would have returned EPERM + // without reaching this point if ns is root. + if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + // If either of these Adds fail, we have an overlapping range. + if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// SetGIDMap instructs ns to translate GIDs as specified by entries. +func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + if !ns.gidMapFromParent.IsEmpty() { + return syserror.EPERM + } + if len(entries) == 0 { + return syserror.EINVAL + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { + return syserror.EPERM + } + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { + if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { + return syserror.EPERM + } + // It's correct for this to still be UID. + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + // "In the case of gid_map, use of the setgroups(2) system call must + // first be denied by writing "deny" to the /proc/[pid]/setgroups file + // (see below) before writing to gid_map." (This file isn't implemented + // in the version of Linux we're emulating; see comment in + // UserNamespace.) 
+ } + if err := ns.trySetGIDMap(entries); err != nil { + ns.gidMapFromParent.RemoveAll() + ns.gidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { + for _, e := range entries { + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// UIDMap returns the user ID mappings configured for ns. If no mappings +// have been configured, UIDMap returns nil. +func (ns *UserNamespace) UIDMap() []IDMapEntry { + return ns.getIDMap(&ns.uidMapToParent) +} + +// GIDMap returns the group ID mappings configured for ns. If no mappings +// have been configured, GIDMap returns nil. +func (ns *UserNamespace) GIDMap() []IDMapEntry { + return ns.getIDMap(&ns.gidMapToParent) +} + +func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { + ns.mu.Lock() + defer ns.mu.Unlock() + var entries []IDMapEntry + for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { + entries = append(entries, IDMapEntry{ + FirstID: it.Start(), + FirstParentID: it.Value(), + Length: it.Range().Length(), + }) + } + return entries +} diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go new file mode 100644 index 000000000..432dbfb6d --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -0,0 +1,45 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +// idMapFunctions "implements" generic interface segment.Functions for +// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one +// user namespace to non-overlapping ranges of contiguous IDs in another user +// namespace. Each such ID mapping is implemented as a range-to-value mapping +// in the set such that [range.Start(), range.End()) => [value, value + +// range.Length()). +type idMapFunctions struct{} + +func (idMapFunctions) MinKey() uint32 { + return 0 +} + +func (idMapFunctions) MaxKey() uint32 { + return NoID +} + +func (idMapFunctions) ClearValue(*uint32) {} + +func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { + // Mapped ranges have to be contiguous. 
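	// For example (values assumed): the adjacent segments [0, 5) => 100 and
	// [5, 8) => 105 merge into the single segment [0, 8) => 100, while
	// [0, 5) => 100 and [5, 8) => 200 do not, since the combined range would
	// no longer be a single ID-plus-constant-offset translation.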
+ if val1+r1.Length() != val2 { + return 0, false + } + return val1, true +} + +func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { + return val, val + (split - r.Start) +} diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go new file mode 100644 index 000000000..9dd52c860 --- /dev/null +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -0,0 +1,129 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A UserNamespace represents a user namespace. See user_namespaces(7) for +// details. +// +// +stateify savable +type UserNamespace struct { + // parent is this namespace's parent. If this is the root namespace, parent + // is nil. The parent pointer is immutable. + parent *UserNamespace + + // owner is the effective UID of the namespace's creator in the root + // namespace. owner is immutable. + owner KUID + + // mu protects the following fields. + // + // If mu will be locked in multiple UserNamespaces, it must be locked in + // descendant namespaces before ancestors. + mu sync.Mutex `state:"nosave"` + + // Mappings of user/group IDs between this namespace and its parent. + // + // All ID maps, once set, cannot be changed. This means that successful + // UID/GID translations cannot be racy. + uidMapFromParent idMapSet + uidMapToParent idMapSet + gidMapFromParent idMapSet + gidMapToParent idMapSet + + // TODO(b/27454212): Support disabling setgroups(2). +} + +// NewRootUserNamespace returns a UserNamespace that is appropriate for a +// system's root user namespace. +func NewRootUserNamespace() *UserNamespace { + var ns UserNamespace + // """ + // The initial user namespace has no parent namespace, but, for + // consistency, the kernel provides dummy user and group ID mapping files + // for this namespace. Looking at the uid_map file (gid_map is the same) + // from a shell in the initial namespace shows: + // + // $ cat /proc/$$/uid_map + // 0 0 4294967295 + // """ - user_namespaces(7) + for _, m := range []*idMapSet{ + &ns.uidMapFromParent, + &ns.uidMapToParent, + &ns.gidMapFromParent, + &ns.gidMapToParent, + } { + if !m.Add(idMapRange{0, math.MaxUint32}, 0) { + panic("Failed to insert into empty ID map") + } + } + return &ns +} + +// Root returns the root of the user namespace tree containing ns. +func (ns *UserNamespace) Root() *UserNamespace { + for ns.parent != nil { + ns = ns.parent + } + return ns +} + +// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user +// namespaces." - user_namespaces(7) +const maxUserNamespaceDepth = 32 + +func (ns *UserNamespace) depth() int { + var i int + for ns != nil { + i++ + ns = ns.parent + } + return i +} + +// NewChildUserNamespace returns a new user namespace created by a caller with +// credentials c. 
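//
// A sketch of a typical caller (e.g. handling CLONE_NEWUSER; the surrounding
// control flow is assumed, not part of this package):
//
//	newNS, err := creds.NewChildUserNamespace()
//	if err != nil {
//		return err // EUSERS if nested too deeply, EPERM if creds are unmapped
//	}
//	// The child namespace starts with no ID mappings; they are established
//	// later via SetUIDMap/SetGIDMap (i.e. writes to /proc/[pid]/uid_map and
//	// gid_map).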
+func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { + if c.UserNamespace.depth() >= maxUserNamespaceDepth { + // "... Calls to unshare(2) or clone(2) that would cause this limit to + // be exceeded fail with the error EUSERS." - user_namespaces(7) + return nil, syserror.EUSERS + } + // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective + // user ID or the effective group ID of the caller does not have a mapping + // in the parent namespace (see user_namespaces(7))." - clone(2) + // "CLONE_NEWUSER requires that the user ID and group ID of the calling + // process are mapped to user IDs and group IDs in the user namespace of + // the calling process at the time of the call." - unshare(2) + if !c.EffectiveKUID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + if !c.EffectiveKGID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + return &UserNamespace{ + parent: c.UserNamespace, + owner: c.EffectiveKUID, + // "When a user namespace is created, it starts without a mapping of + // user IDs (group IDs) to the parent user namespace." - + // user_namespaces(7) + }, nil +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go new file mode 100644 index 000000000..dd5f0f5fa --- /dev/null +++ b/pkg/sentry/kernel/context.go @@ -0,0 +1,114 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCanTrace is a Context.Value key for a function with the same + // signature and semantics as kernel.Task.CanTrace. + CtxCanTrace contextID = iota + + // CtxKernel is a Context.Value key for a Kernel. + CtxKernel + + // CtxPIDNamespace is a Context.Value key for a PIDNamespace. + CtxPIDNamespace + + // CtxTask is a Context.Value key for a Task. + CtxTask + + // CtxUTSNamespace is a Context.Value key for a UTSNamespace. + CtxUTSNamespace + + // CtxIPCNamespace is a Context.Value key for a IPCNamespace. + CtxIPCNamespace +) + +// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense +// as kernel.Task.CanTrace. +func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { + if v := ctx.Value(CtxCanTrace); v != nil { + return v.(func(*Task, bool) bool)(t, attach) + } + return false +} + +// KernelFromContext returns the Kernel in which ctx is executing, or nil if +// there is no such Kernel. +func KernelFromContext(ctx context.Context) *Kernel { + if v := ctx.Value(CtxKernel); v != nil { + return v.(*Kernel) + } + return nil +} + +// PIDNamespaceFromContext returns the PID namespace in which ctx is executing, +// or nil if there is no such PID namespace. 
+func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { + if v := ctx.Value(CtxPIDNamespace); v != nil { + return v.(*PIDNamespace) + } + return nil +} + +// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, +// or nil if there is no such UTS namespace. +func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { + if v := ctx.Value(CtxUTSNamespace); v != nil { + return v.(*UTSNamespace) + } + return nil +} + +// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, +// or nil if there is no such IPC namespace. +func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { + if v := ctx.Value(CtxIPCNamespace); v != nil { + return v.(*IPCNamespace) + } + return nil +} + +// TaskFromContext returns the Task associated with ctx, or nil if there is no +// such Task. +func TaskFromContext(ctx context.Context) *Task { + if v := ctx.Value(CtxTask); v != nil { + return v.(*Task) + } + return nil +} + +// Deadline implements context.Context.Deadline. +func (*Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done implements context.Context.Done. +func (*Task) Done() <-chan struct{} { + return nil +} + +// Err implements context.Context.Err. +func (*Task) Err() error { + return nil +} diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD new file mode 100644 index 000000000..9d26392c0 --- /dev/null +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "contexttest", + testonly = 1, + srcs = ["contexttest.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/kernel", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + ], +) diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go new file mode 100644 index 000000000..22c340e56 --- /dev/null +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -0,0 +1,40 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contexttest provides a test context.Context which includes +// a dummy kernel pointing to a valid platform. +package contexttest + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +// Context returns a Context that may be used in tests. Uses ptrace as the +// platform.Platform, and provides a stub kernel that only serves to point to +// the platform. 
+func Context(tb testing.TB) context.Context { + ctx := contexttest.Context(tb) + k := &kernel.Kernel{ + Platform: platform.FromContext(ctx), + } + k.SetMemoryFile(pgalloc.MemoryFileFromContext(ctx)) + ctx.(*contexttest.TestContext).RegisterValue(kernel.CtxKernel, k) + return ctx +} diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD new file mode 100644 index 000000000..75eedd5a2 --- /dev/null +++ b/pkg/sentry/kernel/epoll/BUILD @@ -0,0 +1,51 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "epoll_list", + out = "epoll_list.go", + package = "epoll", + prefix = "pollEntry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*pollEntry", + "Linker": "*pollEntry", + }, +) + +go_library( + name = "epoll", + srcs = [ + "epoll.go", + "epoll_list.go", + "epoll_state.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/refs", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sync", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "epoll_test", + size = "small", + srcs = [ + "epoll_test.go", + ], + library = ":epoll", + deps = [ + "//pkg/sentry/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go new file mode 100644 index 000000000..4c0f1e41f --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -0,0 +1,462 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epoll provides an implementation of Linux's IO event notification +// facility. See epoll(7) for more details. +package epoll + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EntryFlags is a bitmask that holds an entry's flags. +type EntryFlags int + +// Valid entry flags. +const ( + OneShot EntryFlags = 1 << iota + EdgeTriggered +) + +// FileIdentifier identifies a file. We cannot use just the FD because it could +// potentially be reassigned. We also cannot use just the file pointer because +// it is possible to have multiple entries for the same file object as long as +// they are created with different FDs (i.e., the FDs point to the same file). +// +// +stateify savable +type FileIdentifier struct { + File *fs.File `state:"wait"` + Fd int32 +} + +// pollEntry holds all the state associated with an event poll entry, that is, +// a file being observed by an event poll object. 
+// +// +stateify savable +type pollEntry struct { + pollEntryEntry + file *refs.WeakRef `state:"manual"` + id FileIdentifier `state:"wait"` + userData [2]int32 + waiter waiter.Entry `state:"manual"` + mask waiter.EventMask + flags EntryFlags + + epoll *EventPoll + + // We cannot save the current list pointer as it points into EventPoll + // struct, while state framework currently does not support such + // in-struct pointers. Instead, EventPoll will properly set this field + // in its loading logic. + curList *pollEntryList `state:"nosave"` +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +// weakReferenceGone is called when the file in the weak reference is destroyed. +// The poll entry is removed in response to this. +func (p *pollEntry) WeakRefGone() { + p.epoll.RemoveEntry(p.id) +} + +// EventPoll holds all the state associated with an event poll object, that is, +// collection of files to observe and their current state. +// +// +stateify savable +type EventPoll struct { + fsutil.FilePipeSeek `state:"zerovalue"` + fsutil.FileNotDirReaddir `state:"zerovalue"` + fsutil.FileNoFsync `state:"zerovalue"` + fsutil.FileNoopFlush `state:"zerovalue"` + fsutil.FileNoIoctl `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // Wait queue is used to notify interested parties when the event poll + // object itself becomes readable or writable. + waiter.Queue `state:"zerovalue"` + + // files is the map of all the files currently being observed, it is + // protected by mu. + mu sync.Mutex `state:"nosave"` + files map[FileIdentifier]*pollEntry + + // listsMu protects manipulation of the lists below. It needs to be a + // different lock to avoid circular lock acquisition order involving + // the wait queue mutexes and mu. The full order is mu, observed file + // wait queue mutex, then listsMu; this allows listsMu to be acquired + // when (*pollEntry).Callback is called. + // + // An entry is always in one of the following lists: + // readyList -- when there's a chance that it's ready to have + // events delivered to epoll waiters. Given that being + // ready is a transient state, the Readiness() and + // readEvents() functions always call the entry's file + // Readiness() function to confirm it's ready. + // waitingList -- when there's no chance that the entry is ready, + // so it's waiting for the (*pollEntry).Callback to be called + // on it before it gets moved to the readyList. + // disabledList -- when the entry is disabled. This happens when + // a one-shot entry gets delivered via readEvents(). + listsMu sync.Mutex `state:"nosave"` + readyList pollEntryList + waitingList pollEntryList + disabledList pollEntryList +} + +// cycleMu is used to serialize all the cycle checks. This is only used when +// an event poll file is added as an entry to another event poll. Such checks +// are serialized to avoid lock acquisition order inversion: if a thread is +// adding A to B, and another thread is adding B to A, each would acquire A's +// and B's mutexes in reverse order, and could cause deadlocks. Having this +// lock prevents this by allowing only one check at a time to happen. +// +// We do the cycle check to prevent callers from introducing potentially +// infinite recursions. If a caller were to add A to B and then B to A, for +// event poll A to know if it's readable, it would need to check event poll B, +// which in turn would need event poll A and so on indefinitely. 
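// exampleCycle is a hypothetical sketch (not part of this change) of the
// scenario described above: once event poll A observes event poll B, making
// B observe A would close the loop, so the second AddEntry below is expected
// to fail with ELOOP (see AddEntry's cycle check further down). The FD
// numbers are arbitrary.
func exampleCycle(ctx context.Context) error {
	fileA, fileB := NewEventPoll(ctx), NewEventPoll(ctx)
	a := fileA.FileOperations.(*EventPoll)
	b := fileB.FileOperations.(*EventPoll)
	// A observes B.
	if err := a.AddEntry(FileIdentifier{File: fileB, Fd: 4}, 0, waiter.EventIn, [2]int32{}); err != nil {
		return err
	}
	// B observing A would create a cycle; this returns syscall.ELOOP.
	return b.AddEntry(FileIdentifier{File: fileA, Fd: 5}, 0, waiter.EventIn, [2]int32{})
}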
+var cycleMu sync.Mutex + +// NewEventPoll allocates and initializes a new event poll object. +func NewEventPoll(ctx context.Context) *fs.File { + // name matches fs/eventpoll.c:epoll_create1. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ + files: make(map[FileIdentifier]*pollEntry), + }) +} + +// Release implements fs.FileOperations.Release. +func (e *EventPoll) Release() { + // We need to take the lock now because files may be attempting to + // remove entries in parallel if they get destroyed. + e.mu.Lock() + defer e.mu.Unlock() + + // Go through all entries and clean up. + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + entry.file.Drop() + } + e.files = nil +} + +// Read implements fs.FileOperations.Read. +func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// Write implements fs.FileOperations.Write. +func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// eventsAvailable determines if 'e' has events available for delivery. +func (e *EventPoll) eventsAvailable() bool { + e.listsMu.Lock() + + for it := e.readyList.Front(); it != nil; { + entry := it + it = it.Next() + + // If the entry is ready, we know 'e' has at least one entry + // ready for delivery. + ready := entry.id.File.Readiness(entry.mask) + if ready != 0 { + e.listsMu.Unlock() + return true + } + + // Entry is not ready, so move it to waiting list. + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } + + e.listsMu.Unlock() + + return false +} + +// Readiness determines if the event poll object is currently readable (i.e., +// if there are pending events for delivery). +func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + if (mask&waiter.EventIn) != 0 && e.eventsAvailable() { + ready |= waiter.EventIn + } + + return ready +} + +// ReadEvents returns up to max available events. +func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { + var local pollEntryList + var ret []linux.EpollEvent + + e.listsMu.Lock() + + // Go through all entries we believe may be ready. + for it := e.readyList.Front(); it != nil && len(ret) < max; { + entry := it + it = it.Next() + + // Check the entry's readiness. It it's not really ready, we + // just put it back in the waiting list and move on to the next + // entry. + ready := entry.id.File.Readiness(entry.mask) & entry.mask + if ready == 0 { + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + + continue + } + + // Add event to the array that will be returned to caller. + ret = append(ret, linux.EpollEvent{ + Events: uint32(ready), + Data: entry.userData, + }) + + // The entry is consumed, so we must move it to the disabled + // list in case it's one-shot, or back to the wait list if it's + // edge-triggered. If it's neither, we leave it in the ready + // list so that its readiness can be checked the next time + // around; however, we must move it to the end of the list so + // that other events can be delivered as well. 
+ e.readyList.Remove(entry) + if entry.flags&OneShot != 0 { + e.disabledList.PushBack(entry) + entry.curList = &e.disabledList + } else if entry.flags&EdgeTriggered != 0 { + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } else { + local.PushBack(entry) + } + } + + e.readyList.PushBackList(&local) + + e.listsMu.Unlock() + + return ret +} + +// Callback implements waiter.EntryCallback.Callback. +// +// Callback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +func (p *pollEntry) Callback(*waiter.Entry) { + e := p.epoll + + e.listsMu.Lock() + + if p.curList == &e.waitingList { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList + e.listsMu.Unlock() + + e.Notify(waiter.EventIn) + return + } + + e.listsMu.Unlock() +} + +// initEntryReadiness initializes the entry's state with regards to its +// readiness by placing it in the appropriate list and registering for +// notifications. +func (e *EventPoll) initEntryReadiness(entry *pollEntry) { + // A new entry starts off in the waiting list. + e.listsMu.Lock() + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + e.listsMu.Unlock() + + // Register for event notifications. + f := entry.id.File + f.EventRegister(&entry.waiter, entry.mask) + + // Check if the file happens to already be in a ready state. + ready := f.Readiness(entry.mask) & entry.mask + if ready != 0 { + entry.Callback(&entry.waiter) + } +} + +// observes checks if event poll object e is directly or indirectly observing +// event poll object ep. It uses a bounded recursive depth-first search. +func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { + // If we reached the maximum depth, we'll consider that we found it + // because we don't want to allow chains that are too long. + if depthLeft <= 0 { + return true + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Go through each observed file and check if it is or observes ep. + for id := range e.files { + f, ok := id.File.FileOperations.(*EventPoll) + if !ok { + continue + } + + if f == ep || f.observes(ep, depthLeft-1) { + return true + } + } + + return false +} + +// AddEntry adds a new file to the collection of files observed by e. +func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + // Acquire cycle check lock if another event poll is being added. + ep, ok := id.File.FileOperations.(*EventPoll) + if ok { + cycleMu.Lock() + defer cycleMu.Unlock() + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file already has an entry. + if _, ok := e.files[id]; ok { + return syscall.EEXIST + } + + // Check if a cycle would be created. We use 4 as the limit because + // that's the value used by linux and we want to emulate it. + if ep != nil { + if e == ep { + return syscall.EINVAL + } + + if ep.observes(e, 4) { + return syscall.ELOOP + } + } + + // Create new entry and add it to map. + // + // N.B. Even though we are creating a weak reference here, we know it + // won't trigger a callback because we hold a reference to the file + // throughout the execution of this function. + entry := &pollEntry{ + id: id, + userData: data, + epoll: e, + flags: flags, + mask: mask, + } + entry.waiter.Callback = entry + e.files[id] = entry + entry.file = refs.NewWeakRef(id.File, entry) + + // Initialize the readiness state of the new entry. 
+ e.initEntryReadiness(entry) + + return nil +} + +// UpdateEntry updates the flags, mask and user data associated with a file that +// is already part of the collection of observed files. +func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister the old mask and remove entry from the list it's in, so + // (*pollEntry).Callback is guaranteed to not be called on this entry anymore. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove entry from whatever list it's in. This ensure that no other + // threads have access to this entry as the only way left to find it + // is via e.files, but we hold e.mu, which prevents that. + e.listsMu.Lock() + entry.curList.Remove(entry) + e.listsMu.Unlock() + + // Initialize new readiness state. + entry.flags = flags + entry.mask = mask + entry.userData = data + e.initEntryReadiness(entry) + + return nil +} + +// RemoveEntry a files from the collection of observed files. +func (e *EventPoll) RemoveEntry(id FileIdentifier) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister from file first so that no concurrent attempts will be + // made to manipulate the file. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove from the current list. + e.listsMu.Lock() + entry.curList.Remove(entry) + entry.curList = nil + e.listsMu.Unlock() + + // Remove file from map, and drop weak reference. + delete(e.files, id) + entry.file.Drop() + + return nil +} + +// UnregisterEpollWaiters removes the epoll waiter objects from the waiting +// queues. This is different from Release() as the file is not dereferenced. +func (e *EventPoll) UnregisterEpollWaiters() { + e.mu.Lock() + defer e.mu.Unlock() + + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go new file mode 100644 index 000000000..7c61e0258 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -0,0 +1,51 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/waiter" +) + +// afterLoad is invoked by stateify. +func (p *pollEntry) afterLoad() { + p.waiter.Callback = p + p.file = refs.NewWeakRef(p.id.File, p) + p.id.File.EventRegister(&p.waiter, p.mask) +} + +// afterLoad is invoked by stateify. 
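// exampleWatch is a hypothetical sketch (not part of this change) of the
// basic lifecycle implemented in epoll.go: register a file, drain whatever is
// ready, then drop the entry. "file" stands in for any *fs.File being
// observed; the FD number is arbitrary.
func exampleWatch(ctx context.Context, file *fs.File) []linux.EpollEvent {
	ep := NewEventPoll(ctx).FileOperations.(*EventPoll)
	id := FileIdentifier{File: file, Fd: 3}
	if err := ep.AddEntry(id, EdgeTriggered, waiter.EventIn, [2]int32{}); err != nil {
		return nil
	}
	events := ep.ReadEvents(16) // Deliver at most 16 events per call.
	ep.RemoveEntry(id)
	return events
}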
+func (e *EventPoll) afterLoad() { + e.listsMu.Lock() + defer e.listsMu.Unlock() + + for _, ls := range []*pollEntryList{&e.waitingList, &e.readyList, &e.disabledList} { + for it := ls.Front(); it != nil; it = it.Next() { + it.curList = ls + } + } + + for it := e.waitingList.Front(); it != nil; { + entry := it + it = it.Next() + + if entry.id.File.Readiness(entry.mask) != 0 { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList + e.Notify(waiter.EventIn) + } + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go new file mode 100644 index 000000000..22630e9c5 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -0,0 +1,54 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestFileDestroyed(t *testing.T) { + f := filetest.NewTestFile(t) + id := FileIdentifier{f, 12} + + efile := NewEventPoll(contexttest.Context(t)) + e := efile.FileOperations.(*EventPoll) + if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { + t.Fatalf("addEntry failed: %v", err) + } + + // Check that we get an event reported twice in a row. + evt := e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + evt = e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + // Destroy the file. Check that we get no more events. + f.DecRef() + + evt = e.ReadEvents(1) + if len(evt) != 0 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 0, len(evt)) + } + +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD new file mode 100644 index 000000000..9983a32e5 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/sentry/contexttest", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go new file mode 100644 index 000000000..87951adeb --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -0,0 +1,285 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd provides an implementation of Linux's file-based event +// notification. +package eventfd + +import ( + "math" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventOperations represents an event with the semantics of Linux's file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in certain +// situations they may be converted into a host-backed eventfd. +// +// +stateify savable +type EventOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // Mutex that protects accesses to the fields of this event. + mu sync.Mutex `state:"nosave"` + + // Queue is used to notify interested parties when the event object + // becomes readable or writable. + wq waiter.Queue `state:"zerovalue"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +// New creates a new event object with the supplied initial value and mode. +func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { + // name matches fs/eventfd.c:eventfd_file_create. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ + val: initVal, + semMode: semMode, + hostfd: -1, + }) +} + +// HostFD returns the host eventfd associated with this event. +func (e *EventOperations) HostFD() (int, error) { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + return e.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if e.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, err := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(e.val), uintptr(flags), 0) + if err != 0 { + return -1, err + } + + if err := fdnotifier.AddFD(int32(fd), &e.wq); err != nil { + syscall.Close(int(fd)) + return -1, err + } + + e.hostfd = int(fd) + return e.hostfd, nil +} + +// Release implements fs.FileOperations.Release. +func (e *EventOperations) Release() { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.RemoveFD(int32(e.hostfd)) + syscall.Close(e.hostfd) + e.hostfd = -1 + } +} + +// Read implements fs.FileOperations.Read. 
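// exampleHostBacked is a hypothetical sketch (not part of this change): once
// HostFD is called, the event is backed by a real host eventfd, and later
// reads and writes are forwarded to it instead of the in-sentry counter.
func exampleHostBacked(ctx context.Context) (int, error) {
	e := New(ctx, 0 /* initVal */, false /* semMode */).FileOperations.(*EventOperations)
	return e.HostFD() // Creates the host eventfd on first use.
}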
+func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements fs.FileOperations.Write. +func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Must be called with e.mu locked. +func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + + if _, err := syscall.Read(e.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { + e.mu.Lock() + + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostRead(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if e.val == 0 { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if e.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + e.val-- + } else { + val = e.val + e.val = 0 + } + + e.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + e.wq.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Must be called with e.mu locked. +func (e *EventOperations) hostWrite(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(e.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return e.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (e *EventOperations) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + e.mu.Lock() + + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostWrite(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-e.val { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + e.val += val + e.mu.Unlock() + + // Always trigger a notification. + e.wq.Notify(waiter.EventIn) + + return nil +} + +// Readiness returns the ready events for the event fd. +func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + e.mu.Lock() + if e.hostfd >= 0 { + defer e.mu.Unlock() + return fdnotifier.NonBlockingPoll(int32(e.hostfd), mask) + } + + ready := waiter.EventMask(0) + if e.val > 0 { + ready |= waiter.EventIn + } + + if e.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + e.mu.Unlock() + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. 
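// exampleReadModes is a hypothetical sketch (not part of this change)
// contrasting the two counter modes handled by read() above: a semaphore-mode
// read consumes exactly 1, while a default-mode read returns the whole
// counter and resets it to 0.
func exampleReadModes(ctx context.Context) (uint64, uint64) {
	readCounter := func(ev *fs.File) uint64 {
		e := ev.FileOperations.(*EventOperations)
		var buf [8]byte
		if _, err := e.Read(ctx, nil, usermem.BytesIOSequence(buf[:]), 0); err != nil {
			return 0
		}
		return usermem.ByteOrder.Uint64(buf[:])
	}
	semVal := readCounter(New(ctx, 3, true /* semMode */))    // Reads 1, leaving 2.
	plainVal := readCounter(New(ctx, 3, false /* semMode */)) // Reads 3, leaving 0.
	return semVal, plainVal
}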
+func (e *EventOperations) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + e.wq.EventRegister(entry, mask) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *EventOperations) EventUnregister(entry *waiter.Entry) { + e.wq.EventUnregister(entry) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go new file mode 100644 index 000000000..9b4892f74 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventfd(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, initVal, false) + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + event.EventRegister(&w, waiter.EventIn) + defer event.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := event.Writev(ctx, usermem.BytesIOSequence(data)) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventfdStat(t *testing.T) { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, 0, false) + + // Create and submit an stat request. 
+ uattr, err := event.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("eventfd stat request failed: %v", err) + } + if uattr.Size != 0 { + t.Fatal("EventFD size should be 0") + } +} diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD new file mode 100644 index 000000000..2b3955598 --- /dev/null +++ b/pkg/sentry/kernel/fasync/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "fasync", + srcs = ["fasync.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go new file mode 100644 index 000000000..153d2cd9b --- /dev/null +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -0,0 +1,188 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fasync provides FIOASYNC related functionality. +package fasync + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +// New creates a new fs.FileAsync. +func New() fs.FileAsync { + return &FileAsync{} +} + +// NewVFS2 creates a new vfs.FileAsync. +func NewVFS2() vfs.FileAsync { + return &FileAsync{} +} + +// FileAsync sends signals when the registered file is ready for IO. +// +// +stateify savable +type FileAsync struct { + // e is immutable after first use (which is protected by mu below). + e waiter.Entry + + // regMu protects registeration and unregistration actions on e. + // + // regMu must be held while registration decisions are being made + // through the registration action itself. + // + // Lock ordering: regMu, mu. + regMu sync.Mutex `state:"nosave"` + + // mu protects all following fields. + // + // Lock ordering: e.mu, mu. + mu sync.Mutex `state:"nosave"` + requester *auth.Credentials + registered bool + + // Only one of the following is allowed to be non-nil. + recipientPG *kernel.ProcessGroup + recipientTG *kernel.ThreadGroup + recipientT *kernel.Task +} + +// Callback sends a signal. +func (a *FileAsync) Callback(e *waiter.Entry) { + a.mu.Lock() + if !a.registered { + a.mu.Unlock() + return + } + t := a.recipientT + tg := a.recipientTG + if a.recipientPG != nil { + tg = a.recipientPG.Originator() + } + if tg != nil { + t = tg.Leader() + } + if t == nil { + // No recipient has been registered. + a.mu.Unlock() + return + } + c := t.Credentials() + // Logic from sigio_perm in fs/fcntl.c. 
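	// That is: the signal is delivered only if the requester is root
	// (effective KUID 0), or if the requester's effective or real KUID
	// matches the recipient's real or saved KUID; otherwise it is quietly
	// dropped, as in Linux.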
+ if a.requester.EffectiveKUID == 0 || + a.requester.EffectiveKUID == c.SavedKUID || + a.requester.EffectiveKUID == c.RealKUID || + a.requester.RealKUID == c.SavedKUID || + a.requester.RealKUID == c.RealKUID { + t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) + } + a.mu.Unlock() +} + +// Register sets the file which will be monitored for IO events. +// +// The file must not be currently registered. +func (a *FileAsync) Register(w waiter.Waitable) { + a.regMu.Lock() + defer a.regMu.Unlock() + a.mu.Lock() + + if a.registered { + a.mu.Unlock() + panic("registering already registered file") + } + + if a.e.Callback == nil { + a.e.Callback = a + } + a.registered = true + + a.mu.Unlock() + w.EventRegister(&a.e, waiter.EventIn|waiter.EventOut|waiter.EventErr|waiter.EventHUp) +} + +// Unregister stops monitoring a file. +// +// The file must be currently registered. +func (a *FileAsync) Unregister(w waiter.Waitable) { + a.regMu.Lock() + defer a.regMu.Unlock() + a.mu.Lock() + + if !a.registered { + a.mu.Unlock() + panic("unregistering unregistered file") + } + + a.registered = false + + a.mu.Unlock() + w.EventUnregister(&a.e) +} + +// Owner returns who is currently getting signals. All return values will be +// nil if no one is set to receive signals. +func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + return a.recipientT, a.recipientTG, a.recipientPG +} + +// SetOwnerTask sets the owner (who will receive signals) to a specified task. +// Only this owner will receive signals. +func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = recipient + a.recipientTG = nil + a.recipientPG = nil +} + +// SetOwnerThreadGroup sets the owner (who will receive signals) to a specified +// thread group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = recipient + a.recipientPG = nil +} + +// SetOwnerProcessGroup sets the owner (who will receive signals) to a +// specified process group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = recipient +} + +// ClearOwner unsets the current signal recipient. +func (a *FileAsync) ClearOwner() { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = nil + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = nil +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go new file mode 100644 index 000000000..4b7d234a4 --- /dev/null +++ b/pkg/sentry/kernel/fd_table.go @@ -0,0 +1,638 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math" + "strings" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FDFlags define flags for an individual descriptor. +// +// +stateify savable +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + +// descriptor holds the details about a file descriptor, namely a pointer to +// the file itself and the descriptor flags. +// +// Note that this is immutable and can only be changed via operations on the +// descriptorTable. +// +// It contains both VFS1 and VFS2 file types, but only one of them can be set. +// +// +stateify savable +type descriptor struct { + // TODO(gvisor.dev/issue/1624): Remove fs.File. + file *fs.File + fileVFS2 *vfs.FileDescription + flags FDFlags +} + +// FDTable is used to manage File references and flags. +// +// +stateify savable +type FDTable struct { + refs.AtomicRefCount + k *Kernel + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // next is start position to find fd. + next int32 + + // used contains the number of non-nil entries. It must be accessed + // atomically. It may be read atomically without holding mu (but not + // written). + used int32 + + // descriptorTable holds descriptors. + descriptorTable `state:".(map[int32]descriptor)"` +} + +func (f *FDTable) saveDescriptorTable() map[int32]descriptor { + m := make(map[int32]descriptor) + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + m[fd] = descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, + } + }) + return m +} + +func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + f.init() // Initialize table. + for fd, d := range m { + f.setAll(fd, d.file, d.fileVFS2, d.flags) + + // Note that we do _not_ need to acquire a extra table reference here. The + // table reference will already be accounted for in the file, so we drop the + // reference taken by set above. + switch { + case d.file != nil: + d.file.DecRef() + case d.fileVFS2 != nil: + d.fileVFS2.DecRef() + } + } +} + +// drop drops the table reference. +func (f *FDTable) drop(file *fs.File) { + // Release locks. + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) + + // Send inotify events. + d := file.Dirent + var ev uint32 + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + if file.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + d.InotifyEvent(ev, 0) + + // Drop the table reference. + file.DecRef() +} + +// dropVFS2 drops the table reference. +func (f *FDTable) dropVFS2(file *vfs.FileDescription) { + // Release any POSIX lock possibly held by the FDTable. 
Range {0, 0} means the + // entire file. + err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } + + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if file.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + + // Drop the table's reference. + file.DecRef() +} + +// NewFDTable allocates a new FDTable that may be used by tasks in k. +func (k *Kernel) NewFDTable() *FDTable { + f := &FDTable{k: k} + f.init() + return f +} + +// destroy removes all of the file descriptors from the map. +func (f *FDTable) destroy() { + f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDTable) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDTable) Size() int { + size := atomic.LoadInt32(&f.used) + return int(size) +} + +// forEach iterates over all non-nil files in sorted order. +// +// It is the caller's responsibility to acquire an appropriate lock. +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { + // retries tracks the number of failed TryIncRef attempts for the same FD. + retries := 0 + fd := int32(0) + for { + file, fileVFS2, flags, ok := f.getAll(fd) + if !ok { + break + } + switch { + case file != nil: + if !file.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) + } + continue // Race caught. + } + fn(fd, file, nil, flags) + file.DecRef() + case fileVFS2 != nil: + if !fileVFS2.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) + } + continue // Race caught. + } + fn(fd, nil, fileVFS2, flags) + fileVFS2.DecRef() + } + retries = 0 + fd++ + } +} + +// String is a stringer for FDTable. +func (f *FDTable) String() string { + var buf strings.Builder + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + switch { + case file != nil: + n, _ := file.Dirent.FullName(nil /* root */) + fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n) + + case fileVFS2 != nil: + vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() + name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + if err != nil { + fmt.Fprintf(&buf, "<err: %v>\n", err) + return + } + fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name) + } + }) + return buf.String() +} + +// NewFDs allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. 
+ if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.get(i); d == nil { + f.set(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.set(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + + return fds, nil +} + +// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.getVFS2(i); d == nil { + f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.setVFS2(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + + return fds, nil +} + +// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// the given file description. If it succeeds, it takes a reference on file. +func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + if minfd < 0 { + // Don't accept negative FDs. + return -1, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if minfd >= end { + return -1, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + fd := minfd + if fd < f.next { + fd = f.next + } + for fd < end { + if d, _, _ := f.getVFS2(fd); d == nil { + f.setVFS2(fd, file, flags) + if fd == f.next { + // Update next search start position. + f.next = fd + 1 + } + return fd, nil + } + fd++ + } + return -1, syscall.EMFILE +} + +// NewFDAt sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. 
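// exampleAllocate is a hypothetical sketch (not part of this change) of the
// allocation contract implemented by NewFDs/NewFDVFS2 above: the new FD is
// the lowest free slot at or above the requested minimum, and exceeding the
// RLIMIT_NOFILE-derived bound surfaces as EMFILE.
func exampleAllocate(ctx context.Context, t *FDTable, file *fs.File) (int32, error) {
	fds, err := t.NewFDs(ctx, 0 /* lowest acceptable fd */, []*fs.File{file}, FDFlags{CloseOnExec: true})
	if err != nil {
		return -1, err // Typically syscall.EMFILE when the table is full.
	}
	return fds[0], nil
}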
+func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + return f.newFDAt(ctx, fd, file, nil, flags) +} + +// NewFDAtVFS2 sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { + return f.newFDAt(ctx, fd, nil, file, flags) +} + +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + // Check the limit for the provided file. + if limitSet := limits.FromContext(ctx); limitSet != nil { + if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { + return syscall.EMFILE + } + } + + // Install the entry. + f.mu.Lock() + defer f.mu.Unlock() + f.setAll(fd, file, fileVFS2, flags) + return nil +} + +// SetFlags sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.get(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.set(fd, file, flags) + return nil +} + +// SetFlagsVFS2 sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.getVFS2(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.setVFS2(fd, file, flags) + return nil +} + +// Get returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.get(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetVFS2 returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.getVFS2(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. +func (f *FDTable) GetFDs() []int32 { + fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) + f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + fds = append(fds, fd) + }) + return fds +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. 
The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefs() []*fs.File { + files := make([]*fs.File, 0, f.Size()) + f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// GetRefsVFS2 returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, f.Size()) + f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// Fork returns an independent FDTable. +func (f *FDTable) Fork() *FDTable { + clone := f.k.NewFDTable() + + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + // The set function here will acquire an appropriate table + // reference for the clone. We don't need anything else. + switch { + case file != nil: + clone.set(fd, file, flags) + case fileVFS2 != nil: + clone.setVFS2(fd, fileVFS2, flags) + } + }) + return clone +} + +// Remove removes an FD from and returns a non-file iff successful. +// +// N.B. Callers are required to use DecRef when they are done. +func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { + if fd < 0 { + return nil, nil + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Update current available position. + if fd < f.next { + f.next = fd + } + + orig, orig2, _, _ := f.getAll(fd) + + // Add reference for caller. + switch { + case orig != nil: + orig.IncRef() + case orig2 != nil: + orig2.IncRef() + } + if orig != nil || orig2 != nil { + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + } + return orig, orig2 +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { + f.mu.Lock() + defer f.mu.Unlock() + + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if cond(file, fileVFS2, flags) { + f.set(fd, nil, FDFlags{}) // Clear from table. + // Update current available position. + if fd < f.next { + f.next = fd + } + } + }) +} diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go new file mode 100644 index 000000000..29f95a2c4 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_test.go @@ -0,0 +1,228 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" +) + +const ( + // maxFD is the maximum FD to try to create in the map. 
+ // + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) { + t.Helper() // Don't show in stacks. + + // Create the limits and context. + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + + // Create a test file.; + file := filetest.NewTestFile(t) + + // Create the table. + fdTable := new(FDTable) + fdTable.init() + + // Run the test. + fn(ctx, fdTable, file, limitSet) +} + +// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there +// is no room, then makes sure that NewFDAt works and also that if we remove +// one and add one that works too. +func TestFDTableMany(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + for i := 0; i < maxFD; i++ { + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error") + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + i := int32(2) + fdTable.Remove(i) + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i { + t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err) + } + }) +} + +func TestFDTableOverLimit(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if _, err := fdTable.NewFDs(ctx, maxFD, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD, f): got nil, wanted error") + } + + if _, err := fdTable.NewFDs(ctx, maxFD-2, []*fs.File{file, file, file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD-2, {f,f,f}): got nil, wanted error") + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-3, []*fs.File{file, file, file}, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err) + } else { + for _, fd := range fds { + fdTable.Remove(fd) + } + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-1, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != maxFD-1 { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 0 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + }) +} + +// TestFDTable does a set of simple tests to make sure simple adds, removes, +// GetRefs, and DecRefs work. The ordering is just weird enough that a +// table-driven approach seemed clumsy. +func TestFDTable(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) { + // Cap the limit at one. 
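		// limits.Limit{1, maxFD} sets Cur to 1 and Max to maxFD, so only a
		// single FD can be installed until the limit is raised again below.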
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true) + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + // Remove the previous limit. + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 1 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil { + t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref, _ := fdTable.Get(1); ref == nil { + t.Fatalf("fdTable.Get(1): got nil, wanted %v", file) + } + + if ref, _ := fdTable.Get(2); ref != nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) + } + + ref, _ := fdTable.Remove(1) + if ref == nil { + t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref, _ := fdTable.Remove(1); ref != nil { + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + }) +} + +func TestDescriptorFlags(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil { + t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := fdTable.Get(2) + if newFile == nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %v don't match original %d\n", flags, 0) + } + }) +} + +func BenchmarkFDLookupAndDecRef(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + b.StartTimer() // Benchmark. + for i := 0; i < b.N; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }) +} + +func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + concurrency := runtime.GOMAXPROCS(0) + if concurrency < 4 { + concurrency = 4 + } + each := b.N / concurrency + + b.StartTimer() // Benchmark. 
+ var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < each; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }() + } + wg.Wait() + }) +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go new file mode 100644 index 000000000..7fd97dc53 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -0,0 +1,169 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type descriptorTable struct { + // slice is a *[]unsafe.Pointer, where each element is actually + // *descriptor object, updated atomically. + // + // Changes to the slice itself requiring holding FDTable.mu. + slice unsafe.Pointer `state:".(map[int32]*descriptor)"` +} + +// init initializes the table. +func (f *FDTable) init() { + var slice []unsafe.Pointer // Empty slice. + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) +} + +// get gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + file, _, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getVFS2 gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) { + _, file, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getAll gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + if fd >= int32(len(slice)) { + return nil, nil, FDFlags{}, false + } + d := (*descriptor)(atomic.LoadPointer(&slice[fd])) + if d == nil { + return nil, nil, FDFlags{}, true + } + if d.file != nil && d.fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + return d.file, d.fileVFS2, d.flags, true +} + +// set sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + f.setAll(fd, file, nil, flags) +} + +// setVFS2 sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { + f.setAll(fd, nil, file, flags) +} + +// setAll sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. 
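// The pattern used by descriptorTable -- lock-free reads via atomic loads of
// both the slice header and each element, writers serialized by a mutex, and
// growth done by publishing a fresh backing array -- is illustrated by the
// simplified, hypothetical sketch below. It mirrors the shape of the real
// code but is not part of it, and it additionally assumes a "sync" import.

type sketchEntry struct{ name string }

type sketchTable struct {
	mu    sync.Mutex     // Serializes writers and slice growth.
	slice unsafe.Pointer // *[]unsafe.Pointer; each element is a *sketchEntry.
}

func newSketchTable() *sketchTable {
	var s []unsafe.Pointer // Empty slice.
	return &sketchTable{slice: unsafe.Pointer(&s)}
}

// get is lock-free: it performs only atomic loads.
func (t *sketchTable) get(i int) *sketchEntry {
	s := *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
	if i < 0 || i >= len(s) {
		return nil
	}
	return (*sketchEntry)(atomic.LoadPointer(&s[i]))
}

// set requires t.mu. Growth copies into a larger array and atomically
// publishes the new header, so concurrent readers always see a consistent
// (possibly momentarily stale) snapshot.
func (t *sketchTable) set(i int, e *sketchEntry) {
	t.mu.Lock()
	defer t.mu.Unlock()
	s := *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
	if i >= len(s) {
		end := i + 1
		if end < 2*len(s) {
			end = 2 * len(s)
		}
		grown := make([]unsafe.Pointer, end)
		copy(grown, s)
		s = grown
		atomic.StorePointer(&t.slice, unsafe.Pointer(&s))
	}
	atomic.StorePointer(&s[i], unsafe.Pointer(e))
}

// Example: t := newSketchTable(); t.set(3, &sketchEntry{name: "stdin"});
// t.get(3) returns the entry, t.get(7) returns nil.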
+func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if file != nil && fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + + // Grow the table as required. + if last := int32(len(slice)); fd >= last { + end := fd + 1 + if end < 2*last { + end = 2 * last + } + slice = append(slice, make([]unsafe.Pointer, end-last)...) + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + } + + var desc *descriptor + if file != nil || fileVFS2 != nil { + desc = &descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, + } + } + + // Update the single element. + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc))) + + // Acquire a table reference. + if desc != nil { + switch { + case desc.file != nil: + if orig == nil || desc.file != orig.file { + desc.file.IncRef() + } + case desc.fileVFS2 != nil: + if orig == nil || desc.fileVFS2 != orig.fileVFS2 { + desc.fileVFS2.IncRef() + } + } + } + + // Drop the table reference. + if orig != nil { + switch { + case orig.file != nil: + if desc == nil || desc.file != orig.file { + f.drop(orig.file) + } + case orig.fileVFS2 != nil: + if desc == nil || desc.fileVFS2 != orig.fileVFS2 { + f.dropVFS2(orig.fileVFS2) + } + } + } + + // Adjust used. + switch { + case orig == nil && desc != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && desc == nil: + atomic.AddInt32(&f.used, -1) + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go new file mode 100644 index 000000000..47f78df9a --- /dev/null +++ b/pkg/sentry/kernel/fs_context.go @@ -0,0 +1,283 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// FSContext contains filesystem context. +// +// This includes umask and working directory. +// +// +stateify savable +type FSContext struct { + refs.AtomicRefCount + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // root is the filesystem root. Will be nil iff the FSContext has been + // destroyed. + root *fs.Dirent + + // rootVFS2 is the filesystem root. + rootVFS2 vfs.VirtualDentry + + // cwd is the current working directory. Will be nil iff the FSContext + // has been destroyed. + cwd *fs.Dirent + + // cwdVFS2 is the current working directory. + cwdVFS2 vfs.VirtualDentry + + // umask is the current file mode creation mask. When a thread using this + // context invokes a syscall that creates a file, bits set in umask are + // removed from the permissions that the file is created with. + umask uint +} + +// newFSContext returns a new filesystem context. 
+func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + root: root, + cwd: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + +// NewFSContextVFS2 returns a new filesystem context. +func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + rootVFS2: root, + cwdVFS2: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + +// destroy is the destructor for an FSContext. +// +// This will call DecRef on both root and cwd Dirents. If either call to +// DecRef returns an error, then it will be propagated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propagated. +// +// Note that there may still be calls to WorkingDirectory() or RootDirectory() +// (that return nil). This is because valid references may still be held via +// proc files or other mechanisms. +func (f *FSContext) destroy() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef() + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef() + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef() + f.root = nil + f.cwd.DecRef() + f.cwd = nil + } +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FSContext) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Fork forks this FSContext. +// +// This is not a valid call after destroy. +func (f *FSContext) Fork() *FSContext { + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.cwdVFS2.IncRef() + f.rootVFS2.IncRef() + } else { + f.cwd.IncRef() + f.root.IncRef() + } + + return &FSContext{ + cwd: f.cwd, + root: f.root, + cwdVFS2: f.cwdVFS2, + rootVFS2: f.rootVFS2, + umask: f.umask, + } +} + +// WorkingDirectory returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwd.IncRef() + return f.cwd +} + +// WorkingDirectoryVFS2 returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwdVFS2.IncRef() + return f.cwdVFS2 +} + +// SetWorkingDirectory sets the current working directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetWorkingDirectory called with nil dirent") + } + + f.mu.Lock() + defer f.mu.Unlock() + + if f.cwd == nil { + panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) + } + + old := f.cwd + f.cwd = d + d.IncRef() + old.DecRef() +} + +// SetWorkingDirectoryVFS2 sets the current working directory. +// This will take an extra reference on the VirtualDentry. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { + f.mu.Lock() + defer f.mu.Unlock() + + old := f.cwdVFS2 + f.cwdVFS2 = d + d.IncRef() + old.DecRef() +} + +// RootDirectory returns the current filesystem root. 
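// A small caller-side sketch of the reference discipline (hasSameRootAndCwd
// is a hypothetical helper; it assumes a live, undestroyed FSContext and uses
// RootDirectory, whose definition follows below).
func hasSameRootAndCwd(f *FSContext) bool {
	root := f.RootDirectory()
	defer root.DecRef()
	cwd := f.WorkingDirectory()
	defer cwd.DecRef()
	return root == cwd
}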
+// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.root != nil { + f.root.IncRef() + } + return f.root +} + +// RootDirectoryVFS2 returns the current filesystem root. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.rootVFS2.IncRef() + return f.rootVFS2 +} + +// SetRootDirectory sets the root directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetRootDirectory called with nil dirent") + } + + f.mu.Lock() + defer f.mu.Unlock() + + if f.root == nil { + panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) + } + + old := f.root + f.root = d + d.IncRef() + old.DecRef() +} + +// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { + if !vd.Ok() { + panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") + } + + f.mu.Lock() + + if !f.rootVFS2.Ok() { + f.mu.Unlock() + panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd)) + } + + old := f.rootVFS2 + vd.IncRef() + f.rootVFS2 = vd + f.mu.Unlock() + old.DecRef() +} + +// Umask returns the current umask. +func (f *FSContext) Umask() uint { + f.mu.Lock() + defer f.mu.Unlock() + return f.umask +} + +// SwapUmask atomically sets the current umask and returns the old umask. +func (f *FSContext) SwapUmask(mask uint) uint { + f.mu.Lock() + defer f.mu.Unlock() + old := f.umask + f.umask = mask + return old +} diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD new file mode 100644 index 000000000..c5021f2db --- /dev/null +++ b/pkg/sentry/kernel/futex/BUILD @@ -0,0 +1,57 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_bucket", + out = "atomicptr_bucket_unsafe.go", + package = "futex", + suffix = "Bucket", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "bucket", + }, +) + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "futex", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Waiter", + "Linker": "*Waiter", + }, +) + +go_library( + name = "futex", + srcs = [ + "atomicptr_bucket_unsafe.go", + "futex.go", + "waiter_list.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/sentry/memmap", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "futex_test", + size = "small", + srcs = ["futex_test.go"], + library = ":futex", + deps = [ + "//pkg/sync", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go new file mode 100644 index 000000000..732e66da4 --- /dev/null +++ b/pkg/sentry/kernel/futex/futex.go @@ -0,0 +1,795 @@ +// Copyright 2018 The gVisor Authors. 
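// The stored umask is applied by file-creating call sites; conceptually it
// works as in the hypothetical helper below (a sketch, not an existing call
// site in this package).
func effectiveMode(f *FSContext, requested uint) uint {
	// Bits set in the umask are removed from the requested creation mode,
	// e.g. 0666 &^ 0022 == 0644.
	return requested &^ f.Umask()
}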
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package futex provides an implementation of the futex interface as found in +// the Linux kernel. It allows one to easily transform Wait() calls into waits +// on a channel, which is useful in a Go-based kernel, for example. +package futex + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// KeyKind indicates the type of a Key. +type KeyKind int + +const ( + // KindPrivate indicates a private futex (a futex syscall with the + // FUTEX_PRIVATE_FLAG set). + KindPrivate KeyKind = iota + + // KindSharedPrivate indicates a shared futex on a private memory mapping. + // Although KindPrivate and KindSharedPrivate futexes both use memory + // addresses to identify futexes, they do not interoperate (in Linux, the + // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key + // comparison). + KindSharedPrivate + + // KindSharedMappable indicates a shared futex on a memory mapping other + // than a private anonymous memory mapping. + KindSharedMappable +) + +// Key represents something that a futex waiter may wait on. +type Key struct { + // Kind is the type of the Key. + Kind KeyKind + + // Mappable is the memory-mapped object that is represented by the Key. + // Mappable is always nil if Kind is not KindSharedMappable, and may be nil + // even if it is. + Mappable memmap.Mappable + + // MappingIdentity is the MappingIdentity associated with Mappable. + // MappingIdentity is always nil is Mappable is nil, and may be nil even if + // it isn't. + MappingIdentity memmap.MappingIdentity + + // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented + // memory address. Otherwise, Offset is the represented offset into + // Mappable. + Offset uint64 +} + +func (k *Key) release() { + if k.MappingIdentity != nil { + k.MappingIdentity.DecRef() + } + k.Mappable = nil + k.MappingIdentity = nil +} + +func (k *Key) clone() Key { + if k.MappingIdentity != nil { + k.MappingIdentity.IncRef() + } + return *k +} + +// Preconditions: k.Kind == KindPrivate or KindSharedPrivate. +func (k *Key) addr() usermem.Addr { + return usermem.Addr(k.Offset) +} + +// matches returns true if a wakeup on k2 should wake a waiter waiting on k. +func (k *Key) matches(k2 *Key) bool { + // k.MappingIdentity is ignored; it's only used for reference counting. + return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset +} + +// Target abstracts memory accesses and keys. +type Target interface { + // SwapUint32 gives access to usermem.IO.SwapUint32. + SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + + // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. + CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + + // LoadUint32 gives access to usermem.IO.LoadUint32. 
+ LoadUint32(addr usermem.Addr) (uint32, error) + + // GetSharedKey returns a Key with kind KindSharedPrivate or + // KindSharedMappable corresponding to the memory mapped at address addr. + // + // If GetSharedKey returns a Key with a non-nil MappingIdentity, a + // reference is held on the MappingIdentity, which must be dropped by the + // caller when the Key is no longer in use. + GetSharedKey(addr usermem.Addr) (Key, error) +} + +// check performs a basic equality check on the given address. +func check(t Target, addr usermem.Addr, val uint32) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + if cur != val { + return syserror.EAGAIN + } + return nil +} + +// atomicOp performs a complex operation on the given address. +func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { + opType := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag. + } + + var ( + oldVal uint32 + err error + ) + if opType == linux.FUTEX_OP_SET { + oldVal, err = t.SwapUint32(addr, opArg) + if err != nil { + return false, err + } + } else { + for { + oldVal, err = t.LoadUint32(addr) + if err != nil { + return false, err + } + var newVal uint32 + switch opType { + case linux.FUTEX_OP_ADD: + newVal = oldVal + opArg + case linux.FUTEX_OP_OR: + newVal = oldVal | opArg + case linux.FUTEX_OP_ANDN: + newVal = oldVal &^ opArg + case linux.FUTEX_OP_XOR: + newVal = oldVal ^ opArg + default: + return false, syserror.ENOSYS + } + prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) + if err != nil { + return false, err + } + if prev == oldVal { + break // Success. + } + } + } + + switch cmp { + case linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } +} + +// Waiter is the struct which gets enqueued into buckets for wake up routines +// and requeue routines to scan and notify. Once a Waiter has been enqueued by +// WaitPrepare(), callers may listen on C for wake up events. +type Waiter struct { + // Synchronization: + // + // - A Waiter that is not enqueued in a bucket is exclusively owned (no + // synchronization applies). + // + // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, + // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. Note that + // since bucket is mutated using atomic memory operations, bucket.Load() + // may be called without holding the bucket lock, although it may change + // racily. See WaitComplete(). + // + // - A Waiter is only guaranteed to be no longer queued after calling + // WaitComplete(). + + // waiterEntry links Waiter into bucket.waiters. + waiterEntry + + // bucket is the bucket this waiter is queued in. If bucket is nil, the + // waiter is not waiting and is not in any bucket. + bucket AtomicPtrBucket + + // C is sent to when the Waiter is woken. + C chan struct{} + + // key is what this waiter is waiting on. + key Key + + // The bitmask we're waiting on. + // This is used the case of a FUTEX_WAKE_BITSET. 
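// The 32-bit op word decoded by atomicOp packs four fields: operation type in
// bits 28-31, comparison in bits 24-27, operation argument in bits 12-23, and
// comparison argument in bits 0-11. A hypothetical encoder mirroring that
// layout (encodeFutexOp is not an existing helper in this package):
func encodeFutexOp(opType, opArg, cmp, cmpArg uint32) uint32 {
	return (opType&0xf)<<28 | (cmp&0xf)<<24 | (opArg&0xfff)<<12 | (cmpArg & 0xfff)
}

// For example, encodeFutexOp(linux.FUTEX_OP_ADD, 1, linux.FUTEX_OP_CMP_EQ, 0)
// encodes "add 1 to the value at the second address", with the wake condition
// "old value == 0".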
+ bitmask uint32 + + // tid is the thread ID for the waiter in case this is a PI mutex. + tid uint32 +} + +// NewWaiter returns a new unqueued Waiter. +func NewWaiter() *Waiter { + return &Waiter{ + C: make(chan struct{}, 1), + } +} + +// woken returns true if w has been woken since the last call to WaitPrepare. +func (w *Waiter) woken() bool { + return len(w.C) != 0 +} + +// bucket holds a list of waiters for a given address hash. +// +// +stateify savable +type bucket struct { + // mu protects waiters and contained Waiter state. See comment in Waiter. + mu sync.Mutex `state:"nosave"` + + waiters waiterList `state:"zerovalue"` +} + +// wakeLocked wakes up to n waiters matching the bitmask at the addr for this +// bucket and returns the number of waiters woken. +// +// Preconditions: b.mu must be locked. +func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if !w.key.matches(key) || w.bitmask&bitmask == 0 { + // Not matching. + w = w.Next() + continue + } + + // Remove from the bucket and wake the waiter. + woke := w + w = w.Next() // Next iteration. + b.wakeWaiterLocked(woke) + done++ + } + return done +} + +func (b *bucket) wakeWaiterLocked(w *Waiter) { + // Remove from the bucket and wake the waiter. + b.waiters.Remove(w) + w.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier according + // to the memory model, so nothing may be ordered around it. Since + // we've dequeued w and will never touch it again, we can safely + // store nil to w.bucket here and allow the WaitComplete() to + // short-circuit grabbing the bucket lock. If they somehow miss the + // store, we are still holding the lock, so we can know that they won't + // dequeue w, assume it's free and have the below operation + // afterwards. + w.bucket.Store(nil) +} + +// requeueLocked takes n waiters from the bucket and moves them to naddr on the +// bucket "to". +// +// Preconditions: b and to must be locked. +func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if !w.key.matches(key) { + // Not matching. + w = w.Next() + continue + } + + requeued := w + w = w.Next() // Next iteration. + b.waiters.Remove(requeued) + requeued.key.release() + requeued.key = nkey.clone() + to.waiters.PushBack(requeued) + requeued.bucket.Store(to) + done++ + } + return done +} + +const ( + // bucketCount is the number of buckets per Manager. By having many of + // these we reduce contention when concurrent yet unrelated calls are made. + bucketCount = 1 << bucketCountBits + bucketCountBits = 10 +) + +// getKey returns a Key representing address addr in c. +func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { + // Ensure the address is aligned. + // It must be a DWORD boundary. + if addr&0x3 != 0 { + return Key{}, syserror.EINVAL + } + if private { + return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil + } + return t.GetSharedKey(addr) +} + +// bucketIndexForAddr returns the index into Manager.buckets for addr. +func bucketIndexForAddr(addr usermem.Addr) uintptr { + // - The bottom 2 bits of addr must be 0, per getKey. + // + // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 + // for a canonical address, and (on all existing platforms) bit 47 must be + // 0 for an application address. + // + // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" + // bits. 
We choose one of the simplest possible hash functions that at + // least uses all 45 useful bits in the output, given that bucketCountBits + // == 10. This hash function also has the property that it will usually map + // adjacent addresses to adjacent buckets, slightly improving memory + // locality when an application synchronization structure uses multiple + // nearby futexes. + // + // Note that despite the large number of arithmetic operations in the + // function, many components can be computed in parallel, such that the + // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This + // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + + // (addr >> 42)" without any additional grouping, the compiler puts all 4 + // additions in the critical path. + h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22) + h2 := uintptr(addr>>32) + uintptr(addr>>42) + return (h1 + h2) % bucketCount +} + +// Manager holds futex state for a single virtual address space. +// +// +stateify savable +type Manager struct { + // privateBuckets holds buckets for KindPrivate and KindSharedPrivate + // futexes. + privateBuckets [bucketCount]bucket `state:"zerovalue"` + + // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket + // may be shared by multiple Managers. The sharedBucket pointer is + // immutable. + sharedBucket *bucket +} + +// NewManager returns an initialized futex manager. +func NewManager() *Manager { + return &Manager{ + sharedBucket: &bucket{}, + } +} + +// Fork returns a new Manager. Shared futex clients using the returned Manager +// may interoperate with those using m. +func (m *Manager) Fork() *Manager { + return &Manager{ + sharedBucket: m.sharedBucket, + } +} + +// lockBucket returns a locked bucket for the given key. +func (m *Manager) lockBucket(k *Key) *bucket { + var b *bucket + if k.Kind == KindSharedMappable { + b = m.sharedBucket + } else { + b = &m.privateBuckets[bucketIndexForAddr(k.addr())] + } + b.mu.Lock() + return b +} + +// lockBuckets returns locked buckets for the given keys. +func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { + // Buckets must be consistently ordered to avoid circular lock + // dependencies. We order buckets in m.privateBuckets by index (lowest + // index first), and all buckets in m.privateBuckets precede + // m.sharedBucket. + + // Handle the common case first: + if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { + i1 := bucketIndexForAddr(k1.addr()) + i2 := bucketIndexForAddr(k2.addr()) + b1 := &m.privateBuckets[i1] + b2 := &m.privateBuckets[i2] + switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + return b1, b2 + } + + // At least one of b1 or b2 should be m.sharedBucket. + b1 := m.sharedBucket + b2 := m.sharedBucket + if k1.Kind != KindSharedMappable { + b1 = m.lockBucket(k1) + } else if k2.Kind != KindSharedMappable { + b2 = m.lockBucket(k2) + } + m.sharedBucket.mu.Lock() + return b1, b2 +} + +// Wake wakes up to n waiters matching the bitmask on the given addr. +// The number of waiters woken is returned. +func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) { + // This function is very hot; avoid defer. 
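// A quick illustration of the locality property described above (a sketch
// assuming the in-package context plus an "fmt" import; the addresses are
// arbitrary examples, and exampleBucketLocality is not an existing function):
func exampleBucketLocality() {
	for _, addr := range []usermem.Addr{0x1000, 0x1004, 0x1008} {
		// Adjacent 4-byte-aligned addresses hash to adjacent buckets:
		// this prints 1, 2, 3.
		fmt.Println(bucketIndexForAddr(addr))
	}
}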
+ k, err := getKey(t, addr, private) + if err != nil { + return 0, err + } + + b := m.lockBucket(&k) + r := b.wakeLocked(&k, bitmask, n) + + b.mu.Unlock() + k.release() + return r, nil +} + +func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { + k1, err := getKey(t, addr, private) + if err != nil { + return 0, err + } + defer k1.release() + k2, err := getKey(t, naddr, private) + if err != nil { + return 0, err + } + defer k2.release() + + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + if checkval { + if err := check(t, addr, val); err != nil { + return 0, err + } + } + + // Wake the number required. + done := b1.wakeLocked(&k1, ^uint32(0), nwake) + + // Requeue the number required. + b1.requeueLocked(b2, &k1, &k2, nreq) + + return done, nil +} + +// Requeue wakes up to nwake waiters on the given addr, and unconditionally +// requeues up to nreq waiters on naddr. +func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) +} + +// RequeueCmp atomically checks that the addr contains val (via the Target), +// wakes up to nwake waiters on addr and then unconditionally requeues nreq +// waiters on naddr. +func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) +} + +// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 +// waiters unconditionally from addr1, and, based on the original value at addr2 +// and a comparison encoded in op, wakes up to nwake2 waiters from addr2. +// It returns the total number of waiters woken. +func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { + k1, err := getKey(t, addr1, private) + if err != nil { + return 0, err + } + defer k1.release() + k2, err := getKey(t, addr2, private) + if err != nil { + return 0, err + } + defer k2.release() + + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + done := 0 + cond, err := atomicOp(t, addr2, op) + if err != nil { + return 0, err + } + + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(&k1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. + if cond { + done += b2.wakeLocked(&k2, ^uint32(0), nwake2) + } + + return done, nil +} + +// WaitPrepare atomically checks that addr contains val (via the Checker), then +// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the +// Waiter must be subsequently removed by calling WaitComplete, whether or not +// a wakeup is received on w.C. +func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.bitmask = bitmask + + b := m.lockBucket(&k) + // This function is very hot; avoid defer. + + // Perform our atomic check. 
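// Putting the wake-side pieces together (a sketch: wakeOpExample is not an
// existing function, and encodeFutexOp is the hypothetical encoder sketched
// earlier): atomically add 1 at addr2, wake one waiter at addr1
// unconditionally, and wake one waiter at addr2 only if the old value at
// addr2 was 0.
func wakeOpExample(m *Manager, t Target, addr1, addr2 usermem.Addr) (int, error) {
	op := encodeFutexOp(linux.FUTEX_OP_ADD, 1, linux.FUTEX_OP_CMP_EQ, 0)
	return m.WakeOp(t, addr1, addr2, true /* private */, 1, 1, op)
}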
+ if err := check(t, addr, val); err != nil { + b.mu.Unlock() + w.key.release() + return err + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + + b.mu.Unlock() + return nil +} + +// WaitComplete must be called when a Waiter previously added by WaitPrepare is +// no longer eligible to be woken. +func (m *Manager) WaitComplete(w *Waiter) { + // Remove w from the bucket it's in. + for { + b := w.bucket.Load() + + // If b is nil, the waiter isn't in any bucket anymore. This can't be + // racy because the waiter can't be concurrently re-queued in another + // bucket. + if b == nil { + break + } + + // Take the bucket lock. Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take + // the bucket lock, we must ensure that the bucket hasn't changed: if + // it happens to have changed, we release the old bucket lock and try + // again with the new bucket; if it hasn't changed, we know it won't + // change now because we hold the lock. + b.mu.Lock() + if b != w.bucket.Load() { + b.mu.Unlock() + continue + } + + // Remove waiter from bucket. + b.waiters.Remove(w) + w.bucket.Store(nil) + b.mu.Unlock() + break + } + + // Release references held by the waiter. + w.key.release() +} + +// LockPI attempts to lock the futex following the Priority-inheritance futex +// rules. The lock is acquired only when 'addr' points to 0. The TID of the +// calling task is set to 'addr' to indicate the futex is owned. It returns true +// if the futex was successfully acquired. +// +// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see +// exit_robust_list()). Given we don't support robust lists, although handled +// below, it's never set. +func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { + k, err := getKey(t, addr, private) + if err != nil { + return false, err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.tid = tid + + b := m.lockBucket(&k) + // Hot function: avoid defers. + + success, err := m.lockPILocked(w, t, addr, tid, b, try) + if err != nil { + w.key.release() + b.mu.Unlock() + return false, err + } + if success || try { + // Release waiter if it's not going to be a wait. + w.key.release() + } + b.mu.Unlock() + return success, nil +} + +func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { + for { + cur, err := t.LoadUint32(addr) + if err != nil { + return false, err + } + if (cur & linux.FUTEX_TID_MASK) == tid { + return false, syserror.EDEADLK + } + + if (cur & linux.FUTEX_TID_MASK) == 0 { + // No owner and no waiters, try to acquire the futex. + + // Set TID and preserve owner died status. + val := tid + val |= cur & linux.FUTEX_OWNER_DIED + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + // Linux reacquires the bucket lock on retries, which will re-lookup the + // mapping at the futex address. However, retrying while holding the + // lock is more efficient and reduces the chance of another conflict. + continue + } + // Futex acquired. + return true, nil + } + + // Futex is already owned, prepare to wait. + + if try { + // Caller doesn't want to wait. + return false, nil + } + + // Set waiters bit if not set yet. 
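// The waiter-side protocol around WaitPrepare/WaitComplete looks roughly like
// the hypothetical wrapper below; real callers typically retry on EAGAIN and
// select on a timeout or interrupt alongside w.C.
func waitOnFutex(m *Manager, t Target, addr usermem.Addr, val uint32) error {
	w := NewWaiter()
	if err := m.WaitPrepare(w, t, addr, true /* private */, val, ^uint32(0)); err != nil {
		return err // syserror.EAGAIN means *addr no longer contained val.
	}
	<-w.C             // Block until a matching Wake.
	m.WaitComplete(w) // Always dequeue and drop key references.
	return nil
}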
+ if cur&linux.FUTEX_WAITERS == 0 { + prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + continue + } + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + return false, nil + } +} + +// UnlockPI unlock the futex following the Priority-inheritance futex +// rules. The address provided must contain the caller's TID. If there are +// waiters, TID of the next waiter (FIFO) is set to the given address, and the +// waiter woken up. If there are no waiters, 0 is set to the address. +func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + b := m.lockBucket(&k) + + err = m.unlockPILocked(t, addr, tid, b, &k) + + k.release() + b.mu.Unlock() + return err +} + +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + + if (cur & linux.FUTEX_TID_MASK) != tid { + return syserror.EPERM + } + + var next *Waiter // Who's the next owner? + var next2 *Waiter // Who's the one after that? + for w := b.waiters.Front(); w != nil; w = w.Next() { + if !w.key.matches(key) { + continue + } + + if next == nil { + next = w + } else { + next2 = w + break + } + } + + if next == nil { + // It's safe to set 0 because there are no waiters, no new owner, and the + // executing task is the current owner (no owner died bit). + prev, err := t.CompareAndSwapUint32(addr, cur, 0) + if err != nil { + return err + } + if prev != cur { + // Let user mode handle CAS races. This is different than lock, which + // retries when CAS fails. + return syserror.EAGAIN + } + return nil + } + + // Set next owner's TID, waiters if there are any. Resets owner died bit, if + // set, because the executing task takes over as the owner. + val := next.tid + if next2 != nil { + val |= linux.FUTEX_WAITERS + } + + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return err + } + if prev != cur { + return syserror.EINVAL + } + + b.wakeWaiterLocked(next) + return nil +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go new file mode 100644 index 000000000..7c5c7665b --- /dev/null +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -0,0 +1,530 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package futex + +import ( + "math" + "runtime" + "sync/atomic" + "syscall" + "testing" + "unsafe" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// testData implements the Target interface, and allows us to +// treat the address passed for futex operations as an index in +// a byte slice for testing simplicity. 
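// For the PI variants, a minimal trylock/unlock pair might look like the
// hypothetical wrappers below (a sketch; a full lock path would pass
// try=false and block on w.C when LockPI returns false).
func tryLockPI(m *Manager, t Target, addr usermem.Addr, tid uint32) (bool, error) {
	w := NewWaiter()
	// try=true: fail immediately instead of queueing as a waiter.
	return m.LockPI(w, t, addr, tid, true /* private */, true /* try */)
}

func unlockPI(m *Manager, t Target, addr usermem.Addr, tid uint32) error {
	// addr must currently contain tid, i.e. the caller must own the futex.
	return m.UnlockPI(t, addr, tid, true /* private */)
}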
+type testData []byte + +const sizeofInt32 = 4 + +func newTestData(size uint) testData { + return make([]byte, size) +} + +func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + return val, nil +} + +func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + return old, nil + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil +} + +func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil +} + +func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { + return Key{ + Kind: KindSharedMappable, + Offset: uint64(addr), + }, nil +} + +func futexKind(private bool) string { + if private { + return "private" + } + return "shared" +} + +func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) *Waiter { + w := NewWaiter() + if err := m.WaitPrepare(w, ta, addr, private, val, bitmask); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + return w +} + +func TestFutexWake(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start waiting for wakeup. + w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w) + + // Perform a wakeup. + if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { + t.Errorf("Wake: got (%d, %v), wanted (1, nil)", n, err) + } + + // Expect the waiter to have been woken. + if !w.woken() { + t.Error("waiter not woken") + } + }) + } +} + +func TestFutexWakeBitmask(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start waiting for wakeup. + w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) + defer m.WaitComplete(w) + + // Perform a wakeup using the wrong bitmask. + if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { + t.Errorf("Wake with non-matching bitmask: got (%d, %v), wanted (0, nil)", n, err) + } + + // Expect the waiter to still be waiting. + if w.woken() { + t.Error("waiter woken unexpectedly") + } + + // Perform a wakeup using the right bitmask. + if n, err := m.Wake(d, 0, private, 0x00000001, 1); err != nil || n != 1 { + t.Errorf("Wake with matching bitmask: got (%d, %v), wanted (1, nil)", n, err) + } + + // Expect that the waiter was woken. + if !w.woken() { + t.Error("waiter not woken") + } + }) + } +} + +func TestFutexWakeTwo(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start three waiters waiting for wakeup. + var ws [3]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } + + // Perform two wakeups. + const wakeups = 2 + if n, err := m.Wake(d, 0, private, ^uint32(0), 2); err != nil || n != wakeups { + t.Errorf("Wake: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } + + // Expect that exactly two waiters were woken. + // We don't get guarantees about exactly which two, + // (although we expect them to be w1 and w2). 
+ awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) + } +} + +func TestFutexWakeUnrelated(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(2 * sizeofInt32) + + // Start two waiters waiting for wakeup on different addresses. + w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform two wakeups on the second address. + if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { + t.Errorf("Wake: got (%d, %v), wanted (1, nil)", n, err) + } + + // Expect that only the second waiter was woken. + if w1.woken() { + t.Error("w1 woken unexpectedly") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) + } +} + +func TestWakeOpEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(2 * sizeofInt32) + + // Perform wakeups with no waiters. + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 0); err != nil || n != 0 { + t.Fatalf("WakeOp: got (%d, %v), wanted (0, nil)", n, err) + } + }) + } +} + +func TestWakeOpFirstNonEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address 0. + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } + + // Expect that both waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) + } +} + +func TestWakeOpSecondNonEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address sizeofInt32. + w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address sizeofInt32 (contingent on + // d.Op(0), which should succeed). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 0, 10, 0); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } + + // Expect that both waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) + } +} + +func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address sizeofInt32. 
+ w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address sizeofInt32 (contingent on + // d.Op(1), which should fail). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 0, 10, 1); err != nil || n != 0 { + t.Errorf("WakeOp: got (%d, %v), wanted (0, nil)", n, err) + } + + // Expect that neither waiter was woken. + if w1.woken() { + t.Error("w1 woken unexpectedly") + } + if w2.woken() { + t.Error("w2 woken unexpectedly") + } + }) + } +} + +func TestWakeOpAllNonEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Add two waiters on address sizeofInt32. + w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w3) + w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w4) + + // Perform 10 wakeups on address 0 (unconditionally), and 10 + // wakeups on address sizeofInt32 (contingent on d.Op(0), which + // should succeed). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 0); err != nil || n != 4 { + t.Errorf("WakeOp: got (%d, %v), wanted (4, nil)", n, err) + } + + // Expect that all waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + if !w3.woken() { + t.Error("w3 not woken") + } + if !w4.woken() { + t.Error("w4 not woken") + } + }) + } +} + +func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Add two waiters on address sizeofInt32. + w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w3) + w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w4) + + // Perform 10 wakeups on address 0 (unconditionally), and 10 + // wakeups on address sizeofInt32 (contingent on d.Op(1), which + // should fail). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 1); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } + + // Expect that only the first two waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + if w3.woken() { + t.Error("w3 woken unexpectedly") + } + if w4.woken() { + t.Error("w4 woken unexpectedly") + } + }) + } +} + +func TestWakeOpSameAddress(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. 
+ var ws [4]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } + + // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup + // on address 0 (contingent on d.Op(0), which should succeed). + const wakeups = 2 + if n, err := m.WakeOp(d, 0, 0, private, 1, 1, 0); err != nil || n != wakeups { + t.Errorf("WakeOp: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } + + // Expect that exactly two waiters were woken. + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) + } +} + +func TestWakeOpSameAddressFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + var ws [4]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } + + // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup + // on address 0 (contingent on d.Op(1), which should fail). + const wakeups = 1 + if n, err := m.WakeOp(d, 0, 0, private, 1, 1, 1); err != nil || n != wakeups { + t.Errorf("WakeOp: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } + + // Expect that exactly one waiter was woken. + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) + } +} + +const ( + testMutexSize = sizeofInt32 + testMutexLocked uint32 = 1 + testMutexUnlocked uint32 = 0 +) + +// testMutex ties together a testData slice, an address, and a +// futex manager in order to implement the sync.Locker interface. +// Beyond being used as a Locker, this is a simple mechanism for +// changing the underlying values for simpler tests. +type testMutex struct { + a usermem.Addr + d testData + m *Manager +} + +func newTestMutex(addr usermem.Addr, d testData, m *Manager) *testMutex { + return &testMutex{a: addr, d: d, m: m} +} + +// Lock acquires the testMutex. +// This may wait for it to be available via the futex manager. +func (t *testMutex) Lock() { + for { + // Attempt to grab the lock. + if atomic.CompareAndSwapUint32( + (*uint32)(unsafe.Pointer(&t.d[t.a])), + testMutexUnlocked, + testMutexLocked) { + // Lock held. + return + } + + // Wait for it to be "not locked". + w := NewWaiter() + err := t.m.WaitPrepare(w, t.d, t.a, true, testMutexLocked, ^uint32(0)) + if err == syscall.EAGAIN { + continue + } + if err != nil { + // Should never happen. + panic("WaitPrepare returned unexpected error: " + err.Error()) + } + <-w.C + t.m.WaitComplete(w) + } +} + +// Unlock releases the testMutex. +// This will notify any waiters via the futex manager. +func (t *testMutex) Unlock() { + // Unlock. + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + + // Notify all waiters. + t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) +} + +// This function was shamelessly stolen from mutex_test.go. 
+func HammerMutex(l sync.Locker, loops int, cdone chan bool) { + for i := 0; i < loops; i++ { + l.Lock() + runtime.Gosched() + l.Unlock() + } + cdone <- true +} + +func TestMutexStress(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + tm := newTestMutex(0*testMutexSize, d, m) + c := make(chan bool) + + for i := 0; i < 10; i++ { + go HammerMutex(tm, 1000, c) + } + + for i := 0; i < 10; i++ { + <-c + } +} diff --git a/pkg/sentry/kernel/g3doc/run_states.dot b/pkg/sentry/kernel/g3doc/run_states.dot new file mode 100644 index 000000000..7861fe1f5 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.dot @@ -0,0 +1,99 @@ +digraph { + subgraph { + App; + } + subgraph { + Interrupt; + InterruptAfterSignalDeliveryStop; + } + subgraph { + Syscall; + SyscallAfterPtraceEventSeccomp; + SyscallEnter; + SyscallAfterSyscallEnterStop; + SyscallAfterSysemuStop; + SyscallInvoke; + SyscallAfterPtraceEventClone; + SyscallAfterExecStop; + SyscallAfterVforkStop; + SyscallReinvoke; + SyscallExit; + } + subgraph { + Vsyscall; + VsyscallAfterPtraceEventSeccomp; + VsyscallInvoke; + } + subgraph { + Exit; + ExitMain; // leave thread group, release resources, reparent children, kill PID namespace and wait if TGID 1 + ExitNotify; // signal parent/tracer, become waitable + ExitDone; // represented by t.runState == nil + } + + // Task exit + Exit -> ExitMain; + ExitMain -> ExitNotify; + ExitNotify -> ExitDone; + + // Execution of untrusted application code + App -> App; + + // Interrupts (usually signal delivery) + App -> Interrupt; + Interrupt -> Interrupt; // if other interrupt conditions may still apply + Interrupt -> Exit; // if killed + + // Syscalls + App -> Syscall; + Syscall -> SyscallEnter; + SyscallEnter -> SyscallInvoke; + SyscallInvoke -> SyscallExit; + SyscallExit -> App; + + // exit, exit_group + SyscallInvoke -> Exit; + + // execve + SyscallInvoke -> SyscallAfterExecStop; + SyscallAfterExecStop -> SyscallExit; + SyscallAfterExecStop -> App; // fatal signal pending + + // vfork + SyscallInvoke -> SyscallAfterVforkStop; + SyscallAfterVforkStop -> SyscallExit; + + // Vsyscalls + App -> Vsyscall; + Vsyscall -> VsyscallInvoke; + Vsyscall -> App; // fault while reading return address from stack + VsyscallInvoke -> App; + + // ptrace-specific branches + Interrupt -> InterruptAfterSignalDeliveryStop; + InterruptAfterSignalDeliveryStop -> Interrupt; + SyscallEnter -> SyscallAfterSyscallEnterStop; + SyscallAfterSyscallEnterStop -> SyscallInvoke; + SyscallAfterSyscallEnterStop -> SyscallExit; // skipped by tracer + SyscallAfterSyscallEnterStop -> App; // fatal signal pending + SyscallEnter -> SyscallAfterSysemuStop; + SyscallAfterSysemuStop -> SyscallExit; + SyscallAfterSysemuStop -> App; // fatal signal pending + SyscallInvoke -> SyscallAfterPtraceEventClone; + SyscallAfterPtraceEventClone -> SyscallExit; + SyscallAfterPtraceEventClone -> SyscallAfterVforkStop; + + // seccomp + Syscall -> App; // SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_KILL, SECCOMP_RET_TRACE without tracer + Syscall -> SyscallAfterPtraceEventSeccomp; // SECCOMP_RET_TRACE + SyscallAfterPtraceEventSeccomp -> SyscallEnter; + SyscallAfterPtraceEventSeccomp -> SyscallExit; // skipped by tracer + SyscallAfterPtraceEventSeccomp -> App; // fatal signal pending + Vsyscall -> VsyscallAfterPtraceEventSeccomp; + VsyscallAfterPtraceEventSeccomp -> VsyscallInvoke; + VsyscallAfterPtraceEventSeccomp -> App; + + // Autosave + SyscallInvoke -> SyscallReinvoke; + SyscallReinvoke -> SyscallInvoke; +} diff --git 
a/pkg/sentry/kernel/g3doc/run_states.png b/pkg/sentry/kernel/g3doc/run_states.png Binary files differnew file mode 100644 index 000000000..b63b60f02 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.png diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go new file mode 100644 index 000000000..80a070d7e --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -0,0 +1,58 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" + "gvisor.dev/gvisor/pkg/sentry/kernel/shm" +) + +// IPCNamespace represents an IPC namespace. +// +// +stateify savable +type IPCNamespace struct { + // User namespace which owns this IPC namespace. Immutable. + userNS *auth.UserNamespace + + semaphores *semaphore.Registry + shms *shm.Registry +} + +// NewIPCNamespace creates a new IPC namespace. +func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { + return &IPCNamespace{ + userNS: userNS, + semaphores: semaphore.NewRegistry(userNS), + shms: shm.NewRegistry(userNS), + } +} + +// SemaphoreRegistry returns the semaphore set registry for this namespace. +func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { + return i.semaphores +} + +// ShmRegistry returns the shm segment registry for this namespace. +func (i *IPCNamespace) ShmRegistry() *shm.Registry { + return i.shms +} + +// IPCNamespace returns the task's IPC namespace. +func (t *Task) IPCNamespace() *IPCNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.ipcns +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..2177b785a --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,1682 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. +// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// ThreadGroup.timerMu +// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// runningTasksMu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. 
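
To make the ordering above concrete, here is a hedged sketch (a hypothetical helper, not part of this change) of code that follows the rule that TaskSet.mu is taken before any SignalHandlers.mu, mirroring what SendContainerSignal does later in this file.

// sketchSignalEachThreadGroup is hypothetical; it only illustrates the lock
// order: TaskSet.mu (read) is acquired first, and each thread group's
// SignalHandlers.mu is nested strictly inside it, never the other way around.
func (k *Kernel) sketchSignalEachThreadGroup(f func(tg *ThreadGroup)) {
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for tg := range k.tasks.Root.tgids {
		tg.signalHandlers.mu.Lock()
		f(tg) // caller's work runs under SignalHandlers.mu
		tg.signalHandlers.mu.Unlock()
	}
}
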
+package kernel + +import ( + "errors" + "fmt" + "path/filepath" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow +// easy access everywhere. To be removed once VFS2 becomes the default. +var VFS2Enabled = false + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +// +// +stateify savable +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the created + // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is + // embedded anonymously (the same issue applies). + platform.Platform `state:"nosave"` + + // mf provides application memory. + mf *pgalloc.MemoryFile `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. 
+ featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + rootNetworkNamespace *inet.Namespace + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + rootAbstractSocketNamespace *AbstractSocketNamespace + + // futexes is the "root" futex.Manager, from which all others are forked. + // This is necessary to ensure that shared futexes are coherent across all + // tasks, including those created by CreateProcess. + futexes *futex.Manager + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // runningTasksMu synchronizes disable/enable of cpuClockTicker when + // the kernel is idle (runningTasks == 0). + // + // runningTasksMu is used to exclude critical sections when the timer + // disables itself and when the first active task enables the timer, + // ensuring that tasks always see a valid cpuClock value. + runningTasksMu sync.Mutex `state:"nosave"` + + // runningTasks is the total count of tasks currently in + // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are + // not blocked or stopped. + // + // runningTasks must be accessed atomically. Increments from 0 to 1 are + // further protected by runningTasksMu (see incRunningTasks). + runningTasks int64 + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // cpuClockTickerDisabled indicates that cpuClockTicker has been + // disabled because no tasks are running. + // + // cpuClockTickerDisabled is protected by runningTasksMu. + cpuClockTickerDisabled bool + + // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the + // point it was disabled. It is cached here to avoid a lock ordering + // violation with cpuClockTicker.mu when runningTaskMu is held. + // + // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is + // true. + // + // cpuClockTickerSetting is protected by runningTasksMu. + cpuClockTickerSetting ktime.Setting + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. 
+ uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accessed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // saveErr is the error causing the sandbox to exit during save, if + // any. It is protected by extMu. + saveErr error `state:"nosave"` + + // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. + danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` + + // sockets is the list of all network sockets the system. Protected by + // extMu. + sockets socketList + + // nextSocketEntry is the next entry number to use in sockets. Protected + // by extMu. + nextSocketEntry uint64 + + // deviceRegistry is used to save/restore device.SimpleDevices. + deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter + + // unimplementedSyscallEmitterOnce is used in the initialization of + // unimplementedSyscallEmitter. + unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` + + // unimplementedSyscallEmitter is used to emit unimplemented syscall + // events. This is initialized lazily on the first unimplemented + // syscall. + unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` + + // SpecialOpts contains special kernel options. + SpecialOpts + + // VFS keeps the filesystem state used across the kernel. + vfs vfs.VirtualFilesystem + + // hostMount is the Mount used for file descriptors that were imported + // from the host. + hostMount *vfs.Mount + + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() + // syscalls (as opposed to named pipes created by mknod()). + pipeMount *vfs.Mount + + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscalls. It is analagous to Linux's shm_mnt. + shmMount *vfs.Mount + + // socketMount is the Mount used for sockets created by the socket() and + // socketpair() syscalls. There are several cases where a socket dentry will + // not be contained in socketMount: + // 1. Socket files created by mknod() + // 2. Socket fds imported from the host (Kernel.hostMount is used for these) + // 3. Socket files created by binding Unix sockets to a file path + socketMount *vfs.Mount + + // If set to true, report address space activation waits as if the task is in + // external wait so that the watchdog doesn't report the task stuck. + SleepForAddressSpaceActivation bool +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // RootNetworkNamespace is the root network namespace. If nil, no networking + // will be available. + RootNetworkNamespace *inet.Namespace + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. 
The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namespace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namespace. + RootIPCNamespace *IPCNamespace + + // RootAbstractSocketNamespace is the root Abstract Socket namespace. + RootAbstractSocketNamespace *AbstractSocketNamespace + + // PIDNamespace is the root PID namespace. + PIDNamespace *PIDNamespace +} + +// Init initialize the Kernel with no tasks. +// +// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile +// before calling Init. +func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet(args.PIDNamespace) + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace + k.rootNetworkNamespace = args.RootNetworkNamespace + if k.rootNetworkNamespace == nil { + k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) + } + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.futexes = futex.NewManager() + k.netlinkPorts = port.New() + + if VFS2Enabled { + if err := k.vfs.Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %v", err) + } + + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create pipefs filesystem: %v", err) + } + defer pipeFilesystem.DecRef() + pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create pipefs mount: %v", err) + } + k.pipeMount = pipeMount + + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err 
!= nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create sockfs filesystem: %v", err) + } + defer socketFilesystem.DecRef() + socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create sockfs mount: %v", err) + } + k.socketMount = socketMount + } + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w wire.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Evict all evictable MemoryFile allocations. + k.mf.StartEvictions() + k.mf.WaitForEvictions() + + // Flush write operations on open files so data reaches backing storage. + // This must come after MemoryFile eviction since eviction may cause file + // writes. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(); err != nil { + return err + } + + // Ensure that all inode and mount release operations have completed. + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the CPUID FeatureSet before the rest of the kernel so we can + // verify its compatibility on restore before attempting to restore the + // entire kernel, which may fail on an incompatible machine. + // + // N.B. This will also be saved along with the full kernel save below. + cpuidStart := time.Now() + if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil { + return err + } + log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) + + // Save the kernel state. + kernelStart := time.Now() + stats, err := state.Save(k.SupervisorContext(), w, k) + if err != nil { + return err + } + log.Infof("Kernel save stats: %s", stats.String()) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory file's state. 
+ memoryStart := time.Now() + if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +// flushMountSourceRefs flushes the MountSources for all mounted filesystems +// and open FDs. +func (k *Kernel) flushMountSourceRefs() error { + // Flush all mount sources for currently mounted filesystems in each task. + flushed := make(map[*fs.MountNamespace]struct{}) + k.tasks.mu.RLock() + k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if _, ok := flushed[tg.mounts]; ok { + // Already flushed. + return + } + tg.mounts.FlushMountSourceRefs() + flushed[tg.mounts] = struct{}{} + }) + k.tasks.mu.RUnlock() + + // There may be some open FDs whose filesystems have been unmounted. We + // must flush those as well. + return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() + return nil + }) +} + +// forEachFDPaused applies the given function to each open file descriptor in +// each task. +// +// Precondition: Must be called with the kernel paused. +func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { + err = lastErr + } + }) + } + return err +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + if flags := file.Flags(); !flags.Write { + return nil + } + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + return nil + } + // Here we need all metadata synced. + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := file.Dirent.FullName(nil /* root */) + // Wrap this error in ErrSaveRejection so that it will trigger a save + // error, rather than a panic. This also allows us to distinguish Fsync + // errors from state file errors in state.Save. + return fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + } + } + return nil + }) +} + +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
+ if VFS2Enabled { + return + } + + ts.mu.RLock() + defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue + } + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error { + loadStart := time.Now() + + initAppCores := k.applicationCores + + // Load the pre-saved CPUID FeatureSet. + // + // N.B. This was also saved along with the full kernel below, so we + // don't need to explicitly install it in the Kernel. + cpuidStart := time.Now() + var features cpuid.FeatureSet + if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil { + return err + } + log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) + + // Verify that the FeatureSet is usable on this host. We do this before + // Kernel load so that the explicit CPUID mismatch error has priority + // over floating point state restore errors that may occur on load on + // an incompatible machine. + if err := features.CheckHostCompatible(); err != nil { + return err + } + + // Load the kernel state. + kernelStart := time.Now() + stats, err := state.Load(k.SupervisorContext(), r, k) + if err != nil { + return err + } + log.Infof("Kernel load stats: %s", stats.String()) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // rootNetworkNamespace should be populated after loading the state file. + // Restore the root network stack. + k.rootNetworkNamespace.RestoreRootStack(net) + + // Load the memory file's state. + memoryStart := time.Now() + if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + k.Timekeeper().SetClocks(clocks) + if net != nil { + net.Resume() + } + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } + + tcpip.AsyncLoading.Wait() + + log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// UniqueID returns a unique identifier. 
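
SaveTo and LoadFrom above are the two halves of checkpoint/restore. A hedged sketch of how an external caller might drive them follows; the writer, reader, network stack, and clocks are placeholders supplied by the caller, and the error handling is representative only.

// checkpointThenRestoreSketch is hypothetical; it only shows the calling
// convention: pause before SaveTo, and restore into a Kernel whose platform
// and MemoryFile have already been set (see SetMemoryFile and Init).
func checkpointThenRestoreSketch(k, restored *Kernel, w wire.Writer, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
	// SaveTo requires the kernel to be paused for the duration of the call.
	k.Pause()
	err := k.SaveTo(w)
	k.Unpause()
	if err != nil {
		return err
	}

	// LoadFrom re-populates the new kernel, restores the root network stack,
	// and resumes timekeeping with the given clocks.
	return restored.LoadFrom(r, net, clocks)
}
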
+func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load as the init binary. + // + // If this is provided as "", File will be checked, then the file will be + // guessed via Argv[0]. + Filename string + + // File is a passed host FD pointing to a file to load as the init binary. + // + // This is checked if and only if Filename is "". + File fsbridge.File + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. + Credentials *auth.Credentials + + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. + MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace + + // PIDNamespace is the initial PID Namespace. + PIDNamespace *PIDNamespace + + // AbstractSocketNamespace is the initial Abstract Socket namespace. + AbstractSocketNamespace *AbstractSocketNamespace + + // MountNamespace optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespace must donate a reference (i.e. + // increment it). + MountNamespace *fs.MountNamespace + + // MountNamespaceVFS2 optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. + // increment it). + MountNamespaceVFS2 *vfs.MountNamespace + + // ContainerID is the container that the process belongs to. + ContainerID string +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.args.PIDNamespace + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.args.MountNamespace != nil { + // MountNamespace.Root() will take a reference on the root dirent for us. + return ctx.args.MountNamespace.Root() + } + return nil + case vfs.CtxRoot: + if ctx.args.MountNamespaceVFS2 == nil { + return nil + } + // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. 
+ return ctx.args.MountNamespaceVFS2.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2 takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. +// +// If k.Start() has already been called, then the created process must be +// started by calling kernel.StartProcess(tg). +// +// If k.Start() has not yet been called, then the created task will begin +// running when k.Start() is called. +// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + ctx := args.NewContext(k) + + var ( + opener fsbridge.Lookup + fsContext *FSContext + mntns *fs.MountNamespace + ) + + if VFS2Enabled { + mntnsVFS2 := args.MountNamespaceVFS2 + if mntnsVFS2 == nil { + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() + } + // Get the root directory from the MountNamespace. + root := args.MountNamespaceVFS2.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + pop := vfs.PathOperation{ + Root: root, + Start: wd, + Path: fspath.Parse(args.WorkingDirectory), + FollowFinalSymlink: true, + } + var err error + wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) + fsContext = NewFSContextVFS2(root, wd, args.Umask) + + } else { + mntns = args.MountNamespace + if mntns == nil { + mntns = k.GlobalInit().Leader().MountNamespace() + mntns.IncRef() + } + // Get the root directory from the MountNamespace. + root := mntns.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + remainingTraversals := args.MaxSymlinkTraversals + wd := root // Default. 
+ if args.WorkingDirectory != "" { + var err error + wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewFSLookup(mntns, root, wd) + fsContext = newFSContext(root, wd, args.Umask) + } + + tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + + // Check which file to start from. + switch { + case args.Filename != "": + // If a filename is given, take that. + // Set File to nil so we resolve the path in LoadTaskImage. + args.File = nil + case args.File != nil: + // If File is set, take the File provided directly. + default: + // Otherwise look at Argv and see if the first argument is a valid path. + if len(args.Argv) == 0 { + return nil, 0, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. + remainingTraversals := args.MaxSymlinkTraversals + loadArgs := loader.LoadArgs{ + Opener: opener, + RemainingTraversals: &remainingTraversals, + ResolveFinal: true, + Filename: args.Filename, + File: args.File, + CloseOnExec: false, + Argv: args.Argv, + Envv: args.Envv, + Features: k.featureSet, + } + + tc, se := k.LoadTaskImage(ctx, loadArgs) + if se != nil { + return nil, 0, errors.New(se.String()) + } + + // Take a reference on the FDTable, which will be transferred to + // TaskSet.NewTask(). + args.FDTable.IncRef() + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + FSContext: fsContext, + FDTable: args.FDTable, + Credentials: args.Credentials, + NetworkNamespace: k.RootNetworkNamespace(), + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AbstractSocketNamespace: args.AbstractSocketNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, + ContainerID: args.ContainerID, + } + t, err := k.tasks.NewTask(config) + if err != nil { + return nil, 0, err + } + t.traceExecEvent(tc) // Simulate exec for tracing. + + // Success. + tgid := k.tasks.Root.IDOfThreadGroup(tg) + if k.globalInit == nil { + k.globalInit = tg + } + return tg, tgid, nil +} + +// StartProcess starts running a process that was created with CreateProcess. +func (k *Kernel) StartProcess(tg *ThreadGroup) { + t := tg.Leader() + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. 
+ k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). + if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDTable.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Pause() + for _, it := range t.tg.timers { + it.PauseTimer() + } + } + // This means we'll iterate FDTables shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { + tfd.PauseTimer() + } + } else { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + }) + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Resume() + for _, it := range t.tg.timers { + it.ResumeTimer() + } + } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { + tfd.ResumeTimer() + } + } else { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + }) + } + } +} + +func (k *Kernel) incRunningTasks() { + for { + tasks := atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // Standard case. Simply increment. + if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { + continue + } + return + } + + // Transition from 0 -> 1. Synchronize with other transitions and timer. + k.runningTasksMu.Lock() + tasks = atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // We're no longer the first task, no need to + // re-enable. + atomic.AddInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + if !k.cpuClockTickerDisabled { + // Timer was never disabled. + atomic.StoreInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + // We need to update cpuClock for all of the ticks missed while we + // slept, and then re-enable the timer. + // + // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify + // always increments cpuClock by 1 regardless of the number of + // expirations as a heuristic to avoid over-accounting in cases of CPU + // throttling. 
+ // + // We want to cover the normal case, when all time should be accounted, + // so we increment for all expirations. Throttling is less concerning + // here because the ticker is only disabled from Notify. This means + // that Notify must schedule and compensate for the throttled period + // before the timer is disabled. Throttling while the timer is disabled + // doesn't matter, as nothing is running or reading cpuClock anyways. + // + // S/R also adds complication, as there are two cases. Recall that + // monotonicClock will jump forward on restore. + // + // 1. If the ticker is enabled during save, then on Restore Notify is + // called with many expirations, covering the time jump, but cpuClock + // is only incremented by 1. + // + // 2. If the ticker is disabled during save, then after Restore the + // first wakeup will call this function and cpuClock will be + // incremented by the number of expirations across the S/R. + // + // These cause very different value of cpuClock. But again, since + // nothing was running while the ticker was disabled, those differences + // don't matter. + setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) + if exp > 0 { + atomic.AddUint64(&k.cpuClock, exp) + } + + // Now that cpuClock is updated it is safe to allow other tasks to + // transition to running. + atomic.StoreInt64(&k.runningTasks, 1) + + // N.B. we must unlock before calling Swap to maintain lock ordering. + // + // cpuClockTickerDisabled need not wait until after Swap to become + // true. It is sufficient that the timer *will* be enabled. + k.cpuClockTickerDisabled = false + k.runningTasksMu.Unlock() + + // This won't call Notify (unless it's been ClockTick since setting.At + // above). This means we skip the thread group work in Notify. However, + // since nothing was running while we were disabled, none of the timers + // could have expired. + k.cpuClockTicker.Swap(setting) + + return + } +} + +func (k *Kernel) decRunningTasks() { + tasks := atomic.AddInt64(&k.runningTasks, -1) + if tasks < 0 { + panic(fmt.Sprintf("Invalid running count %d", tasks)) + } + + // Nothing to do. The next CPU clock tick will disable the timer if + // there is still nothing running. This provides approximately one tick + // of slack in which we can switch back and forth between idle and + // active without an expensive transition. +} + +// WaitExited blocks until all tasks in k have exited. +func (k *Kernel) WaitExited() { + k.tasks.liveGoroutines.Wait() +} + +// Kill requests that all tasks in k immediately exit as if group exiting with +// status es. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(es ExitStatus) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.Kill(es) +} + +// Pause requests that all tasks in k temporarily stop executing, and blocks +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. +func (k *Kernel) Pause() { + k.extMu.Lock() + k.tasks.BeginExternalStop() + k.extMu.Unlock() + k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() +} + +// Unpause ends the effect of a previous call to Pause. If Unpause is called +// without a matching preceding call to Pause, Unpause may panic. +func (k *Kernel) Unpause() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.EndExternalStop() +} + +// SendExternalSignal injects a signal into the kernel. +// +// context is used only for debugging to describe how the signal was received. 
+//
+// Preconditions: Kernel must have an init process.
+func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.sendExternalSignal(info, context)
+}
+
+// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return tg.SendSignal(info)
+}
+
+// SendContainerSignal sends the given signal to all processes inside the
+// namespace that match the given container ID.
+func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	var lastErr error
+	for tg := range k.tasks.Root.tgids {
+		if tg.leader.ContainerID() == cid {
+			tg.signalHandlers.mu.Lock()
+			infoCopy := *info
+			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
+				lastErr = err
+			}
+			tg.signalHandlers.mu.Unlock()
+		}
+	}
+	return lastErr
+}
+
+// RebuildTraceContexts rebuilds the trace context for all tasks.
+//
+// Unfortunately, if these are built while tracing is not enabled, then we will
+// not have meaningful trace data. Rebuilding here ensures that we can do so
+// after tracing has been enabled.
+func (k *Kernel) RebuildTraceContexts() {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	for t, tid := range k.tasks.Root.tids {
+		t.rebuildTraceContext(tid)
+	}
+}
+
+// FeatureSet returns the FeatureSet.
+func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
+	return k.featureSet
+}
+
+// Timekeeper returns the Timekeeper.
+func (k *Kernel) Timekeeper() *Timekeeper {
+	return k.timekeeper
+}
+
+// TaskSet returns the TaskSet.
+func (k *Kernel) TaskSet() *TaskSet {
+	return k.tasks
+}
+
+// RootUserNamespace returns the root UserNamespace.
+func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
+	return k.rootUserNamespace
+}
+
+// RootUTSNamespace returns the root UTSNamespace.
+func (k *Kernel) RootUTSNamespace() *UTSNamespace {
+	return k.rootUTSNamespace
+}
+
+// RootIPCNamespace returns the root IPCNamespace.
+func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+	return k.rootIPCNamespace
+}
+
+// RootPIDNamespace returns the root PIDNamespace.
+func (k *Kernel) RootPIDNamespace() *PIDNamespace {
+	return k.tasks.Root
+}
+
+// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
+func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
+	return k.rootAbstractSocketNamespace
+}
+
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
+}
+
+// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
+// nil if no such thread group exists. GlobalInit may return a thread group
+// containing no tasks if the thread group has already exited.
+func (k *Kernel) GlobalInit() *ThreadGroup {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return k.globalInit
+}
+
+// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
+func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
+	k.globalInit = tg
+}
+
+// ApplicationCores returns the number of CPUs visible to sandboxed
+// applications.
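
As a usage example, a hypothetical control-plane helper that delivers SIGTERM to every process in a given container might call SendContainerSignal roughly as follows; the helper itself and the container ID are placeholders, not part of this change.

// stopContainerSketch is hypothetical; it shows the expected shape of the
// arch.SignalInfo argument and the call into SendContainerSignal.
func stopContainerSketch(k *Kernel, cid string) error {
	info := &arch.SignalInfo{
		Signo: int32(linux.SIGTERM),
		Code:  arch.SignalInfoUser,
	}
	return k.SendContainerSignal(cid, info)
}
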
+func (k *Kernel) ApplicationCores() uint { + return k.applicationCores +} + +// RealtimeClock returns the application CLOCK_REALTIME clock. +func (k *Kernel) RealtimeClock() ktime.Clock { + return k.realtimeClock +} + +// MonotonicClock returns the application CLOCK_MONOTONIC clock. +func (k *Kernel) MonotonicClock() ktime.Clock { + return k.monotonicClock +} + +// CPUClockNow returns the current value of k.cpuClock. +func (k *Kernel) CPUClockNow() uint64 { + return atomic.LoadUint64(&k.cpuClock) +} + +// Syslog returns the syslog. +func (k *Kernel) Syslog() *syslog { + return &k.syslog +} + +// GenerateInotifyCookie generates a unique inotify event cookie. +// +// Returned values may overlap with previously returned values if the value +// space is exhausted. 0 is not a valid cookie value, all other values +// representable in a uint32 are allowed. +func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// SaveError returns the sandbox error that caused the kernel to exit during +// save. +func (k *Kernel) SaveError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.saveErr +} + +// SetSaveError sets the sandbox error that caused the kernel to exit during +// save, if one is not already set. +func (k *Kernel) SetSaveError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.saveErr == nil { + k.saveErr = err + } +} + +var _ tcpip.Clock = (*Kernel)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) NowNanoseconds() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Realtime) + if err != nil { + panic("Kernel.NowNanoseconds: " + err.Error()) + } + return now +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (k *Kernel) NowMonotonic() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Kernel.NowMonotonic: " + err.Error()) + } + return now +} + +// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or +// LoadFrom. +func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { + k.mf = mf +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { + return k.mf +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +// SocketEntry represents a socket recorded in Kernel.sockets. It implements +// refs.WeakRefUser for sockets stored in the socket table. +// +// +stateify savable +type SocketEntry struct { + socketEntry + k *Kernel + Sock *refs.WeakRef + SockVFS2 *vfs.FileDescription + ID uint64 // Socket table entry number. +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (s *SocketEntry) WeakRefGone() { + s.k.extMu.Lock() + s.k.sockets.Remove(s) + s.k.extMu.Unlock() +} + +// RecordSocket adds a socket to the system-wide socket table for tracking. 
+// +// Precondition: Caller must hold a reference to sock. +func (k *Kernel) RecordSocket(sock *fs.File) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{k: k, ID: id} + s.Sock = refs.NewWeakRef(sock, s) + k.sockets.PushBack(s) + k.extMu.Unlock() +} + +// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for +// tracking. +// +// Precondition: Caller must hold a reference to sock. +// +// Note that the socket table will not hold a reference on the +// vfs.FileDescription, because we do not support weak refs on VFS2 files. +func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{ + k: k, + ID: id, + SockVFS2: sock, + } + k.sockets.PushBack(s) + k.extMu.Unlock() +} + +// ListSockets returns a snapshot of all sockets. +// +// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// to get a reference on a socket in the table. +func (k *Kernel) ListSockets() []*SocketEntry { + k.extMu.Lock() + var socks []*SocketEntry + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, s) + } + k.extMu.Unlock() + return socks +} + +// supervisorContext is a privileged context. +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) + return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. + return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + if ctx.k.globalInit != nil { + return ctx.k.globalInit.mounts.Root() + } + return nil + case vfs.CtxRoot: + if ctx.k.globalInit == nil { + return vfs.VirtualDentry{} + } + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + defer mntns.DecRef() + // Root() takes a reference on the root dirent for us. + return mntns.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2() takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// Rate limits for the number of unimplemented syscall events. +const ( + unimplementedSyscallsMaxRate = 100 // events per second + unimplementedSyscallBurst = 1000 // events +) + +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. 
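
A short hedged sketch of a consumer of ListSockets (defined above), following the reference rules in its comment; the weak-reference handling for VFS1 entries is an assumption based on the refs.WeakRef API rather than something exercised in this change.

// countLiveSocketsSketch is hypothetical; it walks the socket table and
// counts entries whose sockets are still live.
func countLiveSocketsSketch(k *Kernel) int {
	live := 0
	for _, se := range k.ListSockets() {
		if se.SockVFS2 != nil {
			// VFS2 entries hold no weak reference; take a real reference
			// only if the file has not already been destroyed.
			if se.SockVFS2.TryIncRef() {
				live++
				se.SockVFS2.DecRef()
			}
			continue
		}
		// VFS1 entries hold a weak reference; Get returns nil once the
		// socket is gone, and a successful Get must be paired with DecRef.
		if sock := se.Sock.Get(); sock != nil {
			live++
			sock.DecRef()
		}
	}
	return live
}
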
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + k.unimplementedSyscallEmitterOnce.Do(func() { + k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) + }) + + t := TaskFromContext(ctx) + k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} + +// VFS returns the virtual filesystem for the kernel. +func (k *Kernel) VFS() *vfs.VirtualFilesystem { + return &k.vfs +} + +// SetHostMount sets the hostfs mount. +func (k *Kernel) SetHostMount(mnt *vfs.Mount) { + if k.hostMount != nil { + panic("Kernel.hostMount cannot be set more than once") + } + k.hostMount = mnt +} + +// HostMount returns the hostfs mount. +func (k *Kernel) HostMount() *vfs.Mount { + return k.hostMount +} + +// PipeMount returns the pipefs mount. +func (k *Kernel) PipeMount() *vfs.Mount { + return k.pipeMount +} + +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + +// SocketMount returns the sockfs mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go new file mode 100644 index 000000000..2e66ec587 --- /dev/null +++ b/pkg/sentry/kernel/kernel_opts.go @@ -0,0 +1,20 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// SpecialOpts contains non-standard options for the kernel. +// +// +stateify savable +type SpecialOpts struct{} diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go new file mode 100644 index 000000000..909219086 --- /dev/null +++ b/pkg/sentry/kernel/kernel_state.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/device" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// saveDanglingEndpoints is invoked by stateify. +func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { + return tcpip.GetDanglingEndpoints() +} + +// loadDanglingEndpoints is invoked by stateify. +func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { + for _, e := range es { + tcpip.AddDanglingEndpoint(e) + } +} + +// saveDeviceRegistry is invoked by stateify. 
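
The saveDanglingEndpoints/loadDanglingEndpoints hooks above, and the saveDeviceRegistry/loadDeviceRegistry pair that follows, use the stateify convention in which a field tagged state:".(T)" is converted to and from T by matching saveX/loadX methods. A hedged sketch of that pattern on an invented type:

// cacheOwner is an invented example of the stateify save/load hook pattern;
// its map is saved as a plain slice of keys and rebuilt on load.
//
// +stateify savable
type cacheOwner struct {
	cache map[string]struct{} `state:".([]string)"`
}

// saveCache is invoked by stateify; it converts the field to its saved form.
func (c *cacheOwner) saveCache() []string {
	keys := make([]string, 0, len(c.cache))
	for k := range c.cache {
		keys = append(keys, k)
	}
	return keys
}

// loadCache is invoked by stateify; it rebuilds the field from the saved form.
func (c *cacheOwner) loadCache(keys []string) {
	c.cache = make(map[string]struct{}, len(keys))
	for _, k := range keys {
		c.cache[k] = struct{}{}
	}
}
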
+func (k *Kernel) saveDeviceRegistry() *device.Registry { + return device.SimpleDevices +} + +// loadDeviceRegistry is invoked by stateify. +func (k *Kernel) loadDeviceRegistry(r *device.Registry) { + device.SimpleDevices.LoadFrom(r) +} diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD new file mode 100644 index 000000000..4486848d2 --- /dev/null +++ b/pkg/sentry/kernel/memevent/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library", "proto_library") + +package(licenses = ["notice"]) + +go_library( + name = "memevent", + srcs = ["memory_events.go"], + visibility = ["//:sandbox"], + deps = [ + ":memory_events_go_proto", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/metric", + "//pkg/sentry/kernel", + "//pkg/sentry/usage", + "//pkg/sync", + ], +) + +proto_library( + name = "memory_events", + srcs = ["memory_events.proto"], + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go new file mode 100644 index 000000000..200565bb8 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memevent implements the memory usage events controller, which +// periodically emits events via the eventchannel. +package memevent + +import ( + "time" + + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sentry/kernel" + pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" +) + +var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have elapsed since startup.") + var totalEvents = metric.MustCreateNewUint64Metric("/memory_events/events", false /*sync*/, "Total number of memory events emitted.") + +// MemoryEvents describes the configuration for the global memory event emitter. +type MemoryEvents struct { + k *kernel.Kernel + + // The period is how often to emit an event. The memory events goroutine + // will ensure a minimum of one event is emitted per this period, regardless + // of how much memory usage has changed. + period time.Duration + + // Closing this channel indicates the memory goroutine should stop. + stop chan struct{} + + // done is used to signal when the memory event goroutine has exited. + done sync.WaitGroup +} + +// New creates a new MemoryEvents. +func New(k *kernel.Kernel, period time.Duration) *MemoryEvents { + return &MemoryEvents{ + k: k, + period: period, + stop: make(chan struct{}), + } +} + +// Stop stops the memory usage events emitter goroutine. Stop must not be called +// concurrently with Start and may only be called once. +func (m *MemoryEvents) Stop() { + close(m.stop) + m.done.Wait() +} + +// Start starts the memory usage events emitter goroutine.
Start must not be +// called concurrently with Stop and may only be called once. +func (m *MemoryEvents) Start() { + if m.period == 0 { + return + } + m.done.Add(1) + go m.run() // S/R-SAFE: doesn't interact with saved state. +} + +func (m *MemoryEvents) run() { + defer m.done.Done() + + // Emit the first event immediately on startup. + totalTicks.Increment() + m.emit() + + ticker := time.NewTicker(m.period) + defer ticker.Stop() + + for { + select { + case <-m.stop: + return + case <-ticker.C: + totalTicks.Increment() + m.emit() + } + } +} + +func (m *MemoryEvents) emit() { + totalPlatform, err := m.k.MemoryFile().TotalUsage() + if err != nil { + log.Warningf("Failed to fetch memory usage for memory events: %v", err) + return + } + snapshot, _ := usage.MemoryAccounting.Copy() + total := totalPlatform + snapshot.Mapped + + totalEvents.Increment() + eventchannel.Emit(&pb.MemoryUsageEvent{ + Mapped: snapshot.Mapped, + Total: total, + }) +} diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto new file mode 100644 index 000000000..bf8029ff5 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -0,0 +1,29 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// MemoryUsageEvent describes the memory usage of the sandbox at a single +// instant in time. These messages are emitted periodically on the eventchannel. +message MemoryUsageEvent { + // The total memory usage of the sandboxed application in bytes, calculated + // using the 'fast' method. + uint64 total = 1; + + // Memory used to back memory-mapped regions for files in the application, in + // bytes. This corresponds to the usage.MemoryKind.Mapped memory type. + uint64 mapped = 2; +} diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go new file mode 100644 index 000000000..77a35b788 --- /dev/null +++ b/pkg/sentry/kernel/pending_signals.go @@ -0,0 +1,142 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +const ( + // stdSignalCap is the maximum number of instances of a given standard + // signal that may be pending. 
("[If] multiple instances of a standard + // signal are delivered while that signal is currently blocked, then only + // one instance is queued.") - signal(7) + stdSignalCap = 1 + + // rtSignalCap is the maximum number of instances of a given realtime + // signal that may be pending. + // + // TODO(igudger): In Linux, the minimum signal queue size is + // RLIMIT_SIGPENDING, which is by default max_threads/2. + rtSignalCap = 32 +) + +// pendingSignals holds a collection of pending signals. The zero value of +// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; +// users must provide synchronization. +// +// +stateify savable +type pendingSignals struct { + // signals contains all pending signals. + // + // Note that signals is zero-indexed, but signal 1 is the first valid + // signal, so signals[0] contains signals with signo 1 etc. This offset is + // usually handled by using Signal.index(). + signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"` + + // Bit i of pendingSet is set iff there is at least one signal with signo + // i+1 pending. + pendingSet linux.SignalSet `state:"manual"` +} + +// pendingSignalQueue holds a pendingSignalList for a single signal number. +// +// +stateify savable +type pendingSignalQueue struct { + pendingSignalList + length int +} + +// +stateify savable +type pendingSignal struct { + // pendingSignalEntry links into a pendingSignalList. + pendingSignalEntry + *arch.SignalInfo + + // If timer is not nil, it is the IntervalTimer which sent this signal. + timer *IntervalTimer +} + +// enqueue enqueues the given signal. enqueue returns true on success and false +// on failure (if the given signal's queue is full). +// +// Preconditions: info represents a valid signal. +func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bool { + sig := linux.Signal(info.Signo) + q := &p.signals[sig.Index()] + if sig.IsStandard() { + if q.length >= stdSignalCap { + return false + } + } else if q.length >= rtSignalCap { + return false + } + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer}) + q.length++ + p.pendingSet |= linux.SignalSetOf(sig) + return true +} + +// dequeue dequeues and returns any pending signal not masked by mask. If no +// unmasked signals are pending, dequeue returns nil. +func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { + // "Real-time signals are delivered in a guaranteed order. Multiple + // real-time signals of the same type are delivered in the order they were + // sent. If different real-time signals are sent to a process, they are + // delivered starting with the lowest-numbered signal. (I.e., low-numbered + // signals have highest priority.) By contrast, if multiple standard + // signals are pending for a process, the order in which they are delivered + // is unspecified. If both standard and real-time signals are pending for a + // process, POSIX leaves it unspecified which is delivered first. Linux, + // like many other implementations, gives priority to standard signals in + // this case." 
- signal(7) + lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) + if lowestPendingUnblockedBit >= linux.SignalMaximum { + return nil + } + return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) +} + +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { + q := &p.signals[sig.Index()] + ps := q.pendingSignalList.Front() + if ps == nil { + return nil + } + q.pendingSignalList.Remove(ps) + q.length-- + if q.length == 0 { + p.pendingSet &^= linux.SignalSetOf(sig) + } + if ps.timer != nil { + ps.timer.updateDequeuedSignalLocked(ps.SignalInfo) + } + return ps.SignalInfo +} + +// discardSpecific causes all pending signals with number sig to be discarded. +func (p *pendingSignals) discardSpecific(sig linux.Signal) { + q := &p.signals[sig.Index()] + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + if ps.timer != nil { + ps.timer.signalRejectedLocked() + } + } + q.pendingSignalList.Reset() + q.length = 0 + p.pendingSet &^= linux.SignalSetOf(sig) +} diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go new file mode 100644 index 000000000..ca8b4e164 --- /dev/null +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -0,0 +1,46 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// +stateify savable +type savedPendingSignal struct { + si *arch.SignalInfo + timer *IntervalTimer +} + +// saveSignals is invoked by stateify. +func (p *pendingSignals) saveSignals() []savedPendingSignal { + var pending []savedPendingSignal + for _, q := range p.signals { + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + pending = append(pending, savedPendingSignal{ + si: ps.SignalInfo, + timer: ps.timer, + }) + } + } + return pending +} + +// loadSignals is invoked by stateify. 
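Stepping back to dequeue above: pendingSet keeps bit i set when signal i+1 has something queued, so masking off the blocked signals and counting trailing zeros yields the lowest-numbered deliverable signal, which is exactly the standard-before-realtime priority that signal(7) describes. A small worked example using the standard math/bits package (the in-tree pkg/bits helper is assumed to behave the same way for this case):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Signal numbers are 1-based; bit i of a set represents signal i+1.
	sigBit := func(signo uint) uint64 { return 1 << (signo - 1) }

	pending := sigBit(4) | sigBit(10) // SIGILL (4) and SIGUSR1 (10) queued.
	mask := sigBit(4)                 // SIGILL is currently blocked.

	lowest := bits.TrailingZeros64(pending &^ mask)
	if lowest >= 64 { // nothing pending and unblocked
		fmt.Println("no deliverable signal")
		return
	}
	fmt.Println("deliver signal", lowest+1) // "deliver signal 10"
}

(loadSignals below simply replays each saved signal through the same enqueue path, so the pendingSet bits are rebuilt for free.)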
+func (p *pendingSignals) loadSignals(pending []savedPendingSignal) { + for _, sps := range pending { + p.enqueue(sps.si, sps.timer) + } +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD new file mode 100644 index 000000000..449643118 --- /dev/null +++ b/pkg/sentry/kernel/pipe/BUILD @@ -0,0 +1,54 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "pipe", + srcs = [ + "device.go", + "node.go", + "pipe.go", + "pipe_unsafe.go", + "pipe_util.go", + "reader.go", + "reader_writer.go", + "vfs.go", + "writer.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/buffer", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "pipe_test", + size = "small", + srcs = [ + "node_test.go", + "pipe_test.go", + ], + library = ":pipe", + deps = [ + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go new file mode 100644 index 000000000..89f5d9342 --- /dev/null +++ b/pkg/sentry/kernel/pipe/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import "gvisor.dev/gvisor/pkg/sentry/device" + +// pipeDevice is used for all pipe files. +var pipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go new file mode 100644 index 000000000..4b688c627 --- /dev/null +++ b/pkg/sentry/kernel/pipe/node.go @@ -0,0 +1,139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// inodeOperations implements fs.InodeOperations for pipes. 
+// +// +stateify savable +type inodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + + // Marking pipe inodes as virtual allows them to be saved and restored + // even if they have been unlinked. We can get away with this because + // their state exists entirely within the sentry. + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // p is the underlying Pipe object representing this fifo. + p *Pipe + + // Channels for synchronizing the creation of new readers and writers of + // this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on save, + // and either automatically restart (via ERESTARTSYS) or return EINTR on + // resume. On restarts via ERESTARTSYS, the appropriate channel will be + // recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// NewInodeOperations returns a new fs.InodeOperations for a given pipe. +func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) *inodeOperations { + return &inodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC), + p: p, + } +} + +// GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking +// semantics during open: +// +// "Normally, opening the FIFO blocks until the other end is opened also. A +// process can open a FIFO in nonblocking mode. In this case, opening for +// read-only will succeed even if no-one has opened on the write side yet, +// opening for write-only will fail with ENXIO (no such device or address) +// unless the other end has already been opened. Under Linux, opening a FIFO +// for read and write will succeed both in blocking and nonblocking mode. POSIX +// leaves this behavior undefined. This can be used to open a FIFO for writing +// while there are no readers available." - fifo(7) +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + i.mu.Lock() + defer i.mu.Unlock() + + switch { + case flags.Read && !flags.Write: // O_RDONLY. + r := i.p.Open(ctx, d, flags) + newHandleLocked(&i.rWakeup) + + if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { + if !waitFor(&i.mu, &i.wWakeup, ctx) { + r.DecRef() + return nil, syserror.ErrInterrupted + } + } + + // By now, either we're doing a nonblocking open or we have a writer. On + // a nonblocking read-only open, the open succeeds even if no-one has + // opened the write side yet. + return r, nil + + case flags.Write && !flags.Read: // O_WRONLY. + w := i.p.Open(ctx, d, flags) + newHandleLocked(&i.wWakeup) + + if i.p.isNamed && !i.p.HasReaders() { + // On a nonblocking, write-only open, the open fails with ENXIO if the + // read side isn't open yet. + if flags.NonBlocking { + w.DecRef() + return nil, syserror.ENXIO + } + + if !waitFor(&i.mu, &i.rWakeup, ctx) { + w.DecRef() + return nil, syserror.ErrInterrupted + } + } + return w, nil + + case flags.Read && flags.Write: // O_RDWR. 
+ // Pipes opened for read-write always succeeds without blocking. + rw := i.p.Open(ctx, d, flags) + newHandleLocked(&i.rWakeup) + newHandleLocked(&i.wWakeup) + return rw, nil + + default: + return nil, syserror.EINVAL + } +} + +func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EPIPE +} diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go new file mode 100644 index 000000000..ab75a87ff --- /dev/null +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -0,0 +1,320 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type sleeper struct { + context.Context + ch chan struct{} +} + +func newSleeperContext(t *testing.T) context.Context { + return &sleeper{ + Context: contexttest.Context(t), + ch: make(chan struct{}), + } +} + +func (s *sleeper) SleepStart() <-chan struct{} { + return s.ch +} + +func (s *sleeper) SleepFinish(bool) { +} + +func (s *sleeper) Cancel() { + s.ch <- struct{}{} +} + +func (s *sleeper) Interrupted() bool { + return len(s.ch) != 0 +} + +type openResult struct { + *fs.File + error +} + +var perms fs.FilePermissions = fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, +} + +func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + d := fs.NewDirent(ctx, inode, "pipe") + file, err := n.GetFile(ctx, d, flags) + if err != nil { + t.Fatalf("open with flags %+v failed: %v", flags, err) + } + if doneChan != nil { + doneChan <- struct{}{} + } + return file, err +} + +func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) { + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + d := fs.NewDirent(ctx, inode, "pipe") + file, err := n.GetFile(ctx, d, flags) + if resChan != nil { + resChan <- openResult{file, err} + } + return file, err +} + +func newNamedPipe(t *testing.T) *Pipe { + return NewPipe(true, DefaultPipeSize, usermem.PageSize) +} + +func newAnonPipe(t *testing.T) *Pipe { + return NewPipe(false, DefaultPipeSize, usermem.PageSize) +} + +// assertRecvBlocks ensures that a recv attempt on c blocks for at least +// blockDuration. This is useful for checking that a goroutine that is supposed +// to be executing a blocking operation is actually blocking. +func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Duration, failMsg string) { + select { + case <-c: + t.Fatalf(failMsg) + case <-time.After(blockDuration): + // Ok, blocked for the required duration. 
+ } +} + +func TestReadOpenBlocksForWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Verify that the open for read is blocking. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "open for read not blocking with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone +} + +func TestWriteOpenBlocksForReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Verify that the open for write is blocking + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write not blocking with no readers") + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone1 := make(chan struct{}) + rDone2 := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone1) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone2) + + assertRecvBlocks(t, rDone1, time.Millisecond*100, + "open for read didn't block with no writers") + assertRecvBlocks(t, rDone2, time.Millisecond*100, + "open for read didn't block with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone2 + <-rDone1 +} + +func TestClosedReaderBlocksWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) + rFile.DecRef() + + wDone := make(chan struct{}) + // This open for write should block because the reader is now gone. + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write didn't block with no concurrent readers") + + // Open for read again. This should unblock the open for write. + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestReadWriteOpenNeverBlocks(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rwDone := make(chan struct{}) + // Open for read-write never wait for a reader or writer, even if the + // nonblocking flag is not set. 
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true, NonBlocking: false}, rwDone) + <-rwDone +} + +func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-rDone +} + +func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-wDone +} + +func TestBlockedOpenIsCancellable(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + done := make(chan openResult) + go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) + select { + case <-done: + t.Fatalf("open for read didn't block with no writers") + case <-time.After(time.Millisecond * 100): + // Ok. + } + + ctx.(*sleeper).Cancel() + // If the cancel on the sleeper didn't work, the open for read would never + // return. + res := <-done + if res.error != syserror.ErrInterrupted { + t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", + res.error) + } +} + +func TestNonblockingReadOpenFileNoWriters(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } +} + +func TestNonblockingWriteOpenFileNoReaders(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { + t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) + } +} + +func TestNonBlockingReadOpenWithWriter(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Open for write blocks since there are no readers yet. + assertRecvBlocks(t, wDone, time.Millisecond*100, + "Open for write didn't block with no reader.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } + + // Open for write should now be unblocked. + <-wDone +} + +func TestNonBlockingWriteOpenWithReader(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Open for write blocked, since no reader yet. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "Open for reader didn't block with no writer.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for write failed with error %v.", err) + } + + // Open for write should now be unblocked. 
+ <-rDone +} + +func TestAnonReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { + t.Fatalf("open anon pipe for read failed: %v", err) + } +} + +func TestAnonWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil { + t.Fatalf("open anon pipe for write failed: %v", err) + } +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go new file mode 100644 index 000000000..79645d7d2 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -0,0 +1,419 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides a pipe implementation. +package pipe + +import ( + "fmt" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/buffer" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + // MinimumPipeSize is a hard limit of the minimum size of a pipe. + MinimumPipeSize = 64 << 10 + + // DefaultPipeSize is the system-wide default size of a pipe in bytes. + DefaultPipeSize = MinimumPipeSize + + // MaximumPipeSize is a hard limit on the maximum size of a pipe. + MaximumPipeSize = 8 << 20 +) + +// Pipe is an encapsulation of a platform-independent pipe. +// It manages a buffered byte queue shared between a reader/writer +// pair. +// +// +stateify savable +type Pipe struct { + waiter.Queue `state:"nosave"` + + // isNamed indicates whether this is a named pipe. + // + // This value is immutable. + isNamed bool + + // atomicIOBytes is the maximum number of bytes that the pipe will + // guarantee atomic reads or writes atomically. + // + // This value is immutable. + atomicIOBytes int64 + + // The number of active readers for this pipe. + // + // Access atomically. + readers int32 + + // The number of active writes for this pipe. + // + // Access atomically. + writers int32 + + // mu protects all pipe internal state below. + mu sync.Mutex `state:"nosave"` + + // view is the underlying set of buffers. + // + // This is protected by mu. + view buffer.View + + // max is the maximum size of the pipe in bytes. When this max has been + // reached, writers will get EWOULDBLOCK. + // + // This is protected by mu. + max int64 + + // hadWriter indicates if this pipe ever had a writer. Note that this + // does not necessarily indicate there is *currently* a writer, just + // that there has been a writer at some point since the pipe was + // created. + // + // This is protected by mu. + hadWriter bool +} + +// NewPipe initializes and returns a pipe. +// +// N.B. The size and atomicIOBytes will be bounded. 
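Concretely, the bounding mentioned in the N.B. above and applied by the constructor below clamps sizeBytes into [MinimumPipeSize, MaximumPipeSize] (64 KiB to 8 MiB) and clamps atomicIOBytes into [1, sizeBytes]. A hypothetical in-package test spelling out those rules:

package pipe

import "testing"

func TestNewPipeClampsSizes(t *testing.T) {
	// A request below the minimum is rounded up, and atomicIOBytes can never
	// exceed the (bounded) pipe size.
	p := NewPipe(false /* isNamed */, 4096, 1<<20)
	if p.max != MinimumPipeSize || p.atomicIOBytes != MinimumPipeSize {
		t.Errorf("got (max=%d, atomicIOBytes=%d), want both %d", p.max, p.atomicIOBytes, MinimumPipeSize)
	}

	// A request above the maximum is rounded down, and a non-positive
	// atomicIOBytes degenerates to 1.
	p = NewPipe(false /* isNamed */, 64<<20, 0)
	if p.max != MaximumPipeSize || p.atomicIOBytes != 1 {
		t.Errorf("got (max=%d, atomicIOBytes=%d), want (%d, 1)", p.max, p.atomicIOBytes, MaximumPipeSize)
	}
}

SetFifoSize further down enforces the same bounds at runtime, except that an oversize request fails with EPERM instead of being clamped.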
+func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 + } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + var p Pipe + initPipe(&p, isNamed, sizeBytes, atomicIOBytes) + return &p +} + +func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 + } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + pipe.isNamed = isNamed + pipe.max = sizeBytes + pipe.atomicIOBytes = atomicIOBytes +} + +// NewConnectedPipe initializes a pipe and returns a pair of objects +// representing the read and write ends of the pipe. +func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { + p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes) + + // Build an fs.Dirent for the pipe which will be shared by both + // returned files. + perms := fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + } + iops := NewInodeOperations(ctx, perms, p) + ino := pipeDevice.NextIno() + sattr := fs.StableAttr{ + Type: fs.Pipe, + DeviceID: pipeDevice.DeviceID(), + InodeID: ino, + BlockSize: int64(atomicIOBytes), + } + ms := fs.NewPseudoMountSource(ctx) + d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) + // The p.Open calls below will each take a reference on the Dirent. We + // must drop the one we already have. + defer d.DecRef() + return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) +} + +// Open opens the pipe and returns a new file. +// +// Precondition: at least one of flags.Read or flags.Write must be set. +func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.File { + flags.NonSeekable = true + switch { + case flags.Read && flags.Write: + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, d, flags, &ReaderWriter{ + Pipe: p, + }) + case flags.Read: + p.rOpen() + return fs.NewFile(ctx, d, flags, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + case flags.Write: + p.wOpen() + return fs.NewFile(ctx, d, flags, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + default: + // Precondition violated. + panic("invalid pipe flags") + } +} + +type readOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit limits subsequence reads. + limit func(int64) + + // read performs the actual read operation. + read func(*buffer.View) (int64, error) +} + +// read reads data from the pipe into dst and returns the number of bytes +// read, or returns ErrWouldBlock if the pipe is empty. +// +// Precondition: this pipe must have readers. +func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { + // Don't block for a zero-length read even if the pipe is empty. + if ops.left() == 0 { + return 0, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + return p.readLocked(ctx, ops) +} + +func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { + // Is the pipe empty? + if p.view.Size() == 0 { + if !p.HasWriters() { + // There are no writers, return EOF. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + + // Limit how much we consume. 
+ if ops.left() > p.view.Size() { + ops.limit(p.view.Size()) + } + + // Copy user data; the read op is responsible for trimming. + done, err := ops.read(&p.view) + return done, err +} + +type writeOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit should limit subsequent writes. + limit func(int64) + + // write should write to the provided buffer. + write func(*buffer.View) (int64, error) +} + +// write writes data from sv into the pipe and returns the number of bytes +// written. If no bytes are written because the pipe is full (or has less than +// atomicIOBytes free capacity), write returns ErrWouldBlock. +// +// Precondition: this pipe must have writers. +func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + return p.writeLocked(ctx, ops) +} + +func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { + // Can't write to a pipe with no readers. + if !p.HasReaders() { + return 0, syscall.EPIPE + } + + // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be + // atomic, but requires no atomicity for writes larger than this. + wanted := ops.left() + avail := p.max - p.view.Size() + if wanted > avail { + if wanted <= p.atomicIOBytes { + return 0, syserror.ErrWouldBlock + } + ops.limit(avail) + } + + // Copy user data. + done, err := ops.write(&p.view) + if err != nil { + return done, err + } + + if done < avail { + // Non-failure, but short write. + return done, nil + } + if done < wanted { + // Partial write due to full pipe. Note that this could also be + // the short write case above, we would expect a second call + // and the write to return zero bytes in this case. + return done, syserror.ErrWouldBlock + } + + return done, nil +} + +// rOpen signals a new reader of the pipe. +func (p *Pipe) rOpen() { + atomic.AddInt32(&p.readers, 1) +} + +// wOpen signals a new writer of the pipe. +func (p *Pipe) wOpen() { + p.mu.Lock() + defer p.mu.Unlock() + p.hadWriter = true + atomic.AddInt32(&p.writers, 1) +} + +// rClose signals that a reader has closed their end of the pipe. +func (p *Pipe) rClose() { + newReaders := atomic.AddInt32(&p.readers, -1) + if newReaders < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) + } +} + +// wClose signals that a writer has closed their end of the pipe. +func (p *Pipe) wClose() { + newWriters := atomic.AddInt32(&p.writers, -1) + if newWriters < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) + } +} + +// HasReaders returns whether the pipe has any active readers. +func (p *Pipe) HasReaders() bool { + return atomic.LoadInt32(&p.readers) > 0 +} + +// HasWriters returns whether the pipe has any active writers. +func (p *Pipe) HasWriters() bool { + return atomic.LoadInt32(&p.writers) > 0 +} + +// rReadinessLocked calculates the read readiness. +// +// Precondition: mu must be held. +func (p *Pipe) rReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasReaders() && p.view.Size() != 0 { + ready |= waiter.EventIn + } + if !p.HasWriters() && p.hadWriter { + // POLLHUP must be suppressed until the pipe has had at least one writer + // at some point. Otherwise a reader thread may poll and immediately get + // a POLLHUP before the writer ever opens the pipe, which the reader may + // interpret as the writer opening then closing the pipe. 
+ ready |= waiter.EventHUp + } + return ready +} + +// rReadiness returns a mask that states whether the read end of the pipe is +// ready for reading. +func (p *Pipe) rReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() +} + +// wReadinessLocked calculates the write readiness. +// +// Precondition: mu must be held. +func (p *Pipe) wReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasWriters() && p.view.Size() < p.max { + ready |= waiter.EventOut + } + if !p.HasReaders() { + ready |= waiter.EventErr + } + return ready +} + +// wReadiness returns a mask that states whether the write end of the pipe +// is ready for writing. +func (p *Pipe) wReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.wReadinessLocked() +} + +// rwReadiness returns a mask that states whether a read-write handle to the +// pipe is ready for IO. +func (p *Pipe) rwReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() | p.wReadinessLocked() +} + +// queued returns the amount of queued data. +func (p *Pipe) queued() int64 { + p.mu.Lock() + defer p.mu.Unlock() + return p.view.Size() +} + +// FifoSize implements fs.FifoSizer.FifoSize. +func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + return p.max, nil +} + +// SetFifoSize implements fs.FifoSizer.SetFifoSize. +func (p *Pipe) SetFifoSize(size int64) (int64, error) { + if size < 0 { + return 0, syserror.EINVAL + } + if size < MinimumPipeSize { + size = MinimumPipeSize // Per spec. + } + if size > MaximumPipeSize { + return 0, syserror.EPERM + } + p.mu.Lock() + defer p.mu.Unlock() + if size < p.view.Size() { + return 0, syserror.EBUSY + } + p.max = size + return size, nil +} diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go new file mode 100644 index 000000000..bda739dbe --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -0,0 +1,139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pipe + +import ( + "bytes" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestPipeRW(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + wantN := int64(len(msg)) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if n != wantN || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (%d, nil)", n, err, wantN) + } + + buf := make([]byte, len(msg)) + n, err = r.Readv(ctx, usermem.BytesIOSequence(buf)) + if n != wantN || err != nil || !bytes.Equal(buf, msg) { + t.Fatalf("Readv: got (%d, %v) %q, wanted (%d, nil) %q", n, err, buf, wantN, msg) + } +} + +func TestPipeReadBlock(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) + if n != 0 || err != syserror.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + } +} + +func TestPipeWriteBlock(t *testing.T) { + const atomicIOBytes = 2 + const capacity = MinimumPipeSize + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := make([]byte, capacity+1) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if wantN, wantErr := int64(capacity), syserror.ErrWouldBlock; n != wantN || err != wantErr { + t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) + } +} + +func TestPipeWriteUntilEnd(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + + wDone := make(chan struct{}, 0) + rDone := make(chan struct{}, 0) + defer func() { + // Signal the reader to stop and wait until it does so. + close(wDone) + <-rDone + }() + + go func() { + defer close(rDone) + // Read from r until done is closed. + ctx := contexttest.Context(t) + buf := make([]byte, len(msg)+1) + dst := usermem.BytesIOSequence(buf) + e, ch := waiter.NewChannelEntry(nil) + r.EventRegister(&e, waiter.EventIn) + defer r.EventUnregister(&e) + for { + n, err := r.Readv(ctx, dst) + dst = dst.DropFirst64(n) + if err == syserror.ErrWouldBlock { + select { + case <-ch: + continue + case <-wDone: + // We expect to have 1 byte left in dst since len(buf) == + // len(msg)+1. + if dst.NumBytes() != 1 || !bytes.Equal(buf[:len(msg)], msg) { + t.Errorf("Reader: got %q (%d bytes remaining), wanted %q", buf, dst.NumBytes(), msg) + } + return + } + } + if err != nil { + t.Fatalf("Readv: got unexpected error %v", err) + } + } + }() + + src := usermem.BytesIOSequence(msg) + e, ch := waiter.NewChannelEntry(nil) + w.EventRegister(&e, waiter.EventOut) + defer w.EventUnregister(&e) + for src.NumBytes() != 0 { + n, err := w.Writev(ctx, src) + src = src.DropFirst64(n) + if err == syserror.ErrWouldBlock { + <-ch + continue + } + if err != nil { + t.Fatalf("Writev: got (%d, %v)", n, err) + } + } +} diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go new file mode 100644 index 000000000..dd60cba24 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "unsafe" +) + +// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be +// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that +// concurrent calls cannot deadlock. +// +// Preconditions: x != y. +func lockTwoPipes(x, y *Pipe) { + // Lock the two pipes in order of increasing address. + if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { + x.mu.Lock() + y.mu.Lock() + } else { + y.mu.Lock() + x.mu.Lock() + } +} diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go new file mode 100644 index 000000000..aacf28da2 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -0,0 +1,214 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "io" + "math" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/buffer" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains Pipe file functionality that is tied to neither VFS nor +// the old fs architecture. + +// Release cleans up the pipe's state. +func (p *Pipe) Release() { + p.rClose() + p.wClose() + + // Wake up readers and writers. + p.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read reads from the Pipe into dst. +func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + n, err := p.read(ctx, readOps{ + left: func() int64 { + return dst.NumBytes() + }, + limit: func(l int64) { + dst = dst.TakeFirst64(l) + }, + read: func(view *buffer.View) (int64, error) { + n, err := dst.CopyOutFrom(ctx, view) + dst = dst.DropFirst64(n) + view.TrimFront(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// WriteTo writes to w from the Pipe. +func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { + ops := readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToWriter(w, count) + if !dup { + view.TrimFront(n) + } + count -= n + return n, err + }, + } + n, err := p.read(ctx, ops) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// Write writes to the Pipe from src. 
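The address-ordered locking in lockTwoPipes above is the usual way to take two peer locks when there is no natural hierarchy: as long as every caller acquires them in the same global order (here, increasing pointer address), calls with the arguments swapped cannot deadlock against each other. A generic illustration of the same discipline (account/transfer are made-up names; nothing pipe-specific is assumed):

package main

import (
	"fmt"
	"sync"
	"unsafe"
)

type account struct {
	mu      sync.Mutex
	balance int64
}

// lockTwo mirrors lockTwoPipes: lock in increasing address order so that
// transfer(a, b) and transfer(b, a) racing each other cannot deadlock.
func lockTwo(x, y *account) {
	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
		x.mu.Lock()
		y.mu.Lock()
	} else {
		y.mu.Lock()
		x.mu.Lock()
	}
}

func transfer(from, to *account, amount int64) {
	lockTwo(from, to)
	defer from.mu.Unlock()
	defer to.mu.Unlock()
	from.balance -= amount
	to.balance += amount
}

func main() {
	a, b := &account{balance: 100}, &account{}
	var wg sync.WaitGroup
	for i := 0; i < 1000; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if i%2 == 0 {
				transfer(a, b, 1) // opposite argument orders,
			} else {
				transfer(b, a, 1) // same lock order.
			}
		}(i)
	}
	wg.Wait()
	fmt.Println(a.balance, b.balance) // 100 0, and no deadlock.
}

Back in pipe_util.go, Write below is the mirror image of Read above: the same closure-based writeOps drive Pipe.write, and a successful write wakes any blocked readers with EventIn.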
+func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return src.NumBytes() + }, + limit: func(l int64) { + src = src.TakeFirst64(l) + }, + write: func(view *buffer.View) (int64, error) { + n, err := src.CopyInTo(ctx, view) + src = src.DropFirst64(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// ReadFrom reads from r to the Pipe. +func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromReader(r, count) + count -= n + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask { + return p.rwReadiness() & mask +} + +// Ioctl implements ioctls on the Pipe. +func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case linux.FIONREAD: + v := p.queued() + if v > math.MaxInt32 { + v = math.MaxInt32 // Silently truncate. + } + // Copy result to userspace. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} + +// waitFor blocks until a new reader or writer of the underlying pipe is +// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this +// function will block for either readers or writers, depending on where +// 'wakeupChan' points. +// +// mu must be held by the caller. waitFor returns with mu held, but it will +// drop mu before blocking for any reader/writers. +func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { + // Ideally this function would simply use a condition variable. However, the + // wait needs to be interruptible via 'sleeper', so we must synchronize via a + // channel. The synchronization below relies on the fact that closing a + // channel unblocks all receives on the channel. + + // Does an appropriate wakeup channel already exist? If not, create a new + // one. This is all done under mu to avoid races. + if *wakeupChan == nil { + *wakeupChan = make(chan struct{}) + } + + // Grab a local reference to the wakeup channel since it may disappear as + // soon as we drop mu. + wakeup := *wakeupChan + + // Drop the lock and prepare to sleep. + mu.Unlock() + cancel := sleeper.SleepStart() + + // Wait for either a new reader/writer to be signalled via 'wakeup', or + // for the sleep to be cancelled. + select { + case <-wakeup: + sleeper.SleepFinish(true) + case <-cancel: + sleeper.SleepFinish(false) + } + + // Take the lock and check if we were woken. If we were woken and + // interrupted, the former takes priority. + mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the pipe to be opened; see waitFor. +// +// Precondition: the mutex protecting wakeupChan must be held.
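For reference, the FIONREAD branch of Ioctl above is what a sandboxed application hits when it asks how many bytes are queued in a pipe. A small guest-side illustration using golang.org/x/sys/unix (an ordinary Linux program; nothing pipe-internal is assumed beyond the ioctl contract):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	var fds [2]int
	if err := unix.Pipe(fds[:]); err != nil {
		panic(err)
	}
	defer unix.Close(fds[0])
	defer unix.Close(fds[1])

	if _, err := unix.Write(fds[1], []byte("hello")); err != nil {
		panic(err)
	}

	// FIONREAD reports the number of bytes currently queued for reading; under
	// gVisor the request is ultimately answered by Pipe.Ioctl above.
	queued, err := unix.IoctlGetInt(fds[0], unix.FIONREAD)
	if err != nil {
		panic(err)
	}
	fmt.Println("queued bytes:", queued) // 5
}

newHandleLocked below is the other half of the waitFor rendezvous: opening one end closes (and clears) the corresponding wakeup channel, which releases every opener blocked in waitFor at once.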
+func newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go new file mode 100644 index 000000000..7724b4452 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/waiter" +) + +// Reader satisfies the fs.FileOperations interface for read-only pipes. +// Reader should be used with !fs.FileFlags.Write to reject writes. +// +// +stateify savable +type Reader struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. +func (r *Reader) Release() { + r.Pipe.rClose() + + // Wake up writers. + r.Pipe.Notify(waiter.EventOut) +} + +// Readiness returns the ready events in the underlying pipe. +func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask { + return r.Pipe.rReadiness() & mask +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go new file mode 100644 index 000000000..b2b5691ee --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -0,0 +1,67 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "io" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ReaderWriter satisfies the FileOperations interface and services both +// read and write requests. This should only be used directly for named pipes. +// pipe(2) and pipe2(2) only support unidirectional pipes and should use +// either pipe.Reader or pipe.Writer. +// +// +stateify savable +type ReaderWriter struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + *Pipe +} + +// Read implements fs.FileOperations.Read. +func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return rw.Pipe.Read(ctx, dst) +} + +// WriteTo implements fs.FileOperations.WriteTo. 
+func (rw *ReaderWriter) WriteTo(ctx context.Context, _ *fs.File, w io.Writer, count int64, dup bool) (int64, error) { + return rw.Pipe.WriteTo(ctx, w, count, dup) +} + +// Write implements fs.FileOperations.Write. +func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return rw.Pipe.Write(ctx, src) +} + +// ReadFrom implements fs.FileOperations.ReadFrom. +func (rw *ReaderWriter) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { + return rw.Pipe.ReadFrom(ctx, r, count) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return rw.Pipe.Ioctl(ctx, io, args) +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go new file mode 100644 index 000000000..45d4c5fc1 --- /dev/null +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -0,0 +1,468 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/buffer" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains types enabling the pipe package to be used with the vfs +// package. + +// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should +// not be copied. +type VFSPipe struct { + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // pipe is the underlying pipe. + pipe Pipe + + // Channels for synchronizing the creation of new readers and writers + // of this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on + // save, and either automatically restart (via ERESTARTSYS) or return + // EINTR on resume. On restarts via ERESTARTSYS, the appropriate + // channel will be recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewVFSPipe returns an initialized VFSPipe. +func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { + var vp VFSPipe + initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes) + return &vp +} + +// ReaderWriterPair returns read-only and write-only FDs for vp. +// +// Preconditions: statusFlags should not contain an open access mode. +func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + // Connected pipes share the same locks. + locks := &vfs.FileLocks{} + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) +} + +// Open opens the pipe represented by vp.
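As a usage sketch for the two construction paths above: an anonymous pipe (pipe(2)/pipe2(2) in VFS2) would build a VFSPipe and hand both ends back through ReaderWriterPair, while named FIFOs go through Open below. Acquiring the pipefs mount and dentry is elided and the helper name is hypothetical; this is only the shape of a caller, not the actual syscall implementation.

package pipeexample

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// newPipeFDs sketches what a pipe2(2) path might do with the VFSPipe API.
// mnt and d must come from the pipefs filesystem, which is not shown here.
func newPipeFDs(mnt *vfs.Mount, d *vfs.Dentry, nonBlock bool) (*vfs.FileDescription, *vfs.FileDescription) {
	var statusFlags uint32
	if nonBlock {
		statusFlags = linux.O_NONBLOCK
	}
	// 64 KiB default capacity, page-sized atomic writes, not a named FIFO.
	vp := pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
	return vp.ReaderWriterPair(mnt, d, statusFlags)
}

Named FIFOs instead reach Open below, which implements the fifo(7) blocking rules quoted in its body.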
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { + vp.mu.Lock() + defer vp.mu.Unlock() + + readable := vfs.MayReadFileWithOpenFlags(statusFlags) + writable := vfs.MayWriteFileWithOpenFlags(statusFlags) + if !readable && !writable { + return nil, syserror.EINVAL + } + + fd := vp.newFD(mnt, vfsd, statusFlags, locks) + + // Named pipes have special blocking semantics during open: + // + // "Normally, opening the FIFO blocks until the other end is opened also. A + // process can open a FIFO in nonblocking mode. In this case, opening for + // read-only will succeed even if no-one has opened on the write side yet, + // opening for write-only will fail with ENXIO (no such device or address) + // unless the other end has already been opened. Under Linux, opening a + // FIFO for read and write will succeed both in blocking and nonblocking + // mode. POSIX leaves this behavior undefined. This can be used to open a + // FIFO for writing while there are no readers available." - fifo(7) + switch { + case readable && writable: + // Pipes opened for read-write always succeed without blocking. + newHandleLocked(&vp.rWakeup) + newHandleLocked(&vp.wWakeup) + + case readable: + newHandleLocked(&vp.rWakeup) + // If this pipe is being opened as blocking and there's no + // writer, we have to wait for a writer to open the other end. + if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + fd.DecRef() + return nil, syserror.EINTR + } + + case writable: + newHandleLocked(&vp.wWakeup) + + if vp.pipe.isNamed && !vp.pipe.HasReaders() { + // Non-blocking, write-only opens fail with ENXIO when the read + // side isn't open yet. + if statusFlags&linux.O_NONBLOCK != 0 { + fd.DecRef() + return nil, syserror.ENXIO + } + // Wait for a reader to open the other end. + if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + fd.DecRef() + return nil, syserror.EINTR + } + } + + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return fd, nil +} + +// Preconditions: vp.mu must be held. +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { + fd := &VFSPipeFD{ + pipe: &vp.pipe, + } + fd.LockFD.Init(locks) + fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }) + + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + vp.pipe.rOpen() + vp.pipe.wOpen() + case fd.vfsfd.IsReadable(): + vp.pipe.rOpen() + case fd.vfsfd.IsWritable(): + vp.pipe.wOpen() + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return &fd.vfsfd +} + +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements +// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to +// other FileDescriptions for splice(2) and tee(2). +type VFSPipeFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + + pipe *Pipe +} + +// Release implements vfs.FileDescriptionImpl.Release. 
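+//
+// Closing the write side wakes readers with EventIn|EventHUp, so a blocked
+// read observes end-of-file; closing the read side wakes writers with
+// EventOut, and a subsequent write fails with EPIPE (raising SIGPIPE). The
+// host-Go analogy, for illustration only:
+//
+//	r, w, _ := os.Pipe()
+//	buf := make([]byte, 4)
+//	w.Close()             // analogous to wClose + Notify(EventIn | EventHUp)
+//	n, err := r.Read(buf) // n == 0, err == io.EOF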
+func (fd *VFSPipeFD) Release() { + var event waiter.EventMask + if fd.vfsfd.IsReadable() { + fd.pipe.rClose() + event |= waiter.EventOut + } + if fd.vfsfd.IsWritable() { + fd.pipe.wClose() + event |= waiter.EventIn | waiter.EventHUp + } + if event == 0 { + panic("invalid pipe flags: must be readable, writable, or both") + } + + fd.pipe.Notify(event) +} + +// Readiness implements waiter.Waitable.Readiness. +func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + return fd.pipe.rwReadiness() + case fd.vfsfd.IsReadable(): + return fd.pipe.rReadiness() + case fd.vfsfd.IsWritable(): + return fd.pipe.wReadiness() + default: + panic("pipe FD is neither readable nor writable") + } +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.pipe.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) { + fd.pipe.EventUnregister(e) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return fd.pipe.Read(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return fd.pipe.Write(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.pipe.Ioctl(ctx, uio, args) +} + +// PipeSize implements fcntl(F_GETPIPE_SZ). +func (fd *VFSPipeFD) PipeSize() int64 { + // Inline Pipe.FifoSize() rather than calling it with nil Context and + // fs.File and ignoring the returned error (which is always nil). + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + return fd.pipe.max +} + +// SetPipeSize implements fcntl(F_SETPIPE_SZ). +func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { + return fd.pipe.SetFifoSize(size) +} + +// IOSequence returns a useremm.IOSequence that reads up to count bytes from, +// or writes up to count bytes to, fd. +func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { + return usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(dst)) + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return int64(len(dst)) + }, + limit: func(l int64) { + dst = dst[:l] + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadAt(dst, 0) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// CopyOut implements usermem.IO.CopyOut. 
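+//
+// Like CopyIn above, CopyOut treats the pipe itself as the "address space":
+// bytes copied out land in the pipe's buffer rather than in task memory.
+// This is what allows a VFSPipeFD to be handed to another FileDescription as
+// the usermem.IO side of splice(2)/tee(2). Roughly (fileFD and opts are
+// illustrative):
+//
+//	seq := pipeFD.IOSequence(count)       // IO == pipeFD, addrs == [0, count)
+//	n, err := fileFD.Read(ctx, seq, opts) // file bytes enter the pipe via CopyOut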
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(src)) + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return int64(len(src)) + }, + limit: func(l int64) { + src = src[:l] + }, + write: func(view *buffer.View) (int64, error) { + view.Append(src) + return int64(len(src)), nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + origCount := toZero + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return toZero + }, + limit: func(l int64) { + toZero = l + }, + write: func(view *buffer.View) (int64, error) { + view.Grow(view.Size()+toZero, true /* zero */) + return toZero, nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToSafememWriter(dst, uint64(count)) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromSafememReader(src, uint64(count)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + // How did a pipe get passed as the virtual address space to futex(2)? + panic("VFSPipeFD.SwapUint32 called unexpectedly") +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") +} + +// LoadUint32 implements usermem.IO.LoadUint32. +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.LoadUint32 called unexpectedly") +} + +// Splice reads up to count bytes from src and writes them to dst. It returns +// the number of bytes moved. +// +// Preconditions: count > 0. 
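+//
+// Splice and Tee handle the pipe-to-pipe case, where both ends are
+// VFSPipeFDs: spliceOrTee locks both pipes via lockTwoPipes (in a consistent
+// order, so concurrent splices in opposite directions cannot deadlock) and
+// copies directly between the two buffers. A syscall-layer caller might
+// dispatch to it roughly as:
+//
+//	dstPipe, dstOK := dstFD.Impl().(*VFSPipeFD)
+//	srcPipe, srcOK := srcFD.Impl().(*VFSPipeFD)
+//	if dstOK && srcOK {
+//		return Splice(ctx, dstPipe, srcPipe, count)
+//	}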
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */) +} + +// Tee reads up to count bytes from src and writes them to dst, without +// removing the read bytes from src. It returns the number of bytes copied. +// +// Preconditions: count > 0. +func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */) +} + +// Preconditions: count > 0. +func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { + if dst.pipe == src.pipe { + return 0, syserror.EINVAL + } + + lockTwoPipes(dst.pipe, src.pipe) + defer dst.pipe.mu.Unlock() + defer src.pipe.mu.Unlock() + + n, err := dst.pipe.writeLocked(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(dstView *buffer.View) (int64, error) { + return src.pipe.readLocked(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(srcView *buffer.View) (int64, error) { + n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) + if n > 0 && removeFromSrc { + srcView.TrimFront(int64(n)) + } + return int64(n), err + }, + }) + }, + }) + if n > 0 { + dst.pipe.Notify(waiter.EventIn) + src.pipe.Notify(waiter.EventOut) + } + return n, err +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go new file mode 100644 index 000000000..5bc6aa931 --- /dev/null +++ b/pkg/sentry/kernel/pipe/writer.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/waiter" +) + +// Writer satisfies the fs.FileOperations interface for write-only pipes. +// Writer should be used with !fs.FileFlags.Read to reject reads. +// +// +stateify savable +type Writer struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. +func (w *Writer) Release() { + w.Pipe.wClose() + + // Wake up readers. + w.Pipe.Notify(waiter.EventHUp) +} + +// Readiness returns the ready events in the underlying pipe. 
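+//
+// For the write end, "ready" means there is buffer space (EventOut); once all
+// readers are gone, the pipe is also flagged as errored, which poll(2)
+// surfaces as POLLERR. A hypothetical poller using this mask:
+//
+//	ready := w.Readiness(waiter.EventOut | waiter.EventErr)
+//	if ready&waiter.EventErr != 0 {
+//		// No readers remain; a subsequent write would fail with EPIPE.
+//	}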
+func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask { + return w.Pipe.wReadiness() & mask +} diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go new file mode 100644 index 000000000..2e861a5a8 --- /dev/null +++ b/pkg/sentry/kernel/posixtimer.go @@ -0,0 +1,308 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" +) + +// IntervalTimer represents a POSIX interval timer as described by +// timer_create(2). +// +// +stateify savable +type IntervalTimer struct { + timer *ktime.Timer + + // If target is not nil, it receives signo from timer expirations. If group + // is true, these signals are thread-group-directed. These fields are + // immutable. + target *Task + signo linux.Signal + id linux.TimerID + sigval uint64 + group bool + + // If sigpending is true, a signal to target is already queued, and timer + // expirations should increment overrunCur instead of sending another + // signal. sigpending is protected by target's signal mutex. (If target is + // nil, the timer will never send signals, so sigpending will be unused.) + sigpending bool + + // If sigorphan is true, timer's setting has been changed since sigpending + // last became true, such that overruns should no longer be counted in the + // pending signals si_overrun. sigorphan is protected by target's signal + // mutex. + sigorphan bool + + // overrunCur is the number of overruns that have occurred since the last + // time a signal was sent. overrunCur is protected by target's signal + // mutex. + overrunCur uint64 + + // Consider the last signal sent by this timer that has been dequeued. + // overrunLast is the number of overruns that occurred between when this + // signal was sent and when it was dequeued. Equivalently, overrunLast was + // the value of overrunCur when this signal was dequeued. overrunLast is + // protected by target's signal mutex. + overrunLast uint64 +} + +// DestroyTimer releases it's resources. +func (it *IntervalTimer) DestroyTimer() { + it.timer.Destroy() + it.timerSettingChanged() + // A destroyed IntervalTimer is still potentially reachable via a + // pendingSignal; nil out timer so that it won't be saved. + it.timer = nil +} + +func (it *IntervalTimer) timerSettingChanged() { + if it.target == nil { + return + } + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + it.sigorphan = true + it.overrunCur = 0 + it.overrunLast = 0 +} + +// PauseTimer pauses the associated Timer. +func (it *IntervalTimer) PauseTimer() { + it.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. 
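+//
+// Pause/Resume bracket windows in which expirations must not be delivered,
+// e.g. while the sentry is saving state. Expirations that occur while a
+// previous signal is still queued are not lost; they accumulate in overrunCur
+// and are reported once that signal is dequeued. A hypothetical sequence:
+//
+//	it.PauseTimer()
+//	// ... checkpoint or otherwise quiesce; nothing fires here ...
+//	it.ResumeTimer()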
+func (it *IntervalTimer) ResumeTimer() { + it.timer.Resume() +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) updateDequeuedSignalLocked(si *arch.SignalInfo) { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunLast = it.overrunCur + it.overrunCur = 0 + si.SetOverrun(saturateI32FromU64(it.overrunLast)) +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) signalRejectedLocked() { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunCur++ +} + +// Notify implements ktime.TimerListener.Notify. +func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + if it.target == nil { + return ktime.Setting{}, false + } + + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + + if it.sigpending { + it.overrunCur += exp + return ktime.Setting{}, false + } + + // sigpending must be set before sendSignalTimerLocked() so that it can be + // unset if the signal is discarded (in which case sendSignalTimerLocked() + // will return nil). + it.sigpending = true + it.sigorphan = false + it.overrunCur += exp - 1 + si := &arch.SignalInfo{ + Signo: int32(it.signo), + Code: arch.SignalInfoTimer, + } + si.SetTimerID(it.id) + si.SetSigval(it.sigval) + // si_overrun is set when the signal is dequeued. + if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { + it.signalRejectedLocked() + } + + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. Users of Timer should call +// DestroyTimer instead. +func (it *IntervalTimer) Destroy() { +} + +// IntervalTimerCreate implements timer_create(2). +func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + + // Allocate a timer ID. + var id linux.TimerID + end := t.tg.nextTimerID + for { + id = t.tg.nextTimerID + _, ok := t.tg.timers[id] + t.tg.nextTimerID++ + if t.tg.nextTimerID < 0 { + t.tg.nextTimerID = 0 + } + if !ok { + break + } + if t.tg.nextTimerID == end { + return 0, syserror.EAGAIN + } + } + + // "The implementation of the default case where evp [sic] is NULL is + // handled inside glibc, which invokes the underlying system call with a + // suitably populated sigevent structure." - timer_create(2). This is + // misleading; the timer_create syscall also handles a NULL sevp as + // described by the man page + // (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). This + // must be handled here instead of the syscall wrapper since sigval is the + // timer ID, which isn't available until we allocate it in this function. + if sigev == nil { + sigev = &linux.Sigevent{ + Signo: int32(linux.SIGALRM), + Notify: linux.SIGEV_SIGNAL, + Value: uint64(id), + } + } + + // Construct the timer. + it := &IntervalTimer{ + id: id, + sigval: sigev.Value, + } + switch sigev.Notify { + case linux.SIGEV_NONE: + // leave it.target = nil + case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD: + // POSIX SIGEV_THREAD semantics are implemented in userspace by libc; + // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See + // Linux's kernel/time/posix-timers.c:good_sigevent().) 
+ it.target = t.tg.leader + it.group = true + case linux.SIGEV_THREAD_ID: + t.tg.pidns.owner.mu.RLock() + target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] + t.tg.pidns.owner.mu.RUnlock() + if !ok || target.tg != t.tg { + return 0, syserror.EINVAL + } + it.target = target + default: + return 0, syserror.EINVAL + } + if sigev.Notify != linux.SIGEV_NONE { + it.signo = linux.Signal(sigev.Signo) + if !it.signo.IsValid() { + return 0, syserror.EINVAL + } + } + it.timer = ktime.NewTimer(c, it) + + t.tg.timers[id] = it + return id, nil +} + +// IntervalTimerDelete implements timer_delete(2). +func (t *Task) IntervalTimerDelete(id linux.TimerID) error { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return syserror.EINVAL + } + delete(t.tg.timers, id) + it.DestroyTimer() + return nil +} + +// IntervalTimerSettime implements timer_settime(2). +func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) + if err != nil { + return linux.Itimerspec{}, err + } + tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged) + its = ktime.ItimerspecFromSetting(tm, oldS) + return its, nil +} + +// IntervalTimerGettime implements timer_gettime(2). +func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + tm, s := it.timer.Get() + its := ktime.ItimerspecFromSetting(tm, s) + return its, nil +} + +// IntervalTimerGetoverrun implements timer_getoverrun(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return 0, syserror.EINVAL + } + // By timer_create(2) invariant, either it.target == nil (in which case + // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact + // that t is executing timer_getoverrun(2) means that t.tg can't be + // completing execve, so t.tg.signalHandlers can't be changing, allowing us + // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers: + // Sanitize overrun handling"). + return saturateI32FromU64(it.overrunLast), nil +} + +func saturateI32FromU64(x uint64) int32 { + if x > math.MaxInt32 { + return math.MaxInt32 + } + return int32(x) +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..e23e796ef --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1119 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +// +// +stateify savable +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. + TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +// +// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a +// racing setuid(2) may change traceability). This may pose a risk when a task +// changes from traceable to not traceable. This is only problematic across +// execve, where privileges may increase. +// +// We currently do not implement privileged executables (set-user/group-ID bits +// and file capabilities), so that case is not reachable. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. 
But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. + // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) + // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + var targetMM *mm.MemoryManager + target.WithMuLocked(func(t *Task) { + targetMM = t.MemoryManager() + }) + if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable { + return false + } + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. 
interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +// +// +stateify savable +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool + + // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so + // ptraceFreeze should fail. + listen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killedLocked() will no longer return true. + // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still + // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be + // changed in the future; SIGKILL is meant to always immediately kill tasks + // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + // This is unconditional in ptrace_stop(). + t.tg.signalHandlers.mu.Lock() + t.trapStopPending = false + t.tg.signalHandlers.mu.Unlock() + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. +func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + if s.listen { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. 
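+	//
+	// This is the tail of the tracer-side protocol used by Task.Ptrace below:
+	// ptraceFreeze pins the stop, the request body operates on the stopped
+	// tracee, and ptraceUnfreeze (or ptraceUnstop, for resuming requests)
+	// releases it again. Roughly:
+	//
+	//	if !target.ptraceFreeze() {
+	//		return syserror.ESRCH // not in a ptrace-stop
+	//	}
+	//	defer target.ptraceUnfreeze()
+	//	// ... PTRACE_PEEKDATA, PTRACE_GETREGSET, etc. ...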
+ t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.ptraceUnfreezeLocked() +} + +// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be +// locked. +func (t *Task) ptraceUnfreezeLocked() { + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and +// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. +func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. + if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + if seize { + if err := target.ptraceSetOptionsLocked(opts); err != nil { + return syserror.EIO + } + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.ptraceSeized = seize + target.tg.signalHandlers.mu.Lock() + // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." 
- + // ptrace(2) + if !seize { + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + } + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. + if target.stop == (*groupStop)(nil) { + target.trapStopPending = true + target.endInternalStopLocked() + // TODO(jamieliu): Linux blocks ptrace_attach() until the task has + // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) forgetTracerLocked() { + t.ptraceSeized = false + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If + // it wasn't, it will be reset via t.groupStopPending after the following. + t.trapStopPending = false + // If t's thread group is in a group stop and t is eligible to participate, + // make it do so. This is essentially the reverse of the special case in + // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling + // of restart from group-stop is currently buggy, but the "as planned" + // behavior is to leave tracee stopped and waiting for SIGCONT." - + // ptrace(2)) + if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { + t.groupStopPending = true + // t already participated in the group stop when it unset + // groupStopPending. 
+ t.groupStopAcknowledged = true + t.interrupt() + } + if _, ok := t.stop.(*ptraceStop); ok { + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + t.tg.groupStopDequeued = true + // This is unconditional in ptrace_stop(). + t.trapStopPending = false + // Can't lock the TaskSet mutex while holding a signal mutex. + t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. +func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. 
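+	//
+	// As a rough guide to how the three kinds map onto guest syscalls and the
+	// ptrace events they can generate:
+	//
+	//	clone(2), exit signal != SIGCHLD -> ptraceCloneKindClone (PTRACE_EVENT_CLONE)
+	//	fork(2) or clone(2) with SIGCHLD -> ptraceCloneKindFork  (PTRACE_EVENT_FORK)
+	//	vfork(2) or clone(CLONE_VFORK)   -> ptraceCloneKindVfork (PTRACE_EVENT_VFORK)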
+ ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. + ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "The "seized" behavior ... is inherited by children that are + // automatically attached using PTRACE_O_TRACEFORK, + // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) + child.ptraceSeized = t.ptraceSeized + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." - ptrace(2) + child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // "PTRACE_SEIZE: ... Automatically attached children stop with + // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead + // of having SIGSTOP signal delivered to them." - ptrace(2) + if child.ptraceSeized { + child.trapStopPending = true + } else { + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }, nil) + } + // The child will self-interrupt() when its task goroutine starts + // running, so we don't have to. + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. 
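+//
+// From the tracer's side this corresponds to PTRACE_O_TRACEVFORKDONE.
+// Roughly, using the x/sys/unix wrappers (traceePid is illustrative):
+//
+//	unix.PtraceSetOptions(traceePid, unix.PTRACE_O_TRACEVFORK|unix.PTRACE_O_TRACEVFORKDONE)
+//	// The tracer then observes PTRACE_EVENT_VFORK when the child is created
+//	// and PTRACE_EVENT_VFORK_DONE once the vforking parent resumes.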
+func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. (If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. + if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + if t.ptraceSeized { + return + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... + // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). 
The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceInterrupt(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + if !target.ptraceSeized { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.killedLocked() || target.exitState >= TaskExitInitiated { + return nil + } + target.trapStopPending = true + if s, ok := target.stop.(*ptraceStop); ok && s.listen { + target.endInternalStopLocked() + } + target.interrupt() + return nil +} + +// Preconditions: The TaskSet mutex must be locked for writing. t must have a +// tracer. +func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { + const valid = uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) + if opts&^valid != 0 { + return syserror.EINVAL + } + t.ptraceOpts = ptraceOptions{ + ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, + SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. + if req == linux.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already + // a tracee. + if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { + seize := req == linux.PTRACE_SEIZE + if seize && addr != 0 { + return syserror.EIO + } + return t.ptraceAttach(target, seize, uintptr(data)) + } + // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, + // but does not require that it is ptrace-stopped. + if req == linux.PTRACE_KILL { + return t.ptraceKill(target) + } + if req == linux.PTRACE_INTERRUPT { + return t.ptraceInterrupt(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. + t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." 
- + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. + // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the + // target. + switch req { + case linux.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_LISTEN: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !target.ptraceSeized { + return syserror.EIO + } + if target.ptraceSiginfo == nil { + return syserror.EIO + } + if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.trapNotifyPending { + target.endInternalStopLocked() + } else { + target.stop.(*ptraceStop).listen = true + target.ptraceUnfreezeLocked() + } + return nil + } + + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." - ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case linux.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... 
+ // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." - ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + + // Update iovecs to represent the range of the written register set. + end, ok := ar.Start.AddLength(uint64(n)) + if !ok { + panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) + } + ar.End = end + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case linux.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case linux.PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.SignalMask()) + return err + + case linux.PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case linux.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + return target.ptraceSetOptionsLocked(uintptr(data)) + + case linux.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + + default: + return t.ptraceArch(target, req, addr, data) + } +} diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go new file mode 100644 index 000000000..cef1276ec --- /dev/null +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -0,0 +1,89 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + switch req { + case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case linux.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + default: + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go new file mode 100644 index 000000000..d971b96b3 --- /dev/null +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + return syserror.EIO +} diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go new file mode 100644 index 000000000..18416643b --- /dev/null +++ b/pkg/sentry/kernel/rseq.go @@ -0,0 +1,393 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Restartable sequences. +// +// We support two different APIs for restartable sequences. +// +// 1. The upstream interface added in v4.18. +// 2. The interface described in https://lwn.net/Articles/650333/. +// +// Throughout this file and other parts of the kernel, the latter is referred +// to as "old rseq". This interface was never merged upstream, but is supported +// for a limited set of applications that use it regardless. + +// OldRSeqCriticalRegion describes an old rseq critical region. +// +// +stateify savable +type OldRSeqCriticalRegion struct { + // When a task in this thread group has its CPU preempted (as defined by + // platform.ErrContextCPUPreempted) or has a signal delivered to an + // application handler while its instruction pointer is in CriticalSection, + // set the instruction pointer to Restart and application register r10 (on + // amd64) to the former instruction pointer. + CriticalSection usermem.AddrRange + Restart usermem.Addr +} + +// RSeqAvailable returns true if t supports (old and new) restartable sequences. +func (t *Task) RSeqAvailable() bool { + return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() +} + +// SetRSeq registers addr as this thread's rseq structure. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr != 0 { + if t.rseqAddr != addr { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EINVAL + } + return syserror.EBUSY + } + + // rseq must be aligned and correctly sized. + if addr&(linux.AlignOfRSeq-1) != 0 { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { + return syserror.EFAULT + } + + t.rseqAddr = addr + t.rseqSignature = signature + + // Initialize the CPUID. + // + // Linux implicitly does this on return from userspace, where failure + // would cause SIGSEGV. + if err := t.rseqUpdateCPU(); err != nil { + t.rseqAddr = 0 + t.rseqSignature = 0 + + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return syserror.EFAULT + } + + return nil +} + +// ClearRSeq unregisters addr as this thread's rseq structure. +// +// Preconditions: The caller must be running on the task goroutine. 
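+//
+// A rough sketch of the expected caller (the rseq(2) syscall path), assuming
+// Linux's unregistration convention and a linux.RSEQ_FLAG_UNREGISTER
+// constant: the same addr, length, and signature passed at registration must
+// be passed again to unregister, e.g.
+//
+//	if flags == linux.RSEQ_FLAG_UNREGISTER {
+//		return t.ClearRSeq(addr, length, signature)
+//	}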
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr == 0 { + return syserror.EINVAL + } + if t.rseqAddr != addr { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EPERM + } + + if err := t.rseqClearCPU(); err != nil { + return err + } + + t.rseqAddr = 0 + t.rseqSignature = 0 + + if t.oldRSeqCPUAddr == 0 { + // rseqCPU no longer needed. + t.rseqCPU = -1 + } + + return nil +} + +// OldRSeqCriticalRegion returns a copy of t's thread group's current +// old restartable sequence. +func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { + return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) +} + +// SetOldRSeqCriticalRegion replaces t's thread group's old restartable +// sequence. +// +// Preconditions: t.RSeqAvailable() == true. +func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { + // These checks are somewhat more lenient than in Linux, which (bizarrely) + // requires r.CriticalSection to be non-empty and r.Restart to be + // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 + // (which disables the critical region). + if r.CriticalSection.Start == 0 { + r.CriticalSection.End = 0 + r.Restart = 0 + t.tg.oldRSeqCritical.Store(&r) + return nil + } + if r.CriticalSection.Start >= r.CriticalSection.End { + return syserror.EINVAL + } + if r.CriticalSection.Contains(r.Restart) { + return syserror.EINVAL + } + // TODO(jamieliu): check that r.CriticalSection and r.Restart are in + // the application address range, for consistency with Linux. + t.tg.oldRSeqCritical.Store(&r) + return nil +} + +// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's +// CPU number. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) OldRSeqCPUAddr() usermem.Addr { + return t.oldRSeqCPUAddr +} + +// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with +// t's CPU number. +// +// Preconditions: t.RSeqAvailable() == true. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { + t.oldRSeqCPUAddr = addr + + // Check that addr is writable. + // + // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's + // unfortunate, but unlikely in a correct program. + if err := t.rseqUpdateCPU(); err != nil { + t.oldRSeqCPUAddr = 0 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT + } + return nil +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqUpdateCPU() error { + if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { + t.rseqCPU = -1 + return nil + } + + t.rseqCPU = int32(hostcpu.GetCPU()) + + // Update both CPUs, even if one fails. + rerr := t.rseqCopyOutCPU() + oerr := t.oldRSeqCopyOutCPU() + + if rerr != nil { + return rerr + } + return oerr +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) oldRSeqCopyOutCPU() error { + if t.oldRSeqCPUAddr == 0 { + return nil + } + + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. 
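+//
+// For reference, rseqAddr is assumed to point at a structure mirrored by
+// linux.RSeq, whose first two fields (the only ones updated below) are
+// roughly:
+//
+//	type RSeq struct {
+//		CPUIDStart uint32 // offset 0
+//		CPUID      uint32 // offset 4
+//		// ... remaining fields omitted
+//	}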
+func (t *Task) rseqCopyOutCPU() error { + if t.rseqAddr == 0 { + return nil + } + + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqClearCPU() error { + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) + return err +} + +// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. +// +// This is a bit complex since both the RSeq and RSeqCriticalSection structs +// are stored in userspace. So we must: +// +// 1. Copy in the address of RSeqCriticalSection from RSeq. +// 2. Copy in RSeqCriticalSection itself. +// 3. Validate critical section struct version, address range, abort address. +// 4. Validate the abort signature (4 bytes preceding abort IP match expected +// signature). +// 5. Clear address of RSeqCriticalSection from RSeq. +// 6. Finally, conditionally abort. +// +// See kernel/rseq.c:rseq_ip_fixup for reference. +// +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqAddrInterrupt() { + if t.rseqAddr == 0 { + return + } + + critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) + if !ok { + // SetRSeq should validate this. + panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) + } + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. 
+ t.Debugf("Only 64-bit rseq supported.") + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + buf := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { + t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf)) + if critAddr == 0 { + return + } + + var cs linux.RSeqCriticalSection + if _, err := cs.CopyIn(t, critAddr); err != nil { + t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + if cs.Version != 0 { + t.Debugf("Unknown version in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + start := usermem.Addr(cs.Start) + critRange, ok := start.ToRange(cs.PostCommitOffset) + if !ok { + t.Debugf("Invalid start and offset in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + abort := usermem.Addr(cs.Abort) + if critRange.Contains(abort) { + t.Debugf("Abort in critical section in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Verify signature. + sigAddr := abort - linux.SizeOfRSeqSignature + + buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) + if _, err := t.CopyInBytes(sigAddr, buf); err != nil { + t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + sig := usermem.ByteOrder.Uint32(buf) + if sig != t.rseqSignature { + t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Clear the critical section address. + // + // NOTE(b/143949567): We don't support any rseq flags, so we always + // restart if we are in the critical section, and thus *always* clear + // critAddrAddr. + if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Finally we can actually decide whether or not to restart. + if !critRange.Contains(usermem.Addr(t.Arch().IP())) { + return + } + + t.Arch().SetIP(uintptr(cs.Abort)) +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) oldRSeqInterrupt() { + r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) + if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) + t.Arch().SetIP(uintptr(r.Restart)) + t.Arch().SetOldRSeqInterruptedIP(ip) + } +} + +// Preconditions: The caller must be running on the task goroutine. 
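+// It is expected to be called when the task's CPU is preempted or a signal is
+// about to be delivered to an application handler (see the
+// OldRSeqCriticalRegion comment above); it checks both the new and the old
+// rseq APIs.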
+func (t *Task) rseqInterrupt() { + t.rseqAddrInterrupt() + t.oldRSeqInterrupt() +} diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD new file mode 100644 index 000000000..1b82e087b --- /dev/null +++ b/pkg/sentry/kernel/sched/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "sched", + srcs = [ + "cpuset.go", + "sched.go", + ], + visibility = ["//pkg/sentry:internal"], +) + +go_test( + name = "sched_test", + size = "small", + srcs = ["cpuset_test.go"], + library = ":sched", +) diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go new file mode 100644 index 000000000..c6c436690 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -0,0 +1,105 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import "math/bits" + +const ( + bitsPerByte = 8 + bytesPerLong = 8 // only for 64-bit architectures +) + +// CPUSet contains a bitmap to record CPU information. +// +// Note that this definition is only correct for little-endian architectures, +// since Linux's cpumask_t uses unsigned long. +type CPUSet []byte + +// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. +func CPUSetSize(num uint) uint { + // NOTE(b/68859821): Applications may expect that the size of a CPUSet in + // bytes is always a multiple of sizeof(unsigned long), since this is true + // in Linux. Thus we always round up. + bytes := (num + bitsPerByte - 1) / bitsPerByte + longs := (bytes + bytesPerLong - 1) / bytesPerLong + return longs * bytesPerLong +} + +// NewCPUSet returns a CPUSet for the given number of CPUs which initially +// contains no CPUs. +func NewCPUSet(num uint) CPUSet { + return CPUSet(make([]byte, CPUSetSize(num))) +} + +// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which +// are present in the set. +func NewFullCPUSet(num uint) CPUSet { + c := NewCPUSet(num) + var i uint + for ; i < num/bitsPerByte; i++ { + c[i] = 0xff + } + if rem := num % bitsPerByte; rem != 0 { + c[i] = (1 << rem) - 1 + } + return c +} + +// Size returns the size of 'c' in bytes. +func (c CPUSet) Size() uint { + return uint(len(c)) +} + +// NumCPUs returns how many cpus are set in the CPUSet. +func (c CPUSet) NumCPUs() uint { + var n int + for _, b := range c { + n += bits.OnesCount8(b) + } + return uint(n) +} + +// Copy returns a copy of the CPUSet. +func (c CPUSet) Copy() CPUSet { + return append(CPUSet(nil), c...) +} + +// Set sets the bit corresponding to cpu. +func (c *CPUSet) Set(cpu uint) { + (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) +} + +// ClearAbove clears bits corresponding to cpu and all higher cpus. 
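+//
+// For example, for a set built with NewFullCPUSet(8), ClearAbove(3) leaves
+// exactly CPUs 0-2 set.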
+func (c *CPUSet) ClearAbove(cpu uint) { + i := cpu / bitsPerByte + if i >= c.Size() { + return + } + (*c)[i] &^= 0xff << (cpu % bitsPerByte) + for i++; i < c.Size(); i++ { + (*c)[i] = 0 + } +} + +// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if +// it's set. +func (c CPUSet) ForEachCPU(fn func(uint)) { + for i := uint(0); i < c.Size()*bitsPerByte; i++ { + bit := uint(1) << (i & (bitsPerByte - 1)) + if uint(c[i/bitsPerByte])&bit == bit { + fn(i) + } + } +} diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go new file mode 100644 index 000000000..3af9f1197 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import ( + "testing" +) + +func TestNumCPUs(t *testing.T) { + for i := uint(0); i < 1024; i++ { + c := NewCPUSet(i) + for j := uint(0); j < i; j++ { + c.Set(j) + } + n := c.NumCPUs() + if n != i { + t.Errorf("got wrong number of cpus %d, want %d", n, i) + } + } +} + +func TestClearAbove(t *testing.T) { + const n = 1024 + c := NewFullCPUSet(n) + for i := uint(0); i < n; i++ { + cpu := n - i + c.ClearAbove(cpu) + if got := c.NumCPUs(); got != cpu { + t.Errorf("iteration %d: got %d cpus, wanted %d", i, got, cpu) + } + } +} diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go new file mode 100644 index 000000000..de18c9d02 --- /dev/null +++ b/pkg/sentry/kernel/sched/sched.go @@ -0,0 +1,16 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sched implements scheduler related features. +package sched diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go new file mode 100644 index 000000000..c38c5a40c --- /dev/null +++ b/pkg/sentry/kernel/seccomp.go @@ -0,0 +1,217 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
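+
+// This file implements seccomp-bpf filtering for tasks: filters installed via
+// AppendSyscallFilter are evaluated by checkSeccompSyscall before a filtered
+// system call executes, and the resulting SECCOMP_RET_* action is acted upon.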
+ +package kernel + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const maxSyscallFilterInstructions = 1 << 15 + +// seccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +type seccompData struct { + // nr is the system call number. + nr int32 + + // arch is an AUDIT_ARCH_* value indicating the system call convention. + arch uint32 + + // instructionPointer is the value of the instruction pointer at the time + // of the system call. + instructionPointer uint64 + + // args contains the first 6 system call arguments. + args [6]uint64 +} + +func (d *seccompData) asBPFInput() bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +} + +func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { + si := &arch.SignalInfo{ + Signo: int32(linux.SIGSYS), + Errno: errno, + Code: arch.SYS_SECCOMP, + } + si.SetCallAddr(uint64(ip)) + si.SetSyscall(sysno) + si.SetArch(t.SyscallTable().AuditNumber) + return si +} + +// checkSeccompSyscall applies the task's seccomp filters before the execution +// of syscall sysno at instruction pointer ip. (These parameters must be passed +// in because vsyscalls do not use the values in t.Arch().) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction { + result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) + action := result & linux.SECCOMP_RET_ACTION + switch action { + case linux.SECCOMP_RET_TRAP: + // "Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. ... The SECCOMP_RET_DATA + // portion of the return value will be passed as si_errno." - + // Documentation/prctl/seccomp_filter.txt + t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) + // "The return value register will contain an arch-dependent value." In + // practice, it's ~always the syscall number. + t.Arch().SetReturn(uintptr(sysno)) + + case linux.SECCOMP_RET_ERRNO: + // "Results in the lower 16-bits of the return value being passed to + // userland as the errno without executing the system call." + t.Arch().SetReturn(-uintptr(result.Data())) + + case linux.SECCOMP_RET_TRACE: + // "When returned, this value will cause the kernel to attempt to + // notify a ptrace()-based tracer prior to executing the system call. + // If there is no tracer present, -ENOSYS is returned to userland and + // the system call is not executed." + if !t.ptraceSeccomp(result.Data()) { + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return linux.SECCOMP_RET_ERRNO + } + + case linux.SECCOMP_RET_ALLOW: + // "Results in the system call being executed." + + case linux.SECCOMP_RET_KILL_THREAD: + // "Results in the task exiting immediately without executing the + // system call. The exit status of the task will be SIGSYS, not + // SIGKILL." 
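+ // Note that this case has no body: the kill itself is left to the
+ // caller, which acts on the returned SECCOMP_RET_KILL_THREAD action.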
+ + default: + // consistent with Linux + return linux.SECCOMP_RET_KILL_THREAD + } + return action +} + +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { + data := seccompData{ + nr: sysno, + arch: t.tc.st.AuditNumber, + instructionPointer: uint64(ip), + } + // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so + // we can't do any slicing tricks or even use copy/append here. + for i, arg := range args { + if i >= len(data.args) { + break + } + data.args[i] = arg.Uint64() + } + input := data.asBPFInput() + + ret := uint32(linux.SECCOMP_RET_ALLOW) + f := t.syscallFilters.Load() + if f == nil { + return ret + } + + // "Every filter successfully installed will be evaluated (in reverse + // order) for each system call the task makes." - kernel/seccomp.c + for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) + if err != nil { + t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) + thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) + } + // "If multiple filters exist, the return value for the evaluation of a + // given system call will always use the highest precedent value." - + // Documentation/prctl/seccomp_filter.txt + // + // (Note that this contradicts prctl(2): "If the filters permit prctl() + // calls, then additional filters can be added; they are run in order + // until the first non-allow result is seen." prctl(2) is incorrect.) + // + // "The ordering ensures that a min_t() over composed return values + // always selects the least permissive choice." - + // include/uapi/linux/seccomp.h + if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { + ret = thisRet + } + } + + return ret +} + +// AppendSyscallFilter adds BPF program p as a system call filter. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { + // While syscallFilters are an atomic.Value we must take the mutex to prevent + // our read-copy-update from happening while another task is syncing syscall + // filters to us, this keeps the filters in a consistent state. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + // Cap the combined length of all syscall filters (plus a penalty of 4 + // instructions per filter beyond the first) to maxSyscallFilterInstructions. + // This restriction is inherited from Linux. + totalLength := p.Length() + var newFilters []bpf.Program + + if sf := t.syscallFilters.Load(); sf != nil { + oldFilters := sf.([]bpf.Program) + for _, f := range oldFilters { + totalLength += f.Length() + 4 + } + newFilters = append(newFilters, oldFilters...) + } + + if totalLength > maxSyscallFilterInstructions { + return syserror.ENOMEM + } + + newFilters = append(newFilters, p) + t.syscallFilters.Store(newFilters) + + if syncAll { + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot != t { + var copiedFilters []bpf.Program + copiedFilters = append(copiedFilters, newFilters...) + ot.syscallFilters.Store(copiedFilters) + } + } + } + + return nil +} + +// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current +// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) +// and /proc/[pid]/status. 
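+//
+// Note that only SECCOMP_MODE_NONE and SECCOMP_MODE_FILTER are reported;
+// SECCOMP_MODE_STRICT is never returned here.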
+func (t *Task) SeccompMode() int { + f := t.syscallFilters.Load() + if f != nil && len(f.([]bpf.Program)) > 0 { + return linux.SECCOMP_MODE_FILTER + } + return linux.SECCOMP_MODE_NONE +} diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD new file mode 100644 index 000000000..65e5427c1 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -0,0 +1,49 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "semaphore", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*waiter", + "Linker": "*waiter", + }, +) + +go_library( + name = "semaphore", + srcs = [ + "semaphore.go", + "waiter_list.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sync", + "//pkg/syserror", + ], +) + +go_test( + name = "semaphore_test", + size = "small", + srcs = ["semaphore_test.go"], + library = ":semaphore", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go new file mode 100644 index 000000000..c00fa1138 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -0,0 +1,572 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package semaphore implements System V semaphores. +package semaphore + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + valueMax = 32767 // SEMVMX + + // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). + semaphoresMax = 32000 + + // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). + setsMax = 32000 + + // semaphoresTotalMax is "system-wide limit on the number of semaphores" + // (SEMMNS = SEMMNI*SEMMSL). + semaphoresTotalMax = 1024000000 +) + +// Registry maintains a set of semaphores that can be found by key or ID. +// +// +stateify savable +type Registry struct { + // userNS owning the ipc name this registry belongs to. Immutable. + userNS *auth.UserNamespace + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + semaphores map[int32]*Set + lastIDUsed int32 +} + +// Set represents a set of semaphores that can be operated atomically. +// +// +stateify savable +type Set struct { + // registry owning this sem set. Immutable. + registry *Registry + + // Id is a handle that identifies the set. 
+ ID int32 + + // key is an user provided key that can be shared between processes. + key int32 + + // creator is the user that created the set. Immutable. + creator fs.FileOwner + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + owner fs.FileOwner + perms fs.FilePermissions + opTime ktime.Time + changeTime ktime.Time + + // sems holds all semaphores in the set. The slice itself is immutable after + // it's been set, however each 'sem' object in the slice requires 'mu' lock. + sems []sem + + // dead is set to true when the set is removed and can't be reached anymore. + // All waiters must wake up and fail when set is dead. + dead bool +} + +// sem represents a single semaphore from a set. +// +// +stateify savable +type sem struct { + value int16 + waiters waiterList `state:"zerovalue"` + pid int32 +} + +// waiter represents a caller that is waiting for the semaphore value to +// become positive or zero. +// +// +stateify savable +type waiter struct { + waiterEntry + + // value represents how much resource the waiter needs to wake up. + value int16 + ch chan struct{} +} + +// NewRegistry creates a new semaphore set registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + semaphores: make(map[int32]*Set), + } +} + +// FindOrCreate searches for a semaphore set that matches 'key'. If not found, +// it may create a new one if requested. If private is true, key is ignored and +// a new set is always created. If create is false, it fails if a set cannot +// be found. If exclusive is true, it fails if a set with the same key already +// exists. +func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { + if nsems < 0 || nsems > semaphoresMax { + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + // Look up an existing semaphore. + if set := r.findByKey(key); set != nil { + set.mu.Lock() + defer set.mu.Unlock() + + // Check that caller can access semaphore set. + creds := auth.CredentialsFromContext(ctx) + if !set.checkPerms(creds, fs.PermsFromMode(mode)) { + return nil, syserror.EACCES + } + + // Validate parameters. + if nsems > int32(set.Size()) { + return nil, syserror.EINVAL + } + if create && exclusive { + return nil, syserror.EEXIST + } + return set, nil + } + + if !create { + // Semaphore not found and should not be created. + return nil, syserror.ENOENT + } + } + + // Zero is only valid if an existing set is found. + if nsems == 0 { + return nil, syserror.EINVAL + } + + // Apply system limits. + if len(r.semaphores) >= setsMax { + return nil, syserror.EINVAL + } + if r.totalSems() > int(semaphoresTotalMax-nsems) { + return nil, syserror.EINVAL + } + + // Finally create a new set. + owner := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newSet(ctx, key, owner, owner, perms, nsems) +} + +// RemoveID removes set with give 'id' from the registry and marks the set as +// dead. All waiters will be awakened and fail. +func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + set := r.semaphores[id] + if set == nil { + return syserror.EINVAL + } + + set.mu.Lock() + defer set.mu.Unlock() + + // "The effective user ID of the calling process must match the creator or + // owner of the semaphore set, or the caller must be privileged." 
+ if !set.checkCredentials(creds) && !set.checkCapability(creds) { + return syserror.EACCES + } + + delete(r.semaphores, set.ID) + set.destroy() + return nil +} + +func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { + set := &Set{ + registry: r, + key: key, + owner: owner, + creator: owner, + perms: perms, + changeTime: ktime.NowFromContext(ctx), + sems: make([]sem, nsems), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.semaphores[id] == nil { + r.lastIDUsed = id + r.semaphores[id] = set + set.ID = id + return set, nil + } + } + + log.Warningf("Semaphore map is full, they must be leaking") + return nil, syserror.ENOMEM +} + +// FindByID looks up a set given an ID. +func (r *Registry) FindByID(id int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + return r.semaphores[id] +} + +func (r *Registry) findByKey(key int32) *Set { + for _, v := range r.semaphores { + if v.key == key { + return v + } + } + return nil +} + +func (r *Registry) totalSems() int { + totalSems := 0 + for _, v := range r.semaphores { + totalSems += v.Size() + } + return totalSems +} + +func (s *Set) findSem(num int32) *sem { + if num < 0 || int(num) >= s.Size() { + return nil + } + return &s.sems[num] +} + +// Size returns the number of semaphores in the set. Size is immutable. +func (s *Set) Size() int { + return len(s.sems) +} + +// Change changes some fields from the set atomically. +func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { + s.mu.Lock() + defer s.mu.Unlock() + + // "The effective UID of the calling process must match the owner or creator + // of the semaphore set, or the caller must be privileged." + if !s.checkCredentials(creds) && !s.checkCapability(creds) { + return syserror.EACCES + } + + s.owner = owner + s.perms = perms + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// SetVal overrides a semaphore value, waking up waiters as needed. +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return syserror.ERANGE + } + + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. + sem.value = val + sem.pid = pid + s.changeTime = ktime.NowFromContext(ctx) + sem.wakeWaiters() + return nil +} + +// SetValAll overrides all semaphores values, waking up waiters as needed. It also +// sets semaphore's PID which was fixed in Linux 4.6. +// +// 'len(vals)' must be equal to 's.Size()'. +func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error { + if len(vals) != s.Size() { + panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) + } + + for _, val := range vals { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." 
+ if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + for i, val := range vals { + sem := &s.sems[i] + + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. + sem.value = int16(val) + sem.pid = pid + sem.wakeWaiters() + } + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// GetVal returns a semaphore value. +func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.value, nil +} + +// GetValAll returns value for all semaphores. +func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + vals := make([]uint16, s.Size()) + for i, sem := range s.sems { + vals[i] = uint16(sem.value) + } + return vals, nil +} + +// GetPID returns the PID set when performing operations in the semaphore. +func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.pid, nil +} + +// ExecuteOps attempts to execute a list of operations to the set. It only +// succeeds when all operations can be applied. No changes are made if it fails. +// +// On failure, it may return an error (retries are hopeless) or it may return +// a channel that can be waited on before attempting again. +func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // Did it race with a removal operation? + if s.dead { + return nil, 0, syserror.EIDRM + } + + // Validate the operations. + readOnly := true + for _, op := range ops { + if s.findSem(int32(op.SemNum)) == nil { + return nil, 0, syserror.EFBIG + } + if op.SemOp != 0 { + readOnly = false + } + } + + if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, syserror.EACCES + } + + ch, num, err := s.executeOps(ctx, ops, pid) + if err != nil { + return nil, 0, err + } + return ch, num, nil +} + +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) { + // Changes to semaphores go to this slice temporarily until they all succeed. + tmpVals := make([]int16, len(s.sems)) + for i := range s.sems { + tmpVals[i] = s.sems[i].value + } + + for _, op := range ops { + sem := &s.sems[op.SemNum] + if op.SemOp == 0 { + // Handle 'wait for zero' operation. + if tmpVals[op.SemNum] != 0 { + // Semaphore isn't 0, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + if op.SemOp < 0 { + // Handle 'wait' operation. + if -op.SemOp > valueMax { + return nil, 0, syserror.ERANGE + } + if -op.SemOp > tmpVals[op.SemNum] { + // Not enough resources, must wait. 
+ if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + // op.SemOp > 0: Handle 'signal' operation. + if tmpVals[op.SemNum] > valueMax-op.SemOp { + return nil, 0, syserror.ERANGE + } + } + + tmpVals[op.SemNum] += op.SemOp + } + } + + // All operations succeeded, apply them. + // TODO(gvisor.dev/issue/137): handle undo operations. + for i, v := range tmpVals { + s.sems[i].value = v + s.sems[i].wakeWaiters() + s.sems[i].pid = pid + } + s.opTime = ktime.NowFromContext(ctx) + return nil, 0, nil +} + +// AbortWait notifies that a waiter is giving up and will not wait on the +// channel anymore. +func (s *Set) AbortWait(num int32, ch chan struct{}) { + s.mu.Lock() + defer s.mu.Unlock() + + sem := &s.sems[num] + for w := sem.waiters.Front(); w != nil; w = w.Next() { + if w.ch == ch { + sem.waiters.Remove(w) + return + } + } + // Waiter may not be found in case it raced with wakeWaiters(). +} + +func (s *Set) checkCredentials(creds *auth.Credentials) bool { + return s.owner.UID == creds.EffectiveKUID || + s.owner.GID == creds.EffectiveKGID || + s.creator.UID == creds.EffectiveKUID || + s.creator.GID == creds.EffectiveKGID +} + +func (s *Set) checkCapability(creds *auth.Credentials) bool { + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() +} + +func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { + // Are we owner, or in group, or other? + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + return s.checkCapability(creds) +} + +// destroy destroys the set. Caller must hold 's.mu'. +func (s *Set) destroy() { + // Notify all waiters. They will fail on the next attempt to execute + // operations and return error. + s.dead = true + for _, s := range s.sems { + for w := s.waiters.Front(); w != nil; w = w.Next() { + w.ch <- struct{}{} + } + s.waiters.Reset() + } +} + +// wakeWaiters goes over all waiters and checks which of them can be notified. +func (s *sem) wakeWaiters() { + // Note that this will release all waiters waiting for 0 too. + for w := s.waiters.Front(); w != nil; { + if s.value < w.value { + // Still blocked, skip it. + w = w.Next() + continue + } + w.ch <- struct{}{} + old := w + w = w.Next() + s.waiters.Remove(old) + } +} + +func newWaiter(val int16) *waiter { + return &waiter{ + value: val, + ch: make(chan struct{}, 1), + } +} diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go new file mode 100644 index 000000000..e47acefdf --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -0,0 +1,172 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
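+
+// Note that these tests drive the unexported Set.executeOps directly (via the
+// executeOps helper below), which skips the permission and removed-set checks
+// performed by the exported ExecuteOps.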
+ +package semaphore + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { + ch, _, err := set.executeOps(ctx, ops, 123) + if err != nil { + t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) + } + if block { + if ch == nil { + t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops) + } + if signalled(ch) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } + } else { + if ch != nil { + t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops) + } + } + return ch +} + +func signalled(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +func TestBasic(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + {SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } +} + +func TestWaitForZero(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + {SemOp: 0}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 0 + chZero1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + chZero2 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set) + } + + ops[0].SemOp = -2 + executeOps(ctx, t, set, ops, false) + if !signalled(chZero1) { + t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set) + } + if !signalled(chZero2) { + t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set) + } +} + +func TestNoWait(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + {SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } + + ops[0].SemOp = 0 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } +} + +func TestUnregister(t *testing.T) { + ctx := contexttest.Context(t) + r := NewRegistry(auth.NewRootUserNamespace()) + set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { + t.Fatalf("FindOrCreate() failed, err: %v", err) + } + if got := r.FindByID(set.ID); got.ID != set.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + } + + ops := 
[]linux.Sembuf{ + {SemOp: -1}, + } + chs := make([]chan struct{}, 0, 5) + for i := 0; i < 5; i++ { + ch := executeOps(ctx, t, set, ops, true) + chs = append(chs, ch) + } + + creds := auth.CredentialsFromContext(ctx) + if err := r.RemoveID(set.ID, creds); err != nil { + t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + } + if !set.dead { + t.Fatalf("set is not dead: %+v", set) + } + if got := r.FindByID(set.ID); got != nil { + t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got) + } + for i, ch := range chs { + if !signalled(ch) { + t.Fatalf("channel %d should have been signalled", i) + } + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go new file mode 100644 index 000000000..0e19286de --- /dev/null +++ b/pkg/sentry/kernel/sessions.go @@ -0,0 +1,528 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SessionID is the public identifier. +type SessionID ThreadID + +// ProcessGroupID is the public identifier. +type ProcessGroupID ThreadID + +// Session contains a leader threadgroup and a list of ProcessGroups. +// +// +stateify savable +type Session struct { + refs refs.AtomicRefCount + + // leader is the originator of the Session. + // + // Note that this may no longer be running (and may be reaped), so the + // ID is cached upon initial creation. The leader is still required + // however, since its PIDNamespace defines the scope of the Session. + // + // The leader is immutable. + leader *ThreadGroup + + // id is the cached identifier in the leader's namespace. + // + // The id is immutable. + id SessionID + + // foreground is the foreground process group. + // + // This is protected by TaskSet.mu. + foreground *ProcessGroup + + // ProcessGroups is a list of process groups in this Session. This is + // protected by TaskSet.mu. + processGroups processGroupList + + // sessionEntry is the embed for TaskSet.sessions. This is protected by + // TaskSet.mu. + sessionEntry +} + +// incRef grabs a reference. +func (s *Session) incRef() { + s.refs.IncRef() +} + +// decRef drops a reference. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (s *Session) decRef() { + s.refs.DecRefWithDestructor(func() { + // Remove translations from the leader. + for ns := s.leader.pidns; ns != nil; ns = ns.parent { + id := ns.sids[s] + delete(ns.sids, s) + delete(ns.sessions, id) + } + + // Remove from the list of global Sessions. + s.leader.pidns.owner.sessions.Remove(s) + }) +} + +// ProcessGroup contains an originator threadgroup and a parent Session. +// +// +stateify savable +type ProcessGroup struct { + refs refs.AtomicRefCount // not exported. + + // originator is the originator of the group. + // + // See note re: leader in Session. The same applies here. + // + // The originator is immutable. 
+ originator *ThreadGroup + + // id is the cached identifier in the originator's namespace. + // + // The id is immutable. + id ProcessGroupID + + // Session is the parent Session. + // + // The session is immutable. + session *Session + + // ancestors is the number of thread groups in this process group whose + // parent is in a different process group in the same session. + // + // The name is derived from the fact that process groups where + // ancestors is zero are considered "orphans". + // + // ancestors is protected by TaskSet.mu. + ancestors uint32 + + // processGroupEntry is the embedded entry for Sessions.groups. This is + // protected by TaskSet.mu. + processGroupEntry +} + +// Originator retuns the originator of the process group. +func (pg *ProcessGroup) Originator() *ThreadGroup { + return pg.originator +} + +// IsOrphan returns true if this process group is an orphan. +func (pg *ProcessGroup) IsOrphan() bool { + pg.originator.TaskSet().mu.RLock() + defer pg.originator.TaskSet().mu.RUnlock() + return pg.ancestors == 0 +} + +// incRefWithParent grabs a reference. +// +// This function is called when this ProcessGroup is being associated with some +// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent +// ThreadGroup. If tg is init, then parentPG may be nil. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) { + // We acquire an "ancestor" reference in the case of a nil parent. + // This is because the process being associated is init, and init can + // never be orphaned (we count it as always having an ancestor). + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors++ + } + + pg.refs.IncRef() +} + +// decRefWithParent drops a reference. +// +// parentPG is per incRefWithParent. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { + // See incRefWithParent regarding parent == nil. + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors-- + } + + alive := true + pg.refs.DecRefWithDestructor(func() { + alive = false // don't bother with handleOrphan. + + // Remove translations from the originator. + for ns := pg.originator.pidns; ns != nil; ns = ns.parent { + id := ns.pgids[pg] + delete(ns.pgids, pg) + delete(ns.processGroups, id) + } + + // Remove the list of process groups. + pg.session.processGroups.Remove(pg) + pg.session.decRef() + }) + if alive { + pg.handleOrphan() + } +} + +// parentPG returns the parent process group. +// +// Precondition: callers must hold TaskSet.mu. +func (tg *ThreadGroup) parentPG() *ProcessGroup { + if tg.leader.parent != nil { + return tg.leader.parent.tg.processGroup + } + return nil +} + +// handleOrphan checks whether the process group is an orphan and has any +// stopped jobs. If yes, then appropriate signals are delivered to each thread +// group within the process group. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) handleOrphan() { + // Check if this process is an orphan. + if pg.ancestors != 0 { + return + } + + // See if there are any stopped jobs. 
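The ancestors counter above is how POSIX orphaned-process-group tracking is implemented: a group is orphaned once no member has a parent in a different process group of the same session. The standalone sketch below simply recomputes that predicate over a toy process list; the kernel code instead keeps the count up to date incrementally through incRefWithParent/decRefWithParent so it never has to walk every task.

package main

import "fmt"

// proc is a toy process with just enough state to evaluate orphan status.
type proc struct {
	pid    int
	parent *proc
	pgid   int // process group ID
	sid    int // session ID
}

// ancestors counts members of process group pgid whose parent lives in a
// different process group of the same session. This is the quantity that
// ProcessGroup.ancestors tracks incrementally.
func ancestors(procs []*proc, pgid int) int {
	n := 0
	for _, p := range procs {
		if p.pgid != pgid || p.parent == nil {
			continue
		}
		if p.parent.pgid != pgid && p.parent.sid == p.sid {
			n++
		}
	}
	return n
}

func main() {
	shell := &proc{pid: 1, pgid: 1, sid: 1}
	job := &proc{pid: 2, parent: shell, pgid: 2, sid: 1}
	child := &proc{pid: 3, parent: job, pgid: 2, sid: 1}
	procs := []*proc{shell, job, child}

	fmt.Println("orphan:", ancestors(procs, 2) == 0) // false: the shell is an outside ancestor

	// If the job leader loses its link to a parent in an outside process
	// group, the group becomes an orphan.
	job.parent = nil
	fmt.Println("orphan:", ancestors(procs, 2) == 0) // true
}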
+ hasStopped := false + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + if tg.groupStopComplete { + hasStopped = true + } + tg.signalHandlers.mu.Unlock() + }) + if !hasStopped { + return + } + + // Deliver appropriate signals to all thread groups. + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */) + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */) + tg.signalHandlers.mu.Unlock() + }) + + return +} + +// Session returns the process group's session without taking a reference. +func (pg *ProcessGroup) Session() *Session { + return pg.session +} + +// SendSignal sends a signal to all processes inside the process group. It is +// analagous to kernel/signal.c:kill_pgrp. +func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { + tasks := pg.originator.TaskSet() + tasks.mu.RLock() + defer tasks.mu.RUnlock() + + var lastErr error + for tg := range tasks.Root.tgids { + if tg.processGroup == pg { + tg.signalHandlers.mu.Lock() + infoCopy := *info + if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + tg.signalHandlers.mu.Unlock() + } + } + return lastErr +} + +// CreateSession creates a new Session, with the ThreadGroup as the leader. +// +// EPERM may be returned if either the given ThreadGroup is already a Session +// leader, or a ProcessGroup already exists for the ThreadGroup's ID. +func (tg *ThreadGroup) CreateSession() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.createSession() +} + +// createSession creates a new session for a threadgroup. +// +// Precondition: callers must hold TaskSet.mu and the signal mutex for writing. +func (tg *ThreadGroup) createSession() error { + // Get the ID for this thread in the current namespace. + id := tg.pidns.tgids[tg] + + // Check if this ThreadGroup already leads a Session, or + // if the proposed group is already taken. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + if s.id == SessionID(id) { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new Session, with a single reference. + s := &Session{ + id: SessionID(id), + leader: tg, + } + s.refs.EnableLeakCheck("kernel.Session") + + // Create a new ProcessGroup, belonging to that Session. + // This also has a single reference (assigned below). + // + // Note that since this is a new session and a new process group, there + // will be zero ancestors for this process group. (It is an orphan at + // this point.) + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: s, + ancestors: 0, + } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + + // Tie them and return the result. + s.processGroups.PushBack(pg) + tg.pidns.owner.sessions.PushBack(s) + + // Leave the current group, and assign the new one. 
+ if tg.processGroup != nil { + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + // If tg.processGroup is an orphan, decRefWithParent will lock + // the signal mutex of each thread group in tg.processGroup. + // However, tg's signal mutex may already be locked at this + // point. We change tg's process group before calling + // decRefWithParent to avoid locking tg's signal mutex twice. + oldPG := tg.processGroup + tg.processGroup = pg + oldPG.decRefWithParent(oldParentPG) + } else { + // The current process group may be nil only in the case of an + // unparented thread group (i.e. the init process). This would + // not normally occur, but we allow it for the convenience of + // CreateSession working from that point. There will be no + // child processes. We always say that the very first group + // created has ancestors (avoids checks elsewhere). + // + // Note that this mirrors the parent == nil logic in + // incRef/decRef/reparent, which counts nil as an ancestor. + tg.processGroup = pg + tg.processGroup.ancestors++ + } + + // Ensure a translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tgids[tg] + ns.sids[s] = SessionID(local) + ns.sessions[SessionID(local)] = s + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + // Disconnect from the controlling terminal. + tg.tty = nil + + return nil +} + +// CreateProcessGroup creates a new process group. +// +// An EPERM error will be returned if the ThreadGroup belongs to a different +// Session, is a Session leader or the group already exists. +func (tg *ThreadGroup) CreateProcessGroup() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + + // Get the ID for this thread in the current namespace. + id := tg.pidns.tgids[tg] + + // Per above, check for a Session leader or existing group. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new ProcessGroup, belonging to the current Session. + // + // We manually adjust the ancestors if the parent is in the same + // session. + tg.processGroup.session.incRef() + pg := ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: tg.processGroup.session, + } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { + pg.ancestors++ + } + + // Assign the new process group; adjust children. + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(&pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = &pg + + // Add the new process group to the session. + pg.session.processGroups.PushBack(&pg) + + // Ensure this translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tgids[tg] + ns.pgids[&pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = &pg + } + + return nil +} + +// JoinProcessGroup joins an existing process group. 
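createSession above registers the new session and process group in every PID namespace up the parent chain, using the leader's thread group ID as seen from each namespace, since the same session carries a different numeric ID in each one. A self-contained sketch of that walk, with invented toy types standing in for PIDNamespace and ThreadGroup:

package main

import "fmt"

type threadGroup struct{ name string }

// pidNamespace maps thread groups to the IDs they are known by in this
// namespace, and session IDs to their leaders, as seen from this namespace.
type pidNamespace struct {
	parent   *pidNamespace
	tgids    map[*threadGroup]int
	sessions map[int]*threadGroup
}

// registerSession records the leader as a session in ns and in every
// ancestor namespace, mirroring the loop in createSession: the session ID in
// each namespace is simply the leader's thread group ID in that namespace.
func registerSession(ns *pidNamespace, leader *threadGroup) {
	for ; ns != nil; ns = ns.parent {
		local, ok := ns.tgids[leader]
		if !ok {
			continue // leader not visible in this namespace
		}
		ns.sessions[local] = leader
	}
}

func main() {
	root := &pidNamespace{
		tgids:    map[*threadGroup]int{},
		sessions: map[int]*threadGroup{},
	}
	child := &pidNamespace{
		parent:   root,
		tgids:    map[*threadGroup]int{},
		sessions: map[int]*threadGroup{},
	}

	tg := &threadGroup{name: "leader"}
	root.tgids[tg] = 1234 // ID in the root namespace
	child.tgids[tg] = 1   // ID inside the child namespace

	registerSession(child, tg)
	fmt.Println(child.sessions[1].name, root.sessions[1234].name)
}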
+// +// This function will return EACCES if an exec has been performed since fork +// by the given ThreadGroup, and EPERM if the Sessions are not the same or the +// group does not exist. +// +// If checkExec is set, then the join is not permitted after the process has +// executed exec at least once. +func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error { + pidns.owner.mu.Lock() + defer pidns.owner.mu.Unlock() + + // Lookup the ProcessGroup. + pg := pidns.processGroups[pgid] + if pg == nil { + return syserror.EPERM + } + + // Disallow the join if an execve has performed, per POSIX. + if checkExec && tg.execed { + return syserror.EACCES + } + + // See if it's in the same session as ours. + if pg.session != tg.processGroup.session { + return syserror.EPERM + } + + // Join the group; adjust children. + parentPG := tg.parentPG() + pg.incRefWithParent(parentPG) + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(tg.processGroup) + }) + tg.processGroup.decRefWithParent(parentPG) + tg.processGroup = pg + + return nil +} + +// Session returns the ThreadGroup's Session. +// +// A reference is not taken on the session. +func (tg *ThreadGroup) Session() *Session { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.processGroup.session +} + +// IDOfSession returns the Session assigned to s in PID namespace ns. +// +// If this group isn't visible in this namespace, zero will be returned. It is +// the callers responsibility to check that before using this function. +func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.sids[s] +} + +// SessionWithID returns the Session with the given ID in the PID namespace ns, +// or nil if that given ID is not defined in this namespace. +// +// A reference is not taken on the session. +func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.sessions[id] +} + +// ProcessGroup returns the ThreadGroup's ProcessGroup. +// +// A reference is not taken on the process group. +func (tg *ThreadGroup) ProcessGroup() *ProcessGroup { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.processGroup +} + +// IDOfProcessGroup returns the process group assigned to pg in PID namespace ns. +// +// The same constraints apply as IDOfSession. +func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.pgids[pg] +} + +// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID +// namespace ns, or nil if that given ID is not defined in this namespace. +// +// A reference is not taken on the process group. 
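The syscall layer that calls CreateProcessGroup and JoinProcessGroup is not part of this diff, but the rules documented above (EACCES once the target has execed, EPERM for a missing group or one in another session) suggest roughly the following shape for a setpgid(2) handler. This is a toy model only; the real handler also distinguishes whether the caller is moving itself or one of its children, and the exec check applies only to children.

package main

import (
	"errors"
	"fmt"
)

var (
	errEPERM  = errors.New("EPERM")
	errEACCES = errors.New("EACCES")
)

// group describes an existing process group by the session it belongs to.
type group struct{ sid int }

// target is the thread group whose process group is being changed.
type target struct {
	tgid   int
	sid    int
	execed bool // has performed execve since being forked
}

// setpgid sketches the choice between the CreateProcessGroup and
// JoinProcessGroup paths above: a pgid equal to the target's own ID means
// "become a group leader"; anything else must name an existing group in the
// same session.
func setpgid(t *target, pgid int, groups map[int]group, movingChild bool) error {
	if movingChild && t.execed {
		return errEACCES // cannot move a child that has already execed
	}
	if pgid == t.tgid {
		groups[pgid] = group{sid: t.sid} // CreateProcessGroup path
		return nil
	}
	g, ok := groups[pgid]
	if !ok || g.sid != t.sid {
		return errEPERM // JoinProcessGroup: group missing or in another session
	}
	return nil
}

func main() {
	groups := map[int]group{10: {sid: 1}}
	t := &target{tgid: 42, sid: 1}
	fmt.Println(setpgid(t, 42, groups, false)) // <nil>: new group led by 42
	fmt.Println(setpgid(t, 10, groups, false)) // <nil>: join existing group 10
	fmt.Println(setpgid(t, 99, groups, false)) // EPERM: no such group in this session
}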
+func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.processGroups[id] +} diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD new file mode 100644 index 000000000..bfd779837 --- /dev/null +++ b/pkg/sentry/kernel/shm/BUILD @@ -0,0 +1,29 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "shm", + srcs = [ + "device.go", + "shm.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go new file mode 100644 index 000000000..6b0d5818b --- /dev/null +++ b/pkg/sentry/kernel/shm/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package shm + +import "gvisor.dev/gvisor/pkg/sentry/device" + +// shmDevice is the kernel shm device. +var shmDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go new file mode 100644 index 000000000..f66cfcc7f --- /dev/null +++ b/pkg/sentry/kernel/shm/shm.go @@ -0,0 +1,707 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package shm implements sysv shared memory segments. +// +// Known missing features: +// +// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement +// memory locking in general. +// +// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy +// way to implement hugetlb support on a per-map basis, and it has no impact +// on correctness. +// +// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap +// so it's meaningless to reserve space for swap. +// +// - No per-process segment size enforcement. This feature probably isn't used +// much anyways, since Linux sets the per-process limits to the system-wide +// limits by default. 
+// +// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock +package shm + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Key represents a shm segment key. Analogous to a file name. +type Key int32 + +// ID represents the opaque handle for a shm segment. Analogous to an fd. +type ID int32 + +// Registry tracks all shared memory segments in an IPC namespace. The registry +// provides the mechanisms for creating and finding segments, and reporting +// global shm parameters. +// +// +stateify savable +type Registry struct { + // userNS owns the IPC namespace this registry belong to. Immutable. + userNS *auth.UserNamespace + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // shms maps segment ids to segments. + // + // shms holds all referenced segments, which are removed on the last + // DecRef. Thus, it cannot itself hold a reference on the Shm. + // + // Since removal only occurs after the last (unlocked) DecRef, there + // exists a short window during which a Shm still exists in Shm, but is + // unreferenced. Users must use TryIncRef to determine if the Shm is + // still valid. + shms map[ID]*Shm + + // keysToShms maps segment keys to segments. + // + // Shms in keysToShms are guaranteed to be referenced, as they are + // removed by disassociateKey before the last DecRef. + keysToShms map[Key]*Shm + + // Sum of the sizes of all existing segments rounded up to page size, in + // units of page size. + totalPages uint64 + + // ID assigned to the last created segment. Used to quickly find the next + // unused ID. + lastIDUsed ID +} + +// NewRegistry creates a new shm registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + shms: make(map[ID]*Shm), + keysToShms: make(map[Key]*Shm), + } +} + +// FindByID looks up a segment given an ID. +// +// FindByID returns a reference on Shm. +func (r *Registry) FindByID(id ID) *Shm { + r.mu.Lock() + defer r.mu.Unlock() + s := r.shms[id] + // Take a reference on s. If TryIncRef fails, s has reached the last + // DecRef, but hasn't quite been removed from r.shms yet. + if s != nil && s.TryIncRef() { + return s + } + return nil +} + +// dissociateKey removes the association between a segment and its key, +// preventing it from being discovered in the registry. This doesn't necessarily +// mean the segment is about to be destroyed. This is analogous to unlinking a +// file; the segment can still be used by a process already referencing it, but +// cannot be discovered by a new process. +func (r *Registry) dissociateKey(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + if s.key != linux.IPC_PRIVATE { + delete(r.keysToShms, s.key) + s.key = linux.IPC_PRIVATE + } +} + +// FindOrCreate looks up or creates a segment in the registry. It's functionally +// analogous to open(2). +// +// FindOrCreate returns a reference on Shm. 
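FindByID above relies on the TryIncRef contract described in the shms comment: between the last unlocked DecRef and the removal from the map there is a window where a dead Shm is still visible, and a lookup must refuse to resurrect it. A minimal stand-in for that pattern (not the real refs.AtomicRefCount implementation):

package main

import (
	"fmt"
	"sync/atomic"
)

// tryRefCount sketches the TryIncRef contract relied on by FindByID above:
// once the count has hit zero the object is dying, and a concurrent lookup
// must fail rather than resurrect it.
type tryRefCount struct {
	n int64
}

func (r *tryRefCount) IncRef() { atomic.AddInt64(&r.n, 1) }

// TryIncRef increments only if the object is still alive (count > 0).
func (r *tryRefCount) TryIncRef() bool {
	for {
		v := atomic.LoadInt64(&r.n)
		if v == 0 {
			return false // already dying; do not resurrect
		}
		if atomic.CompareAndSwapInt64(&r.n, v, v+1) {
			return true
		}
	}
}

// DecRef runs destroy exactly once, on the final reference drop.
func (r *tryRefCount) DecRef(destroy func()) {
	if atomic.AddInt64(&r.n, -1) == 0 {
		destroy()
	}
}

func main() {
	var r tryRefCount
	r.IncRef()                 // initial reference
	fmt.Println(r.TryIncRef()) // true: still alive
	r.DecRef(func() {})
	r.DecRef(func() { fmt.Println("destroyed") })
	fmt.Println(r.TryIncRef()) // false: lookup after the last DecRef fails
}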
+func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { + if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { + // "A new segment was to be created and size is less than SHMMIN or + // greater than SHMMAX." - man shmget(2) + // + // Note that 'private' always implies the creation of a new segment + // whether IPC_CREAT is specified or not. + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if len(r.shms) >= linux.SHMMNI { + // "All possible shared memory IDs have been taken (SHMMNI) ..." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + if !private { + // Look up an existing segment. + if shm := r.keysToShms[key]; shm != nil { + shm.mu.Lock() + defer shm.mu.Unlock() + + // Check that caller can access the segment. + if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { + // "The user does not have permission to access the shared + // memory segment, and does not have the CAP_IPC_OWNER + // capability in the user namespace that governs its IPC + // namespace." - man shmget(2) + return nil, syserror.EACCES + } + + if size > shm.size { + // "A segment for the given key exists, but size is greater than + // the size of that segment." - man shmget(2) + return nil, syserror.EINVAL + } + + if create && exclusive { + // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a + // shared memory segment already exists for key." + // - man shmget(2) + return nil, syserror.EEXIST + } + + shm.IncRef() + return shm, nil + } + + if !create { + // "No segment exists for the given key, and IPC_CREAT was not + // specified." - man shmget(2) + return nil, syserror.ENOENT + } + } + + var sizeAligned uint64 + if val, ok := usermem.Addr(size).RoundUp(); ok { + sizeAligned = uint64(val) + } else { + return nil, syserror.EINVAL + } + + if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { + // "... allocating a segment of the requested size would cause the + // system to exceed the system-wide limit on shared memory (SHMALL)." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + // Need to create a new segment. + creator := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + s, err := r.newShm(ctx, pid, key, creator, perms, size) + if err != nil { + return nil, err + } + // The initial reference is held by s itself. Take another to return to + // the caller. + s.IncRef() + return s, nil +} + +// newShm creates a new segment in the registry. +// +// Precondition: Caller must hold r.mu. +func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) + } + + effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) + if err != nil { + return nil, err + } + + shm := &Shm{ + mfp: mfp, + registry: r, + creator: creator, + size: size, + effectiveSize: effectiveSize, + fr: fr, + key: key, + perms: perms, + owner: creator, + creatorPID: pid, + changeTime: ktime.NowFromContext(ctx), + } + shm.EnableLeakCheck("kernel.Shm") + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. 
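The size handling in FindOrCreate and newShm boils down to rounding the request up to whole pages, rejecting overflow with EINVAL, and charging the rounded page count against the system-wide SHMALL budget. The sketch below shows just that arithmetic with made-up constants; it is not the registry code itself.

package main

import (
	"errors"
	"fmt"
)

const (
	pageSize = 4096
	shmAll   = 1 << 20 // system-wide page limit, stand-in for linux.SHMALL
)

var (
	errNoSpace = errors.New("ENOSPC")
	errInval   = errors.New("EINVAL")
)

// accountSegment mirrors the size handling above: round the requested size
// up to a whole number of pages, detect overflow, and charge the
// registry-wide page budget.
func accountSegment(totalPages *uint64, size uint64) (effectiveSize uint64, err error) {
	effectiveSize = (size + pageSize - 1) &^ (pageSize - 1)
	if effectiveSize < size {
		return 0, errInval // rounding overflowed
	}
	pages := effectiveSize / pageSize
	if *totalPages+pages > shmAll {
		return 0, errNoSpace // would exceed the system-wide limit
	}
	*totalPages += pages
	return effectiveSize, nil
}

func main() {
	var totalPages uint64
	eff, err := accountSegment(&totalPages, 4097) // one byte over a page
	fmt.Println(eff, err, totalPages)             // 8192 <nil> 2
}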
+ if id < 0 { + id = 0 + continue + } + if r.shms[id] == nil { + r.lastIDUsed = id + + shm.ID = id + r.shms[id] = shm + r.keysToShms[key] = shm + + r.totalPages += effectiveSize / usermem.PageSize + + return shm, nil + } + } + + log.Warningf("Shm ids exhuasted, they may be leaking") + return nil, syserror.ENOSPC +} + +// IPCInfo reports global parameters for sysv shared memory segments on this +// system. See shmctl(IPC_INFO). +func (r *Registry) IPCInfo() *linux.ShmParams { + return &linux.ShmParams{ + ShmMax: linux.SHMMAX, + ShmMin: linux.SHMMIN, + ShmMni: linux.SHMMNI, + ShmSeg: linux.SHMSEG, + ShmAll: linux.SHMALL, + } +} + +// ShmInfo reports linux-specific global parameters for sysv shared memory +// segments on this system. See shmctl(SHM_INFO). +func (r *Registry) ShmInfo() *linux.ShmInfo { + r.mu.Lock() + defer r.mu.Unlock() + + return &linux.ShmInfo{ + UsedIDs: int32(r.lastIDUsed), + ShmTot: r.totalPages, + ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. + ShmSwp: 0, // No reclaim at the moment. + } +} + +// remove deletes a segment from this registry, deaccounting the memory used by +// the segment. +// +// Precondition: Must follow a call to r.dissociateKey(s). +func (r *Registry) remove(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + + if s.key != linux.IPC_PRIVATE { + panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) + } + + delete(r.shms, s.ID) + r.totalPages -= s.effectiveSize / usermem.PageSize +} + +// Shm represents a single shared memory segment. +// +// Shm segment are backed directly by an allocation from platform memory. +// Segments are always mapped as a whole, greatly simplifying how mappings are +// tracked. However note that mremap and munmap calls may cause the vma for a +// segment to become fragmented; which requires special care when unmapping a +// segment. See mm/shm.go. +// +// Segments persist until they are explicitly marked for destruction via +// MarkDestroyed(). +// +// Shm implements memmap.Mappable and memmap.MappingIdentity. +// +// +stateify savable +type Shm struct { + // AtomicRefCount tracks the number of references to this segment. + // + // A segment holds a reference to itself until it is marked for + // destruction. + // + // In addition to direct users, the MemoryManager will hold references + // via MappingIdentity. + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + + // registry points to the shm registry containing this segment. Immutable. + registry *Registry + + // ID is the kernel identifier for this segment. Immutable. + ID ID + + // creator is the user that created the segment. Immutable. + creator fs.FileOwner + + // size is the requested size of the segment at creation, in + // bytes. Immutable. + size uint64 + + // effectiveSize of the segment, rounding up to the next page + // boundary. Immutable. + // + // Invariant: effectiveSize must be a multiple of usermem.PageSize. + effectiveSize uint64 + + // fr is the offset into mfp.MemoryFile() that backs this contents of this + // segment. Immutable. + fr platform.FileRange + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // key is the public identifier for this segment. + key Key + + // perms is the access permissions for the segment. + perms fs.FilePermissions + + // owner of this segment. + owner fs.FileOwner + // attachTime is updated on every successful shmat. 
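The allocation loop above scans forward from lastIDUsed, wraps the signed counter back to the low end when it overflows, and only reports ENOSPC after a full cycle finds no free slot. A standalone version of the same loop, with a plain map standing in for r.shms:

package main

import (
	"errors"
	"fmt"
)

var errExhausted = errors.New("ENOSPC: ids exhausted")

// nextID mirrors the allocation loop in newShm: scan forward from the last
// ID handed out, wrap around when the int32 counter goes negative, and give
// up only after a full cycle with no free slot.
func nextID(used map[int32]bool, lastIDUsed int32) (int32, error) {
	for id := lastIDUsed + 1; id != lastIDUsed; id++ {
		if id < 0 {
			// The counter overflowed; reset it so the loop's increment
			// continues the scan from the bottom of the range.
			id = 0
			continue
		}
		if !used[id] {
			return id, nil
		}
	}
	return 0, errExhausted
}

func main() {
	used := map[int32]bool{0: true, 1: true, 3: true}
	id, _ := nextID(used, 1)
	fmt.Println(id) // 2
}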
+ attachTime ktime.Time + // detachTime is updated on every successful shmdt. + detachTime ktime.Time + // changeTime is updated on every successful changes to the segment via + // shmctl(IPC_SET). + changeTime ktime.Time + + // creatorPID is the PID of the process that created the segment. + creatorPID int32 + // lastAttachDetachPID is the pid of the process that issued the last shmat + // or shmdt syscall. + lastAttachDetachPID int32 + + // pendingDestruction indicates the segment was marked as destroyed through + // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found + // in the registry and can no longer be attached. When the last user + // detaches from the segment, it is destroyed. + pendingDestruction bool +} + +// Precondition: Caller must hold s.mu. +func (s *Shm) debugLocked() string { + return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", + s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (s *Shm) MappedName(ctx context.Context) string { + s.mu.Lock() + defer s.mu.Unlock() + return fmt.Sprintf("SYSV%08d", s.key) +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (s *Shm) DeviceID() uint64 { + return shmDevice.DeviceID() +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (s *Shm) InodeID() uint64 { + // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use + // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() + return uint64(s.ID) +} + +// DecRef overrides refs.RefCount.DecRef with a destructor. +// +// Precondition: Caller must not hold s.mu. +func (s *Shm) DecRef() { + s.DecRefWithDestructor(s.destroy) +} + +// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm +// segments. +func (s *Shm) Msync(context.Context, memmap.MappableRange) error { + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error { + s.mu.Lock() + defer s.mu.Unlock() + s.attachTime = ktime.NowFromContext(ctx) + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + // AddMapping is called during a syscall, so ctx should always be a task + // context. + log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { + s.mu.Lock() + defer s.mu.Unlock() + // RemoveMapping may be called during task exit, when ctx + // is context.Background. Gracefully handle missing clocks. Failing to + // update the detach time in these cases is ok, since no one can observe the + // omission. + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + s.detachTime = clock.Now() + } + + // If called from a non-task context we also won't have a threadgroup + // id. Silently skip updating the lastAttachDetachPid in that case. + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. 
+func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > s.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: s.mfp.MemoryFile(), + Offset: s.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (s *Shm) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// AttachOpts describes various flags passed to shmat(2). +type AttachOpts struct { + Execute bool + Readonly bool + Remap bool +} + +// ConfigureAttach creates an mmap configuration for the segment with the +// requested attach options. +// +// Postconditions: The returned MMapOpts are valid only as long as a reference +// continues to be held on s. +func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.pendingDestruction && s.ReadRefs() == 0 { + return memmap.MMapOpts{}, syserror.EIDRM + } + + if !s.checkPermissions(ctx, fs.PermMask{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }) { + // "The calling process does not have the required permissions for the + // requested attach type, and does not have the CAP_IPC_OWNER capability + // in the user namespace that governs its IPC namespace." - man shmat(2) + return memmap.MMapOpts{}, syserror.EACCES + } + return memmap.MMapOpts{ + Length: s.size, + Offset: 0, + Addr: addr, + Fixed: opts.Remap, + Perms: usermem.AccessType{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }, + MaxPerms: usermem.AnyAccess, + Mappable: s, + MappingIdentity: s, + }, nil +} + +// EffectiveSize returns the size of the underlying shared memory segment. This +// may be larger than the requested size at creation, due to rounding to page +// boundaries. +func (s *Shm) EffectiveSize() uint64 { + return s.effectiveSize +} + +// IPCStat returns information about a shm. See shmctl(IPC_STAT). +func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The caller must have read permission on the shared memory segment." + // - man shmctl(2) + if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow + // read access for shmid, and the calling process does not have the + // CAP_IPC_OWNER capability in the user namespace that governs its IPC + // namespace." - man shmctl(2) + return nil, syserror.EACCES + } + + var mode uint16 + if s.pendingDestruction { + mode |= linux.SHM_DEST + } + creds := auth.CredentialsFromContext(ctx) + + // Use the reference count as a rudimentary count of the number of + // attaches. We exclude: + // + // 1. The reference the caller holds. + // 2. The self-reference held by s prior to destruction. + // + // Note that this may still overcount by including transient references + // used in concurrent calls. 
+ nattach := uint64(s.ReadRefs()) - 1 + if !s.pendingDestruction { + nattach-- + } + + ds := &linux.ShmidDS{ + ShmPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: mode | uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + ShmSegsz: s.size, + ShmAtime: s.attachTime.TimeT(), + ShmDtime: s.detachTime.TimeT(), + ShmCtime: s.changeTime.TimeT(), + ShmCpid: s.creatorPID, + ShmLpid: s.lastAttachDetachPID, + ShmNattach: nattach, + } + + return ds, nil +} + +// Set modifies attributes for a segment. See shmctl(IPC_SET). +func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { + s.mu.Lock() + defer s.mu.Unlock() + + if !s.checkOwnership(ctx) { + return syserror.EPERM + } + + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) + if !uid.Ok() || !gid.Ok() { + return syserror.EINVAL + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. + mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) + s.perms = fs.FilePermsFromMode(mode) + + s.owner.UID = uid + s.owner.GID = gid + + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +func (s *Shm) destroy() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) +} + +// MarkDestroyed marks a segment for destruction. The segment is actually +// destroyed once it has no references. MarkDestroyed may be called multiple +// times, and is safe to call after a segment has already been destroyed. See +// shmctl(IPC_RMID). +func (s *Shm) MarkDestroyed() { + s.registry.dissociateKey(s) + + s.mu.Lock() + defer s.mu.Unlock() + if !s.pendingDestruction { + s.pendingDestruction = true + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef() + return + } +} + +// checkOwnership verifies whether a segment may be accessed by ctx as an +// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkOwnership(ctx context.Context) bool { + creds := auth.CredentialsFromContext(ctx) + if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) +} + +// checkPermissions verifies whether a segment is accessible by ctx for access +// described by req. See ipc/util.c:ipcperms() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + if p.SupersetOf(req) { + return true + } + + // Tasks with CAP_IPC_OWNER may bypass permission checks. 
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) +} diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go new file mode 100644 index 000000000..e8cce37d0 --- /dev/null +++ b/pkg/sentry/kernel/signal.go @@ -0,0 +1,79 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +// SignalPanic is used to panic the running threads. It is a signal which +// cannot be used by the application: it must be caught and ignored by the +// runtime (in order to catch possible races). +const SignalPanic = linux.SIGUSR2 + +// sendExternalSignal is called when an asynchronous signal is sent to the +// sentry ("in sentry context"). On some platforms, it may also be called when +// an asynchronous signal is sent to sandboxed application threads ("in +// application context"). +// +// context is used only for debugging to differentiate these cases. +// +// Preconditions: Kernel must have an init process. +func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { + switch linux.Signal(info.Signo) { + case linux.SIGURG: + // Sent by the Go 1.14+ runtime for asynchronous goroutine preemption. + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case SignalPanic: + // SignalPanic is also specially handled in sentry setup to ensure that + // it causes a panic even after tasks exit, but SignalPanic may also + // be sent here if it is received while in app context. + panic("Signal-induced panic") + + default: + log.Infof("Received external signal %d in %s context", info.Signo, context) + if k.globalInit == nil { + panic(fmt.Sprintf("Received external signal %d before init created", info.Signo)) + } + k.globalInit.SendSignal(info) + } +} + +// SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV. +func SignalInfoPriv(sig linux.Signal) *arch.SignalInfo { + return &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoKernel, + } +} + +// SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO. +func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go new file mode 100644 index 000000000..768fda220 --- /dev/null +++ b/pkg/sentry/kernel/signal_handlers.go @@ -0,0 +1,88 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" +) + +// SignalHandlers holds information about signal actions. +// +// +stateify savable +type SignalHandlers struct { + // mu protects actions, as well as the signal state of all tasks and thread + // groups using this SignalHandlers object. (See comment on + // ThreadGroup.signalHandlers.) + mu sync.Mutex `state:"nosave"` + + // actions is the action to be taken upon receiving each signal. + actions map[linux.Signal]arch.SignalAct +} + +// NewSignalHandlers returns a new SignalHandlers specifying all default +// actions. +func NewSignalHandlers() *SignalHandlers { + return &SignalHandlers{ + actions: make(map[linux.Signal]arch.SignalAct), + } +} + +// Fork returns a copy of sh for a new thread group. +func (sh *SignalHandlers) Fork() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + sh2.actions[sig] = act + } + return sh2 +} + +// CopyForExec returns a copy of sh for a thread group that is undergoing an +// execve. (See comments in Task.finishExec.) +func (sh *SignalHandlers) CopyForExec() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + if act.Handler == arch.SignalActIgnore { + sh2.actions[sig] = arch.SignalAct{ + Handler: arch.SignalActIgnore, + } + } + } + return sh2 +} + +// IsIgnored returns true if the signal is ignored. +func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { + sh.mu.Lock() + defer sh.mu.Unlock() + sa, ok := sh.actions[sig] + return ok && sa.Handler == arch.SignalActIgnore +} + +// dequeueActionLocked returns the SignalAct that should be used to handle sig. +// +// Preconditions: sh.mu must be locked. +func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct { + act := sh.actions[sig] + if act.IsResetHandler() { + delete(sh.actions, sig) + } + return act +} diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD new file mode 100644 index 000000000..3eb78e91b --- /dev/null +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -0,0 +1,22 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go new file mode 100644 index 000000000..8243bb93e --- /dev/null +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -0,0 +1,139 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
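CopyForExec above encodes the POSIX execve rule for signal dispositions: ignored signals stay ignored across the exec, while caught signals fall back to the default action because the old handler addresses mean nothing in the new image. A toy version of that copy, using integer constants in place of arch.SignalAct:

package main

import "fmt"

// Dispositions, covering the three cases that matter across fork and exec.
const (
	actDefault = iota
	actIgnore
	actHandler // a user-installed handler address, in the real code
)

// copyForExec keeps only the ignored entries; a missing entry means the
// default action, so dropped handlers revert to default.
func copyForExec(actions map[int]int) map[int]int {
	out := make(map[int]int)
	for sig, act := range actions {
		if act == actIgnore {
			out[sig] = actIgnore
		}
	}
	return out
}

func main() {
	actions := map[int]int{
		1: actIgnore,  // SIGHUP ignored
		2: actHandler, // SIGINT caught
	}
	after := copyForExec(actions)
	fmt.Println(after[1] == actIgnore, after[2] == actDefault) // true true
}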
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package signalfd provides an implementation of signal file descriptors. +package signalfd + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalOperations represent a file with signalfd semantics. +// +// +stateify savable +type SignalOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // target is the original task target. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +// New creates a new signalfd object with the supplied mask. +func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // No task context? Not valid. + return nil, syserror.EINVAL + } + // name matches fs/signalfd.c:signalfd4. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &SignalOperations{ + target: t, + mask: mask, + }), nil +} + +// Release implements fs.FileOperations.Release. +func (s *SignalOperations) Release() {} + +// Mask returns the signal mask. +func (s *SignalOperations) Mask() linux.SignalSet { + s.mu.Lock() + mask := s.mask + s.mu.Unlock() + return mask +} + +// SetMask sets the signal mask. +func (s *SignalOperations) SetMask(mask linux.SignalSet) { + s.mu.Lock() + s.mask = mask + s.mu.Unlock() +} + +// Read implements fs.FileOperations.Read. +func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := s.target.Sigtimedwait(s.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. 
+ var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SignalOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + if mask&waiter.EventIn != 0 && s.target.PendingSignals()&s.Mask() != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SignalOperations) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + // Register for the signal set; ignore the passed events. + s.target.SignalRegister(entry, waiter.EventMask(s.Mask())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SignalOperations) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. + s.target.SignalUnregister(entry) +} diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go new file mode 100644 index 000000000..413111faf --- /dev/null +++ b/pkg/sentry/kernel/syscalls.go @@ -0,0 +1,364 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// maxSyscallNum is the highest supported syscall number. +// +// The types below create fast lookup slices for all syscalls. This maximum +// serves as a sanity check that we don't allocate huge slices for a very large +// syscall. This is checked during registration. +const maxSyscallNum = 2000 + +// SyscallSupportLevel is a syscall support levels. +type SyscallSupportLevel int + +// String returns a human readable represetation of the support level. +func (l SyscallSupportLevel) String() string { + switch l { + case SupportUnimplemented: + return "Unimplemented" + case SupportPartial: + return "Partial Support" + case SupportFull: + return "Full Support" + default: + return "Undocumented" + } +} + +const ( + // SupportUndocumented indicates the syscall is not documented yet. + SupportUndocumented = iota + + // SupportUnimplemented indicates the syscall is unimplemented. + SupportUnimplemented + + // SupportPartial indicates the syscall is partially supported. + SupportPartial + + // SupportFull indicates the syscall is fully supported. + SupportFull +) + +// Syscall includes the syscall implementation and compatibility information. +type Syscall struct { + // Name is the syscall name. + Name string + // Fn is the implementation of the syscall. + Fn SyscallFn + // SupportLevel is the level of support implemented in gVisor. 
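The Readiness check above makes the signalfd readable only when some currently pending signal is also present in the descriptor's mask. The sketch below models signal sets as a 64-bit bitmask (bit N-1 for signal N, as in linux.SignalSet) without any of the kernel.Task plumbing:

package main

import "fmt"

// signalSet is a 64-bit signal mask, one bit per signal.
type signalSet uint64

func mask(sigs ...int) signalSet {
	var s signalSet
	for _, sig := range sigs {
		s |= 1 << uint(sig-1)
	}
	return s
}

// readable mirrors the signalfd Readiness check: the descriptor is readable
// only when some pending signal is also present in the signalfd's mask.
func readable(pending, fdMask signalSet) bool {
	return pending&fdMask != 0
}

func main() {
	const sigINT, sigUSR1 = 2, 10
	fdMask := mask(sigUSR1) // the signalfd only accepts SIGUSR1
	fmt.Println(readable(mask(sigINT), fdMask))          // false
	fmt.Println(readable(mask(sigINT, sigUSR1), fdMask)) // true
}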
+ SupportLevel SyscallSupportLevel + // Note describes the compatibility of the syscall. + Note string + // URLs is set of URLs to any relevant bugs or issues. + URLs []string +} + +// SyscallFn is a syscall implementation. +type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) + +// MissingFn is a syscall to be called when an implementation is missing. +type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) + +// Possible flags for SyscallFlagsTable.enable. +const ( + // syscallPresent indicates that this is not a missing syscall. + // + // This flag is used internally in SyscallFlagsTable. + syscallPresent = 1 << iota + + // StraceEnableLog enables syscall log tracing. + StraceEnableLog + + // StraceEnableEvent enables syscall event tracing. + StraceEnableEvent + + // ExternalBeforeEnable enables the external hook before syscall execution. + ExternalBeforeEnable + + // ExternalAfterEnable enables the external hook after syscall execution. + ExternalAfterEnable +) + +// StraceEnableBits combines both strace log and event flags. +const StraceEnableBits = StraceEnableLog | StraceEnableEvent + +// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall +// basis. +type SyscallFlagsTable struct { + // mu protects writes to the fields below. + // + // Atomic loads are always allowed. Atomic stores are allowed only + // while mu is held. + mu sync.Mutex + + // enable contains the enable bits for each syscall. + // + // missing syscalls have the same value in enable as missingEnable to + // avoid an extra branch in Word. + enable []uint32 + + // missingEnable contains the enable bits for missing syscalls. + missingEnable uint32 +} + +// Init initializes the struct, with all syscalls in table set to enable. +// +// max is the largest syscall number in table. +func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { + e.enable = make([]uint32, max+1) + for num := range table { + e.enable[num] = syscallPresent + } +} + +// Word returns the enable bitfield for sysno. +func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. 
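SyscallFlagsTable above uses a deliberate split: writers serialize on mu, while the per-syscall hot path reads the bitfield with a single atomic load and may observe either the old or the new value during an update. A cut-down table with the same discipline, using invented flag names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Bits mirroring the per-syscall flag scheme above: bit 0 marks the syscall
// as present, the remaining bits toggle optional features.
const (
	flagPresent = 1 << iota
	flagStraceLog
)

// flagsTable follows the SyscallFlagsTable locking discipline: writers take
// mu, readers use only an atomic load.
type flagsTable struct {
	mu     sync.Mutex
	enable []uint32
}

func newFlagsTable(present []uintptr, max uintptr) *flagsTable {
	t := &flagsTable{enable: make([]uint32, max+1)}
	for _, num := range present {
		t.enable[num] = flagPresent
	}
	return t
}

// word is the lock-free read used on every dispatch.
func (t *flagsTable) word(sysno uintptr) uint32 {
	if sysno < uintptr(len(t.enable)) {
		return atomic.LoadUint32(&t.enable[sysno])
	}
	return 0
}

// enableAll sets bit on every present syscall, under the writer lock.
func (t *flagsTable) enableAll(bit uint32) {
	t.mu.Lock()
	defer t.mu.Unlock()
	for num := range t.enable {
		val := atomic.LoadUint32(&t.enable[num])
		if val&flagPresent == 0 {
			continue
		}
		atomic.StoreUint32(&t.enable[num], val|bit)
	}
}

func main() {
	t := newFlagsTable([]uintptr{0, 1, 60}, 60)
	t.enableAll(flagStraceLog)
	fmt.Println(t.word(60)&flagStraceLog != 0) // true
	fmt.Println(t.word(2))                     // 0: not a present syscall
}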
+func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. +// +// Note that a SyscallTable is not savable directly. Instead, they are saved as +// an OS/Arch pair and lookup happens again on restore. +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch + + // The OS version that this syscall table implements. + Version Version + + // AuditNumber is a numeric constant that represents the syscall table. If + // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by + // linux/audit.h. + AuditNumber uint32 + + // Table is the collection of functions. + Table map[uintptr]Syscall + + // lookup is a fixed-size array that holds the syscalls (indexed by + // their numbers). It is used for fast look ups. + lookup []SyscallFn + + // Emulate is a collection of instruction addresses to emulate. The + // keys are addresses, and the values are system call numbers. + Emulate map[usermem.Addr]uintptr + + // The function to call in case of a missing system call. + Missing MissingFn + + // Stracer traces this syscall table. + Stracer Stracer + + // External is used to handle an external callback. + External func(*Kernel) + + // ExternalFilterBefore is called before External is called before the syscall is executed. + // External is not called if it returns false. + ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool + + // ExternalFilterAfter is called before External is called after the syscall is executed. + // External is not called if it returns false. + ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool + + // FeatureEnable stores the strace and one-shot enable bits. + FeatureEnable SyscallFlagsTable +} + +// MaxSysno returns the largest system call number. +func (s *SyscallTable) MaxSysno() (max uintptr) { + for num := range s.Table { + if num > max { + max = num + } + } + return max +} + +// allSyscallTables contains all known tables. +var allSyscallTables []*SyscallTable + +// SyscallTables returns a read-only slice of registered SyscallTables. +func SyscallTables() []*SyscallTable { + return allSyscallTables +} + +// LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. +func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { + for _, s := range allSyscallTables { + if s.OS == os && s.Arch == a { + return s, true + } + } + return nil, false +} + +// RegisterSyscallTable registers a new syscall table for use by a Kernel. 
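The Stracer interface above threads per-call private data from SyscallEnter to SyscallExit, which avoids any side table keyed by task. A toy tracer showing that hand-off by timing a fake syscall; the real interface also receives the Task, the argument registers, and the feature flags.

package main

import (
	"fmt"
	"time"
)

// tracer mirrors the shape of the Stracer interface above: whatever Enter
// returns is handed back to Exit, so per-call state (here, a start time)
// needs no extra bookkeeping.
type tracer interface {
	Enter(sysno uintptr) interface{}
	Exit(private interface{}, sysno, rval uintptr, err error)
}

// logTracer is a toy implementation that measures syscall latency.
type logTracer struct{}

func (logTracer) Enter(sysno uintptr) interface{} {
	return time.Now()
}

func (logTracer) Exit(private interface{}, sysno, rval uintptr, err error) {
	start := private.(time.Time)
	fmt.Printf("sysno=%d rval=%d err=%v took=%s\n", sysno, rval, err, time.Since(start))
}

// invoke shows how a dispatcher would bracket execution with the hooks.
func invoke(t tracer, sysno uintptr, fn func() (uintptr, error)) (uintptr, error) {
	private := t.Enter(sysno)
	rval, err := fn()
	t.Exit(private, sysno, rval, err)
	return rval, err
}

func main() {
	invoke(logTracer{}, 39 /* a stand-in syscall number */, func() (uintptr, error) {
		return 1234, nil
	})
}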
+func RegisterSyscallTable(s *SyscallTable) { + if max := s.MaxSysno(); max > maxSyscallNum { + panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) + } + if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { + panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) + } + allSyscallTables = append(allSyscallTables, s) + s.Init() +} + +// Init initializes the system call table. +// +// This should normally be called only during registration. +func (s *SyscallTable) Init() { + if s.Table == nil { + // Ensure non-nil lookup table. + s.Table = make(map[uintptr]Syscall) + } + if s.Emulate == nil { + // Ensure non-nil emulate table. + s.Emulate = make(map[usermem.Addr]uintptr) + } + + max := s.MaxSysno() // Checked during RegisterSyscallTable. + + // Initialize the fast-lookup table. + s.lookup = make([]SyscallFn, max+1) + for num, sc := range s.Table { + s.lookup[num] = sc.Fn + } + + // Initialize all features. + s.FeatureEnable.init(s.Table, max) +} + +// Lookup returns the syscall implementation, if one exists. +func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { + if sysno < uintptr(len(s.lookup)) { + return s.lookup[sysno] + } + + return nil +} + +// LookupName looks up a syscall name. +func (s *SyscallTable) LookupName(sysno uintptr) string { + if sc, ok := s.Table[sysno]; ok { + return sc.Name + } + return fmt.Sprintf("sys_%d", sysno) // Unlikely. +} + +// LookupEmulate looks up an emulation syscall number. +func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { + sysno, ok := s.Emulate[addr] + return sysno, ok +} + +// mapLookup is similar to Lookup, except that it only uses the syscall table, +// that is, it skips the fast look array. This is available for benchmarking. +func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { + if sc, ok := s.Table[sysno]; ok { + return sc.Fn + } + return nil +} diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go new file mode 100644 index 000000000..90f890495 --- /dev/null +++ b/pkg/sentry/kernel/syscalls_state.go @@ -0,0 +1,47 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// syscallTableInfo is used to reload the SyscallTable. +// +// +stateify savable +type syscallTableInfo struct { + OS abi.OS + Arch arch.Arch +} + +// saveSt saves the SyscallTable. +func (tc *TaskContext) saveSt() syscallTableInfo { + return syscallTableInfo{ + OS: tc.st.OS, + Arch: tc.st.Arch, + } +} + +// loadSt loads the SyscallTable. +func (tc *TaskContext) loadSt(sti syscallTableInfo) { + st, ok := LookupSyscallTable(sti.OS, sti.Arch) + if !ok { + panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) + } + tc.st = st // Save the table reference. 
+} diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go new file mode 100644 index 000000000..4607cde2f --- /dev/null +++ b/pkg/sentry/kernel/syslog.go @@ -0,0 +1,108 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math/rand" + + "gvisor.dev/gvisor/pkg/sync" +) + +// syslog represents a sentry-global kernel log. +// +// Currently, it contains only fun messages for a dmesg easter egg. +// +// +stateify savable +type syslog struct { + // mu protects the below. + mu sync.Mutex `state:"nosave"` + + // msg is the syslog message buffer. It is lazily initialized. + msg []byte +} + +// Log returns a copy of the syslog. +func (s *syslog) Log() []byte { + s.mu.Lock() + defer s.mu.Unlock() + + if s.msg != nil { + // Already initialized, just return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o + } + + // Not initialized, create message. + allMessages := []string{ + "Synthesizing system calls...", + "Mounting deweydecimalfs...", + "Moving files to filing cabinet...", + "Digging up root...", + "Constructing home...", + "Segmenting fault lines...", + "Creating bureaucratic processes...", + "Searching for needles in stacks...", + "Preparing for the zombie uprising...", + "Feeding the init monster...", + "Creating cloned children...", + "Daemonizing children...", + "Waiting for children...", + "Gathering forks...", + "Committing treasure map to memory...", + "Reading process obituaries...", + "Searching for socket adapter...", + "Creating process schedule...", + "Generating random numbers by fair dice roll...", + "Rewriting operating system in Javascript...", + "Reticulating splines...", + "Consulting tar man page...", + "Forking spaghetti code...", + "Checking naughty and nice process list...", + "Checking naughty and nice process list...", // Check it up to twice. + "Granting licence to kill(2)...", // British spelling for British movie. + "Letting the watchdogs out...", + } + + selectMessage := func() string { + i := rand.Intn(len(allMessages)) + m := allMessages[i] + + // Delete the selected message. + allMessages[i] = allMessages[len(allMessages)-1] + allMessages = allMessages[:len(allMessages)-1] + + return m + } + + const format = "<6>[%11.6f] %s\n" + + s.msg = append(s.msg, []byte(fmt.Sprintf(format, 0.0, "Starting gVisor..."))...) + + time := 0.1 + for i := 0; i < 10; i++ { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) + } + + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) + + // Return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o +} diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go new file mode 100644 index 000000000..32cf47e05 --- /dev/null +++ b/pkg/sentry/kernel/table_test.go @@ -0,0 +1,110 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +const ( + maxTestSyscall = 1000 +) + +func createSyscallTable() *SyscallTable { + m := make(map[uintptr]Syscall) + for i := uintptr(0); i <= maxTestSyscall; i++ { + j := i + m[i] = Syscall{ + Fn: func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + }, + } + } + + s := &SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Table: m, + } + + RegisterSyscallTable(s) + return s +} + +func TestTable(t *testing.T) { + table := createSyscallTable() + defer func() { + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} + }() + + // Go through all functions and check that they return the right value. + for i := uintptr(0); i < maxTestSyscall; i++ { + fn := table.Lookup(i) + if fn == nil { + t.Errorf("Syscall %v is set to nil", i) + continue + } + + v, _, _ := fn(nil, arch.SyscallArguments{}) + if v != i { + t.Errorf("Wrong return value for syscall %v: expected %v, got %v", i, i, v) + } + } + + // Check that values outside the range return nil. + for i := uintptr(maxTestSyscall + 1); i < maxTestSyscall+100; i++ { + fn := table.Lookup(i) + if fn != nil { + t.Errorf("Syscall %v is not nil: %v", i, fn) + continue + } + } +} + +func BenchmarkTableLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.Lookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} + +func BenchmarkTableMapLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.mapLookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go new file mode 100644 index 000000000..f48247c94 --- /dev/null +++ b/pkg/sentry/kernel/task.go @@ -0,0 +1,886 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernel + +import ( + gocontext "context" + "runtime/trace" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Task represents a thread of execution in the untrusted app. It +// includes registers and any thread-specific state that you would +// normally expect. +// +// Each task is associated with a goroutine, called the task goroutine, that +// executes code (application code, system calls, etc.) on behalf of that task. +// See Task.run (task_run.go). +// +// All fields that are "owned by the task goroutine" can only be mutated by the +// task goroutine while it is running. The task goroutine does not require +// synchronization to read these fields, although it still requires +// synchronization as described for those fields to mutate them. +// +// All fields that are "exclusive to the task goroutine" can only be accessed +// by the task goroutine while it is running. The task goroutine does not +// require synchronization to read or write these fields. +// +// +stateify savable +type Task struct { + taskNode + + // runState is what the task goroutine is executing if it is not stopped. + // If runState is nil, the task goroutine should exit or has exited. + // runState is exclusive to the task goroutine. + runState taskRunState + + // haveSyscallReturn is true if tc.Arch().Return() represents a value + // returned by a syscall (or set by ptrace after a syscall). + // + // haveSyscallReturn is exclusive to the task goroutine. + haveSyscallReturn bool + + // interruptChan is notified whenever the task goroutine is interrupted + // (usually by a pending signal). interruptChan is effectively a condition + // variable that can be used in select statements. + // + // interruptChan is not saved; because saving interrupts all tasks, + // interruptChan is always notified after restore (see Task.run). + interruptChan chan struct{} `state:"nosave"` + + // gosched contains the current scheduling state of the task goroutine. + // + // gosched is protected by goschedSeq. gosched is owned by the task + // goroutine. + goschedSeq sync.SeqCount `state:"nosave"` + gosched TaskGoroutineSchedInfo + + // yieldCount is the number of times the task goroutine has called + // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or + // Task.Yield(), voluntarily ceasing execution. + // + // yieldCount is accessed using atomic memory operations. yieldCount is + // owned by the task goroutine. + yieldCount uint64 + + // pendingSignals is the set of pending signals that may be handled only by + // this task. + // + // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu + // (hereafter "the signal mutex"); see comment on + // ThreadGroup.signalHandlers. 
+ pendingSignals pendingSignals + + // signalMask is the set of signals whose delivery is currently blocked. + // + // signalMask is accessed using atomic memory operations, and is protected + // by the signal mutex (such that reading signalMask is safe if either the + // signal mutex is locked or if atomic memory operations are used, while + // writing signalMask requires both). signalMask is owned by the task + // goroutine. + signalMask linux.SignalSet + + // If the task goroutine is currently executing Task.sigtimedwait, + // realSignalMask is the previous value of signalMask, which has temporarily + // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. + // + // realSignalMask is exclusive to the task goroutine. + realSignalMask linux.SignalSet + + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that + // should be applied after the task has either delivered one signal to a + // user handler or is about to resume execution in the untrusted + // application. + // + // Both haveSavedSignalMask and savedSignalMask are exclusive to the task + // goroutine. + haveSavedSignalMask bool + savedSignalMask linux.SignalSet + + // signalStack is the alternate signal stack used by signal handlers for + // which the SA_ONSTACK flag is set. + // + // signalStack is exclusive to the task goroutine. + signalStack arch.SignalStack + + // signalQueue is a set of registered waiters for signal-related events. + // + // signalQueue is protected by the signalMutex. Note that the task does + // not implement all queue methods, specifically the readiness checks. + // The task only broadcast a notification on signal delivery. + signalQueue waiter.Queue `state:"zerovalue"` + + // If groupStopPending is true, the task should participate in a group + // stop in the interrupt path. + // + // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. + // + // groupStopPending is protected by the signal mutex. + groupStopPending bool + + // If groupStopAcknowledged is true, the task has already acknowledged that + // it is entering the most recent group stop that has been initiated on its + // thread group. + // + // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. + // + // groupStopAcknowledged is protected by the signal mutex. + groupStopAcknowledged bool + + // If trapStopPending is true, the task goroutine should enter a + // PTRACE_INTERRUPT-induced stop from the interrupt path. + // + // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects + // JOBCTL_STOP_PENDING. + // + // trapStopPending is protected by the signal mutex. + trapStopPending bool + + // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group + // stop has begun or ended since the last time the task entered a + // ptrace-stop from the group-stop path. + // + // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. + // + // trapNotifyPending is protected by the signal mutex. + trapNotifyPending bool + + // If stop is not nil, it is the internally-initiated condition that + // currently prevents the task goroutine from running. + // + // stop is protected by the signal mutex. + stop TaskStop + + // stopCount is the number of active external stops (calls to + // Task.BeginExternalStop that have not been paired with a call to + // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is + // non-zero if the task goroutine should stop. 
+ // + // Mutating stopCount requires both locking the signal mutex and using + // atomic memory operations. Reading stopCount requires either locking the + // signal mutex or using atomic memory operations. This allows Task.doStop + // to require only a single atomic read in the common case where stopCount + // is 0. + // + // stopCount is not saved, because external stops cannot be retained across + // a save/restore cycle. (Suppose a sentryctl command issues an external + // stop; after a save/restore cycle, the restored sentry has no knowledge + // of the pre-save sentryctl command, and the stopped task would remain + // stopped forever.) + stopCount int32 `state:"nosave"` + + // endStopCond is signaled when stopCount transitions to 0. The combination + // of stopCount and endStopCond effectively form a sync.WaitGroup, but + // WaitGroup provides no way to read its counter value. + // + // Invariant: endStopCond.L is the signal mutex. (This is not racy because + // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine + // calls sync.Cond.Wait; and only the task goroutine can change the + // identity of the signal mutex, in Task.finishExec.) + endStopCond sync.Cond `state:"nosave"` + + // exitStatus is the task's exit status. + // + // exitStatus is protected by the signal mutex. + exitStatus ExitStatus + + // syscallRestartBlock represents a custom restart function to run in + // restart_syscall(2) to resume an interrupted syscall. + // + // syscallRestartBlock is exclusive to the task goroutine. + syscallRestartBlock SyscallRestartBlock + + // p provides the mechanism by which the task runs code in userspace. The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + + // containerID has no equivalent in Linux; it's used by runsc to track all + // tasks that belong to a given containers since cgroups aren't implemented. + // It's inherited by the children, is immutable, and may be empty. + // + // NOTE: cgroups can be used to track this when implemented. + containerID string + + // mu protects some of the following fields. + mu sync.Mutex `state:"nosave"` + + // tc holds task data provided by the ELF loader. + // + // tc is protected by mu, and is owned by the task goroutine. + tc TaskContext + + // fsContext is the task's filesystem context. + // + // fsContext is protected by mu, and is owned by the task goroutine. + fsContext *FSContext + + // fdTable is the task's file descriptor table. + // + // fdTable is protected by mu, and is owned by the task goroutine. + fdTable *FDTable + + // If vforkParent is not nil, it is the task that created this task with + // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when + // this TaskContext is released. + // + // vforkParent is protected by the TaskSet mutex. + vforkParent *Task + + // exitState is the task's progress through the exit path. + // + // exitState is protected by the TaskSet mutex. exitState is owned by the + // task goroutine. + exitState TaskExitState + + // exitTracerNotified is true if the exit path has either signaled the + // task's tracer to indicate the exit, or determined that no such signal is + // needed. exitTracerNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitTracerNotified is protected by the TaskSet mutex. 
+ exitTracerNotified bool + + // exitTracerAcked is true if exitTracerNotified is true and either the + // task's tracer has acknowledged the exit notification, or the exit path + // has determined that no such notification is needed. + // + // exitTracerAcked is protected by the TaskSet mutex. + exitTracerAcked bool + + // exitParentNotified is true if the exit path has either signaled the + // task's parent to indicate the exit, or determined that no such signal is + // needed. exitParentNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitParentNotified is protected by the TaskSet mutex. + exitParentNotified bool + + // exitParentAcked is true if exitParentNotified is true and either the + // task's parent has acknowledged the exit notification, or the exit path + // has determined that no such acknowledgment is needed. + // + // exitParentAcked is protected by the TaskSet mutex. + exitParentAcked bool + + // goroutineStopped is a WaitGroup whose counter value is 1 when the task + // goroutine is running and 0 when the task goroutine is stopped or has + // exited. + goroutineStopped sync.WaitGroup `state:"nosave"` + + // ptraceTracer is the task that is ptrace-attached to this one. If + // ptraceTracer is nil, this task is not being traced. Note that due to + // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil + // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). + // + // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic + // operations. This allows paths that wouldn't otherwise lock the TaskSet + // mutex, notably the syscall path, to check if ptraceTracer is nil without + // additional synchronization. + ptraceTracer atomic.Value `state:".(*Task)"` + + // ptraceTracees is the set of tasks that this task is ptrace-attached to. + // + // ptraceTracees is protected by the TaskSet mutex. + ptraceTracees map[*Task]struct{} + + // ptraceSeized is true if ptraceTracer attached to this task with + // PTRACE_SEIZE. + // + // ptraceSeized is protected by the TaskSet mutex. + ptraceSeized bool + + // ptraceOpts contains ptrace options explicitly set by the tracer. If + // ptraceTracer is nil, ptraceOpts is expected to be the zero value. + // + // ptraceOpts is protected by the TaskSet mutex. + ptraceOpts ptraceOptions + + // ptraceSyscallMode controls ptrace behavior around syscall entry and + // exit. + // + // ptraceSyscallMode is protected by the TaskSet mutex. + ptraceSyscallMode ptraceSyscallMode + + // If ptraceSinglestep is true, the next time the task executes application + // code, single-stepping should be enabled. ptraceSinglestep is stored + // independently of the architecture-specific trap flag because tracer + // detaching (which can happen concurrently with the tracee's execution if + // the tracer exits) must disable single-stepping, and the task's + // architectural state is implicitly exclusive to the task goroutine (no + // synchronization occurs before passing registers to SwitchToApp). + // + // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. + // + // ptraceSinglestep is protected by the TaskSet mutex. + ptraceSinglestep bool + + // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the + // time that t entered the ptrace stop, reset to 0 when the tracer + // acknowledges the stop with a wait*() syscall. Otherwise, it is the + // signal number passed to the ptrace operation that ended the last ptrace + // stop on this task. 
In the latter case, the effect of ptraceCode depends + // on the nature of the ptrace stop; signal-delivery-stop uses it to + // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the + // signal to the task after leaving the stop, and PTRACE_EVENT stops and + // traced group stops ignore it entirely. + // + // Linux contextually stores the equivalent of ptraceCode in + // task_struct::exit_code. + // + // ptraceCode is protected by the TaskSet mutex. + ptraceCode int32 + + // ptraceSiginfo is the value returned to the tracer by + // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). + // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) + // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is + // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which + // is in turn required to distinguish group stops from other ptrace stops, + // per subsection "Group-stop" in ptrace(2)). + // + // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. + // + // ptraceSiginfo is protected by the TaskSet mutex. + ptraceSiginfo *arch.SignalInfo + + // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to + // the tracer by ptrace(PTRACE_GETEVENTMSG). + // + // ptraceEventMsg is protected by the TaskSet mutex. + ptraceEventMsg uint64 + + // The struct that holds the IO-related usage. The ioUsage pointer is + // immutable. + ioUsage *usage.IO + + // logPrefix is a string containing the task's thread ID in the root PID + // namespace, and is prepended to log messages emitted by Task.Infof etc. + logPrefix atomic.Value `state:"nosave"` + + // traceContext and traceTask are both used for tracing, and are + // updated along with the logPrefix in updateInfoLocked. + // + // These are exclusive to the task goroutine. + traceContext gocontext.Context `state:"nosave"` + traceTask *trace.Task `state:"nosave"` + + // creds is the task's credentials. + // + // creds.Load() may be called without synchronization. creds.Store() is + // serialized by mu. creds is owned by the task goroutine. All + // auth.Credentials objects that creds may point to, or have pointed to + // in the past, must be treated as immutable. + creds auth.AtomicPtrCredentials + + // utsns is the task's UTS namespace. + // + // utsns is protected by mu. utsns is owned by the task goroutine. + utsns *UTSNamespace + + // ipcns is the task's IPC namespace. + // + // ipcns is protected by mu. ipcns is owned by the task goroutine. + ipcns *IPCNamespace + + // abstractSockets tracks abstract sockets that are in use. + // + // abstractSockets is protected by mu. + abstractSockets *AbstractSocketNamespace + + // mountNamespaceVFS2 is the task's mount namespace. + // + // It is protected by mu. It is owned by the task goroutine. + mountNamespaceVFS2 *vfs.MountNamespace + + // parentDeathSignal is sent to this task's thread group when its parent exits. + // + // parentDeathSignal is protected by mu. + parentDeathSignal linux.Signal + + // syscallFilters is all seccomp-bpf syscall filters applicable to the + // task, in the order in which they were installed. The type of the atomic + // is []bpf.Program. Writing needs to be protected by the signal mutex. + // + // syscallFilters is owned by the task goroutine. 
+ syscallFilters atomic.Value `state:".([]bpf.Program)"` + + // If cleartid is non-zero, treat it as a pointer to a ThreadID in the + // task's virtual address space; when the task exits, set the pointed-to + // ThreadID to 0, and wake any futex waiters. + // + // cleartid is exclusive to the task goroutine. + cleartid usermem.Addr + + // This is mostly a fake cpumask just for sched_set/getaffinity as we + // don't really control the affinity. + // + // Invariant: allowedCPUMask.Size() == + // sched.CPUMaskSize(Kernel.applicationCores). + // + // allowedCPUMask is protected by mu. + allowedCPUMask sched.CPUSet + + // cpu is the fake cpu number returned by getcpu(2). cpu is ignored + // entirely if Kernel.useHostCores is true. + // + // cpu is accessed using atomic memory operations. + cpu int32 + + // This is used to keep track of changes made to a process' priority/niceness. + // It is mostly used to provide some reasonable return value from + // getpriority(2) after a call to setpriority(2) has been made. + // We currently do not actually modify a process' scheduling priority. + // NOTE: This represents the userspace view of priority (nice). + // This means that the value should be in the range [-20, 19]. + // + // niceness is protected by mu. + niceness int + + // This is used to track the numa policy for the current thread. This can be + // modified through a set_mempolicy(2) syscall. Since we always report a + // single numa node, all policies are no-ops. We only track this information + // so that we can return reasonable values if the application calls + // get_mempolicy(2) after setting a non-default policy. Note that in the + // real syscall, nodemask can be longer than a single unsigned long, but we + // always report a single node so never need to save more than a single + // bit. + // + // numaPolicy and numaNodeMask are protected by mu. + numaPolicy linux.NumaPolicy + numaNodeMask uint64 + + // netns is the task's network namespace. netns is never nil. + // + // netns is protected by mu. + netns *inet.Namespace + + // If rseqPreempted is true, before the next call to p.Switch(), + // interrupt rseq critical regions as defined by rseqAddr and + // tg.oldRSeqCritical and write the task goroutine's CPU number to + // rseqAddr/oldRSeqCPUAddr. + // + // We support two ABIs for restartable sequences: + // + // 1. The upstream interface added in v4.18, + // 2. An "old" interface never merged upstream. In the implementation, + // this is referred to as "old rseq". + // + // rseqPreempted is exclusive to the task goroutine. + rseqPreempted bool `state:"nosave"` + + // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. + // + // If rseq is unused, rseqCPU is -1 for convenient use in + // platform.Context.Switch. + // + // rseqCPU is exclusive to the task goroutine. + rseqCPU int32 + + // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. + // + // oldRSeqCPUAddr is exclusive to the task goroutine. + oldRSeqCPUAddr usermem.Addr + + // rseqAddr is a pointer to the userspace linux.RSeq structure. + // + // rseqAddr is exclusive to the task goroutine. + rseqAddr usermem.Addr + + // rseqSignature is the signature that the rseq abort IP must be signed + // with. + // + // rseqSignature is exclusive to the task goroutine. + rseqSignature uint32 + + // copyScratchBuffer is a buffer available to CopyIn/CopyOut + // implementations that require an intermediate buffer to copy data + // into/out of. 
It prevents these buffers from being allocated/zeroed in + // each syscall and eventually garbage collected. + // + // copyScratchBuffer is exclusive to the task goroutine. + copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` + + // blockingTimer is used for blocking timeouts. blockingTimerChan is the + // channel that is sent to when blockingTimer fires. + // + // blockingTimer is exclusive to the task goroutine. + blockingTimer *ktime.Timer `state:"nosave"` + blockingTimerChan <-chan struct{} `state:"nosave"` + + // futexWaiter is used for futex(FUTEX_WAIT) syscalls. + // + // futexWaiter is exclusive to the task goroutine. + futexWaiter *futex.Waiter `state:"nosave"` + + // startTime is the real time at which the task started. It is set when + // a Task is created or invokes execve(2). + // + // startTime is protected by mu. + startTime ktime.Time +} + +func (t *Task) savePtraceTracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +func (t *Task) loadPtraceTracer(tracer *Task) { + t.ptraceTracer.Store(tracer) +} + +func (t *Task) saveSyscallFilters() []bpf.Program { + if f := t.syscallFilters.Load(); f != nil { + return f.([]bpf.Program) + } + return nil +} + +func (t *Task) loadSyscallFilters(filters []bpf.Program) { + t.syscallFilters.Store(filters) +} + +// afterLoad is invoked by stateify. +func (t *Task) afterLoad() { + t.updateInfoLocked() + t.interruptChan = make(chan struct{}, 1) + t.gosched.State = TaskGoroutineNonexistent + if t.stop != nil { + t.stopCount = 1 + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.p = t.k.Platform.NewContext() + t.rseqPreempted = true + t.futexWaiter = futex.NewWaiter() +} + +// copyScratchBufferLen is the length of Task.copyScratchBuffer. +const copyScratchBufferLen = 144 // sizeof(struct stat) + +// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut +// functions. It must only be used within those functions and can only be used +// by the task goroutine; it exists to improve performance and thus +// intentionally lacks any synchronization. +// +// Callers should pass a constant value as an argument if possible, which will +// allow the compiler to inline and optimize out the if statement below. +func (t *Task) CopyScratchBuffer(size int) []byte { + if size > copyScratchBufferLen { + return make([]byte, size) + } + return t.copyScratchBuffer[:size] +} + +// FutexWaiter returns the Task's futex.Waiter. +func (t *Task) FutexWaiter() *futex.Waiter { + return t.futexWaiter +} + +// Kernel returns the Kernel containing t. +func (t *Task) Kernel() *Kernel { + return t.k +} + +// Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine (as implied +// by the requirements of context.Context). 
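+//
+// For illustration only (ctx stands for any context backed by a Task; this
+// example is not part of the original change), sentry code typically recovers
+// kernel objects through the keys handled below:
+//
+//	if k, ok := ctx.Value(CtxKernel).(*Kernel); ok {
+//		_ = k.RealtimeClock()
+//	}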
+func (t *Task) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + return t.utsns + case CtxIPCNamespace: + return t.ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.Credentials() + case context.CtxThreadGroupID: + return int32(t.ThreadGroup().ID()) + case fs.CtxRoot: + return t.fsContext.RootDirectory() + case vfs.CtxRoot: + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter + case inet.CtxStack: + return t.NetworkContext() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k + default: + return nil + } +} + +// SetClearTID sets t's cleartid. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetClearTID(addr usermem.Addr) { + t.cleartid = addr +} + +// SetSyscallRestartBlock sets the restart block for use in +// restart_syscall(2). After registering a restart block, a syscall should +// return ERESTART_RESTARTBLOCK to request a restart using the block. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { + t.syscallRestartBlock = r +} + +// SyscallRestartBlock returns the currently registered restart block for use in +// restart_syscall(2). This function is *not* idempotent and may be called once +// per syscall. This function must not be called if a restart block has not been +// registered for the current syscall. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SyscallRestartBlock() SyscallRestartBlock { + r := t.syscallRestartBlock + // Explicitly set the restart block to nil so that a future syscall can't + // accidentally reuse it. + t.syscallRestartBlock = nil + return r +} + +// IsChrooted returns true if the root directory of t's FSContext is not the +// root directory of t's MountNamespace. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) IsChrooted() bool { + if VFS2Enabled { + realRoot := t.mountNamespaceVFS2.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectoryVFS2() + defer root.DecRef() + return root != realRoot + } + + realRoot := t.tg.mounts.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectory() + if root != nil { + defer root.DecRef() + } + return root != realRoot +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// FSContext returns t's FSContext. FSContext does not take an additional +// reference on the returned FSContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) FSContext() *FSContext { + return t.fsContext +} + +// FDTable returns t's FDTable. 
FDTable does not take an additional reference
+// on the returned FDTable.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) FDTable() *FDTable {
+	return t.fdTable
+}
+
+// GetFile is a convenience wrapper for t.FDTable().Get.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) GetFile(fd int32) *fs.File {
+	f, _ := t.fdTable.Get(fd)
+	return f
+}
+
+// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
+	f, _ := t.fdTable.GetVFS2(fd)
+	return f
+}
+
+// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDs(t, fd, files, flags)
+}
+
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
+// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
+	fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
+	if err != nil {
+		return 0, err
+	}
+	return fds[0], nil
+}
+
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
+// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
+	return t.fdTable.NewFDAt(t, fd, file, flags)
+}
+
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
+// WithMuLocked executes f with t.mu locked.
+func (t *Task) WithMuLocked(f func(*Task)) {
+	t.mu.Lock()
+	f(t)
+	t.mu.Unlock()
+}
+
+// MountNamespace returns t's MountNamespace. MountNamespace does not take an
+// additional reference on the returned MountNamespace.
+func (t *Task) MountNamespace() *fs.MountNamespace {
+	return t.tg.mounts
+}
+
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mountNamespaceVFS2.IncRef()
+	return t.mountNamespaceVFS2
+}
+
+// AbstractSockets returns t's AbstractSocketNamespace.
+func (t *Task) AbstractSockets() *AbstractSocketNamespace {
+	return t.abstractSockets
+}
+
+// ContainerID returns t's container ID.
+func (t *Task) ContainerID() string {
+	return t.containerID
+}
+
+// OOMScoreAdj gets the task's thread group's OOM score adjustment.
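+//
+// Illustrative sketch (hypothetical caller, not part of this change); values
+// outside [-1000, 1000] are rejected, mirroring Linux:
+//
+//	_ = t.SetOOMScoreAdj(500)     // accepted
+//	err := t.SetOOMScoreAdj(2000) // rejected with EINVAL
+//	adj := t.OOMScoreAdj()        // still 500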
+func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) +} + +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. +func (t *Task) SetOOMScoreAdj(adj int32) error { + if adj > 1000 || adj < -1000 { + return syserror.EINVAL + } + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) + return nil +} + +// UID returns t's uid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) UID() uint32 { + return uint32(t.Credentials().EffectiveKUID) +} + +// GID returns t's gid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) GID() uint32 { + return uint32(t.Credentials().EffectiveKGID) +} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go new file mode 100644 index 000000000..5f3e60fe8 --- /dev/null +++ b/pkg/sentry/kernel/task_acct.go @@ -0,0 +1,196 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// Accounting, limits, timers. + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getitimer implements getitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { + var tm ktime.Time + var s ktime.Setting + switch id { + case linux.ITIMER_REAL: + tm, s = t.tg.itimerRealTimer.Get() + case linux.ITIMER_VIRTUAL: + tm = t.tg.UserCPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerVirtSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + case linux.ITIMER_PROF: + tm = t.tg.CPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerProfSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + default: + return linux.ItimerVal{}, syserror.EINVAL + } + val, iv := ktime.SpecFromSetting(tm, s) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(val), + Interval: linux.DurationToTimeval(iv), + }, nil +} + +// Setitimer implements setitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. 
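+//
+// A hedged usage sketch (the values are arbitrary and not part of this
+// change): arm a one-shot 100ms ITIMER_REAL and keep the previous setting,
+// as setitimer(2) does:
+//
+//	old, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{
+//		Value: linux.DurationToTimeval(100 * time.Millisecond),
+//	})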
+func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { + var tm ktime.Time + var olds ktime.Setting + switch id { + case linux.ITIMER_REAL: + news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) + if err != nil { + return linux.ItimerVal{}, err + } + tm, olds = t.tg.itimerRealTimer.Swap(news) + case linux.ITIMER_VIRTUAL: + c := t.tg.UserCPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerVirtSetting + t.tg.itimerVirtSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + case linux.ITIMER_PROF: + c := t.tg.CPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerProfSetting + t.tg.itimerProfSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + default: + return linux.ItimerVal{}, syserror.EINVAL + } + oldval, oldiv := ktime.SpecFromSetting(tm, olds) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(oldval), + Interval: linux.DurationToTimeval(oldiv), + }, nil +} + +// IOUsage returns the io usage of the thread. +func (t *Task) IOUsage() *usage.IO { + return t.ioUsage +} + +// IOUsage returns the total io usage of all dead and live threads in the group. +func (tg *ThreadGroup) IOUsage() *usage.IO { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + io := *tg.ioUsage + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + io.Accumulate(t.IOUsage()) + } + return &io +} + +// Name returns t's name. +func (t *Task) Name() string { + t.mu.Lock() + defer t.mu.Unlock() + return t.tc.Name +} + +// SetName changes t's name. +func (t *Task) SetName(name string) { + t.mu.Lock() + defer t.mu.Unlock() + t.tc.Name = name + t.Debugf("Set thread name to %q", name) +} + +// Limits implements context.Context.Limits. +func (t *Task) Limits() *limits.LimitSet { + return t.ThreadGroup().Limits() +} + +// StartTime returns t's start time. +func (t *Task) StartTime() ktime.Time { + t.mu.Lock() + defer t.mu.Unlock() + return t.startTime +} + +// MaxRSS returns the maximum resident set size of the task in bytes. which +// should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or +// RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these +// flags. +func (t *Task) MaxRSS(which int32) uint64 { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + switch which { + case linux.RUSAGE_SELF, linux.RUSAGE_THREAD: + // If there's an active mm we can use its value. 
+ if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS { + return mmMaxRSS + } + } + return t.tg.maxRSS + case linux.RUSAGE_CHILDREN: + return t.tg.childMaxRSS + case linux.RUSAGE_BOTH: + maxRSS := t.tg.maxRSS + if maxRSS < t.tg.childMaxRSS { + maxRSS = t.tg.childMaxRSS + } + if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS { + return mmMaxRSS + } + } + return maxRSS + default: + // We'll only get here if which is invalid. + return 0 + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go new file mode 100644 index 000000000..4a4a69ee2 --- /dev/null +++ b/pkg/sentry/kernel/task_block.go @@ -0,0 +1,230 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "runtime/trace" + "time" + + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" +) + +// BlockWithTimeout blocks t until an event is received from C, the application +// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), +// or t is interrupted. It returns: +// +// - The remaining timeout, which is guaranteed to be 0 if the timeout expired, +// and is unspecified if haveTimeout is false. +// +// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout +// expired, and syserror.ErrInterrupted if t is interrupted. +func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { + if !haveTimeout { + return timeout, t.block(C, nil) + } + + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(timeout) + err := t.BlockWithDeadline(C, true, deadline) + + // Timeout, explicitly return a remaining duration of 0. + if err == syserror.ETIMEDOUT { + return 0, err + } + + // Compute the remaining timeout. Note that even if block() above didn't + // return due to a timeout, we may have used up any of the remaining time + // since then. We cap the remaining timeout to 0 to make it easier to + // directly use the returned duration. + end := t.Kernel().MonotonicClock().Now() + remainingTimeout := timeout - end.Sub(start) + if remainingTimeout < 0 { + remainingTimeout = 0 + } + + return remainingTimeout, err +} + +// BlockWithDeadline blocks t until an event is received from C, the +// application monotonic clock indicates a time of deadline (only if +// haveDeadline is true), or t is interrupted. It returns nil if an event is +// received from C, ETIMEDOUT if the deadline expired, and +// syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { + if !haveDeadline { + return t.block(C, nil) + } + + // Start the timeout timer. 
+	t.blockingTimer.Swap(ktime.Setting{
+		Enabled: true,
+		Next:    deadline,
+	})
+
+	err := t.block(C, t.blockingTimerChan)
+
+	// Stop the timeout timer and drain the channel.
+	t.blockingTimer.Swap(ktime.Setting{})
+	select {
+	case <-t.blockingTimerChan:
+	default:
+	}
+
+	return err
+}
+
+// BlockWithTimer blocks t until an event is received from C or tchan, or t is
+// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an
+// event is received from tchan, and syserror.ErrInterrupted if t is
+// interrupted.
+//
+// Most clients should use BlockWithDeadline or BlockWithTimeout instead.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error {
+	return t.block(C, tchan)
+}
+
+// Block blocks t until an event is received from C or t is interrupted. It
+// returns nil if an event is received from C and syserror.ErrInterrupted if t
+// is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Block(C <-chan struct{}) error {
+	return t.block(C, nil)
+}
+
+// block blocks a task on one of many events.
+// N.B. defer is too expensive to be used here.
+func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
+	// Fast path if the request is already done.
+	select {
+	case <-C:
+		return nil
+	default:
+	}
+
+	// Deactivate our address space; we don't need it.
+	interrupt := t.SleepStart()
+
+	// If the request is not completed, but the timer has already expired,
+	// then ensure that we run through a scheduler cycle. This is because
+	// we may see applications relying on timer slack to yield the thread.
+	// For example, they may attempt to sleep for some number of nanoseconds,
+	// and expect that this will actually yield the CPU and sleep for at
+	// least microseconds, e.g.:
+	// https://github.com/LMAX-Exchange/disruptor/commit/6ca210f2bcd23f703c479804d583718e16f43c07
+	if len(timerChan) > 0 {
+		runtime.Gosched()
+	}
+
+	region := trace.StartRegion(t.traceContext, blockRegion)
+	select {
+	case <-C:
+		region.End()
+		t.SleepFinish(true)
+		// Woken by event.
+		return nil
+
+	case <-interrupt:
+		region.End()
+		t.SleepFinish(false)
+		// Return the indicated error on interrupt.
+		return syserror.ErrInterrupted
+
+	case <-timerChan:
+		region.End()
+		t.SleepFinish(true)
+		// We've timed out.
+		return syserror.ETIMEDOUT
+	}
+}
+
+// SleepStart implements amutex.Sleeper.SleepStart.
+func (t *Task) SleepStart() <-chan struct{} {
+	t.Deactivate()
+	t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
+	return t.interruptChan
+}
+
+// SleepFinish implements amutex.Sleeper.SleepFinish.
+func (t *Task) SleepFinish(success bool) {
+	if !success {
+		// The interrupted notification is consumed only at the top-level
+		// (Run). Therefore we attempt to reset the pending notification.
+		// This will also elide our next entry back into the task, so we
+		// will process signals, state changes, etc.
+		t.interruptSelf()
+	}
+	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
+	t.Activate()
+}
+
+// Interrupted implements amutex.Sleeper.Interrupted.
+func (t *Task) Interrupted() bool {
+	return len(t.interruptChan) != 0
+}
+
+// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
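+//
+// The expected bracketing looks roughly like the following sketch
+// (doHostOperation is a placeholder, not a real function in this package):
+//
+//	t.UninterruptibleSleepStart(true)  // deactivate the address space
+//	doHostOperation()
+//	t.UninterruptibleSleepFinish(true) // reactivate it afterwards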
+func (t *Task) UninterruptibleSleepStart(deactivate bool) { + if deactivate { + t.Deactivate() + } + t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible) +} + +// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish. +func (t *Task) UninterruptibleSleepFinish(activate bool) { + t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible) + if activate { + t.Activate() + } +} + +// interrupted returns true if interrupt or interruptSelf has been called at +// least once since the last call to interrupted. +func (t *Task) interrupted() bool { + select { + case <-t.interruptChan: + return true + default: + return false + } +} + +// interrupt unblocks the task and interrupts it if it's currently running in +// userspace. +func (t *Task) interrupt() { + t.interruptSelf() + t.p.Interrupt() +} + +// interruptSelf is like Interrupt, but can only be called by the task +// goroutine. +func (t *Task) interruptSelf() { + select { + case t.interruptChan <- struct{}{}: + t.Debugf("Interrupt queued") + default: + t.Debugf("Dropping duplicate interrupt") + } + // platform.Context.Interrupt() is unnecessary since a task goroutine + // calling interruptSelf() cannot also be blocked in + // platform.Context.Switch(). +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go new file mode 100644 index 000000000..e1ecca99e --- /dev/null +++ b/pkg/sentry/kernel/task_clone.go @@ -0,0 +1,540 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. 
+ NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. + NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) + ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. + Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. 
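These options are filled in from clone(2) flag bits by the syscall layer. The sketch below is editorial: it assumes the CLONE_* constants in pkg/abi/linux, omits the TID/TLS addresses and several flags, and uses a hypothetical exitSignalMask in place of Linux's CSIGNAL. It highlights the inversion: flags like CLONE_VM mean "share", so the corresponding New* field is set when the flag is absent, while the CLONE_NEW* namespace flags map directly.

package sketch

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/sentry/kernel"
    "gvisor.dev/gvisor/pkg/usermem"
)

// exitSignalMask is a hypothetical stand-in for Linux's CSIGNAL (low byte).
const exitSignalMask = 0xff

// cloneOptsFromFlags is an illustrative, incomplete translation of clone(2)
// flags into the options defined above.
func cloneOptsFromFlags(flags uint64, stack usermem.Addr) kernel.CloneOptions {
    return kernel.CloneOptions{
        SharingOptions: kernel.SharingOptions{
            NewAddressSpace:     flags&linux.CLONE_VM == 0,
            NewSignalHandlers:   flags&linux.CLONE_SIGHAND == 0,
            NewThreadGroup:      flags&linux.CLONE_THREAD == 0,
            TerminationSignal:   linux.Signal(flags & exitSignalMask),
            NewPIDNamespace:     flags&linux.CLONE_NEWPID != 0,
            NewUserNamespace:    flags&linux.CLONE_NEWUSER != 0,
            NewNetworkNamespace: flags&linux.CLONE_NEWNET != 0,
            NewFiles:            flags&linux.CLONE_FILES == 0,
            NewFSContext:        flags&linux.CLONE_FS == 0,
            NewUTSNamespace:     flags&linux.CLONE_NEWUTS != 0,
            NewIPCNamespace:     flags&linux.CLONE_NEWIPC != 0,
        },
        Stack:  stack,
        SetTLS: flags&linux.CLONE_SETTLS != 0,
        Vfork:  flags&linux.CLONE_VFORK != 0,
    }
}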
+ if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. + if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + userns := creds.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. + if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace(userns) + } + + netns := t.NetworkNamespace() + if opts.NewNetworkNamespace { + netns = inet.NewNamespace(netns) + } + + // TODO(b/63601033): Implement CLONE_NEWNS. + mntnsVFS2 := t.mountNamespaceVFS2 + if mntnsVFS2 != nil { + mntnsVFS2.IncRef() + } + + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. 
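To make the return-value asymmetry concrete: only the parent sees Clone's ThreadID result, while the child resumes in userspace with the 0 written into its registers just below. A fork(2)-style caller would look roughly like this (editorial sketch, not the actual syscall wrapper).

package sketch

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/sentry/kernel"
)

// forkLike drives Clone with fork(2)-like options. The return statement is
// only ever executed by the parent; the child's user-visible return value is
// the 0 installed via tc.Arch.SetReturn(0).
func forkLike(t *kernel.Task) (uintptr, error) {
    opts := kernel.CloneOptions{
        SharingOptions: kernel.SharingOptions{
            NewAddressSpace:   true,
            NewSignalHandlers: true,
            NewThreadGroup:    true,
            TerminationSignal: linux.SIGCHLD,
        },
    }
    ntid, _, err := t.Clone(&opts)
    return uintptr(ntid), err
}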
+ tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + return 0, nil, syserror.EPERM + } + } + + var fsContext *FSContext + if opts.NewFSContext { + fsContext = t.fsContext.Fork() + } else { + fsContext = t.fsContext + fsContext.IncRef() + } + + var fdTable *FDTable + if opts.NewFiles { + fdTable = t.fdTable.Fork() + } else { + fdTable = t.fdTable + fdTable.IncRef() + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + + tg := t.tg + rseqAddr := usermem.Addr(0) + rseqSignature := uint32(0) + if opts.NewThreadGroup { + if tg.mounts != nil { + tg.mounts.IncRef() + } + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) + rseqAddr = t.rseqAddr + rseqSignature = t.rseqSignature + } + + cfg := &TaskConfig{ + Kernel: t.k, + ThreadGroup: tg, + SignalMask: t.SignalMask(), + TaskContext: tc, + FSContext: fsContext, + FDTable: fdTable, + Credentials: creds, + Niceness: t.Niceness(), + NetworkNamespace: netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + AbstractSocketNamespace: t.abstractSockets, + MountNamespaceVFS2: mntnsVFS2, + RSeqAddr: rseqAddr, + RSeqSignature: rseqSignature, + ContainerID: t.ContainerID(), + } + if opts.NewThreadGroup { + cfg.Parent = t + } else { + cfg.InheritParent = t + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != creds.UserNamespace { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. + tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + t.traceCloneEvent(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + if f := t.syscallFilters.Load(); f != nil { + copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) + nt.syscallFilters.Store(copiedFilters) + } + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. 
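ChildSetTID and ChildClearTID exist for the benefit of userspace threading libraries: the child's TID is published at ChildTID, and at exit the kernel writes 0 there and performs a futex wake, which is what makes a join operation possible. A rough sketch of the joining side follows (editorial; it assumes the SYS_FUTEX and FUTEX_WAIT identifiers exported by golang.org/x/sys/unix).

package sketch

import (
    "sync/atomic"
    "unsafe"

    "golang.org/x/sys/unix"
)

// joinThread blocks until the kernel clears *tidAddr and wakes the futex,
// which happens when a thread created with CLONE_CHILD_CLEARTID pointing at
// tidAddr exits.
func joinThread(tidAddr *int32) {
    for {
        tid := atomic.LoadInt32(tidAddr)
        if tid == 0 {
            return // already exited and cleared
        }
        // FUTEX_WAIT returns once *tidAddr no longer equals tid (or on a
        // spurious wake-up, hence the loop).
        unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(tidAddr)),
            uintptr(unix.FUTEX_WAIT), uintptr(tid), 0, 0, 0)
    }
}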
+ usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. +func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +// +stateify savable +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +// +stateify savable +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. 
This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + creds := t.Credentials() + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + // Need to reload creds, becaue t.SetUserNamespace() changed task credentials. + creds = t.Credentials() + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + // Can't defer unlock: DecRefs must occur without holding t.mu. + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + t.netns = inet.NewNamespace(t.netns) + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. + t.utsns = t.utsns.Clone(creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace(creds.UserNamespace) + } + var oldFDTable *FDTable + if opts.NewFiles { + oldFDTable = t.fdTable + t.fdTable = oldFDTable.Fork() + } + var oldFSContext *FSContext + if opts.NewFSContext { + oldFSContext = t.fsContext + t.fsContext = oldFSContext.Fork() + } + t.mu.Unlock() + if oldFDTable != nil { + oldFDTable.DecRef() + } + if oldFSContext != nil { + oldFSContext.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +// +// +stateify savable +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go new file mode 100644 index 000000000..9fa528384 --- /dev/null +++ b/pkg/sentry/kernel/task_context.go @@ -0,0 +1,169 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" +) + +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskContext is the subset of a task's data that is provided by the loader. +// +// +stateify savable +type TaskContext struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable `state:".(syscallTableInfo)"` +} + +// release releases all resources held by the TaskContext. release is called by +// the task when it execs into a new TaskContext or exits. +func (tc *TaskContext) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if tc.MemoryManager != nil { + tc.MemoryManager.DecUsers(context.Background()) + tc.MemoryManager = nil + } + tc.fu = nil +} + +// Fork returns a duplicate of tc. The copied TaskContext always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskContext shares an address space with the original; otherwise, the copied +// TaskContext has an independent address space that is initially a duplicate +// of the original's. +func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { + newTC := &TaskContext{ + Name: tc.Name, + Arch: tc.Arch.Fork(), + st: tc.st, + } + if shareAddressSpace { + newTC.MemoryManager = tc.MemoryManager + if newTC.MemoryManager != nil { + if !newTC.MemoryManager.IncUsers() { + // Shouldn't be possible since tc.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) + } + } + newTC.fu = tc.fu + } else { + newMM, err := tc.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newTC.MemoryManager = newMM + newTC.fu = k.futexes.Fork() + } + return newTC, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.tc.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.tc.MemoryManager +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) SyscallTable() *SyscallTable { + return t.tc.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. 
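The share-versus-copy logic in TaskContext.Fork leans on the MemoryManager's user count: IncUsers may fail once the count has reached zero, because a released object must not be revived. A minimal sketch of that pattern (editorial; the type and method names are illustrative):

package sketch

import "sync/atomic"

// usersCounted models a resource whose user count, once it drops to zero,
// can never grow again. A failed IncUsers therefore means the caller held a
// stale reference, which is why TaskContext.Fork treats it as a bug.
type usersCounted struct {
    users int64 // starts at 1 for the creator
}

func (u *usersCounted) IncUsers() bool {
    for {
        n := atomic.LoadInt64(&u.users)
        if n == 0 {
            return false // already released; cannot be revived
        }
        if atomic.CompareAndSwapInt64(&u.users, n, n+1) {
            return true
        }
    }
}

func (u *usersCounted) DecUsers(release func()) {
    if atomic.AddInt64(&u.users, -1) == 0 {
        release()
    }
}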
+func (t *Task) Stack() *arch.Stack { + return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} +} + +// LoadTaskImage loads a specified file into a new TaskContext. +// +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. + if args.File != nil { + args.Filename = args.File.PathnameWithDeleted(ctx) + } + + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + defer m.DecUsers(ctx) + args.MemoryManager = m + + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. This means that the ELF binary does not match + // the architecture. + return nil, errNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskContext{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: k.futexes.Fork(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go new file mode 100644 index 000000000..9b69f3cbe --- /dev/null +++ b/pkg/sentry/kernel/task_exec.go @@ -0,0 +1,277 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the machinery behind the execve() syscall. In brief, a +// thread executes an execve() by killing all other threads in its thread +// group, assuming the leader's identity, and then switching process images. +// +// This design is effectively mandated by Linux. From ptrace(2): +// +// """ +// execve(2) under ptrace +// When one thread in a multithreaded process calls execve(2), the +// kernel destroys all other threads in the process, and resets the +// thread ID of the execing thread to the thread group ID (process ID). +// (Or, to put things another way, when a multithreaded process does an +// execve(2), at completion of the call, it appears as though the +// execve(2) occurred in the thread group leader, regardless of which +// thread did the execve(2).) This resetting of the thread ID looks +// very confusing to tracers: +// +// * All other threads stop in PTRACE_EVENT_EXIT stop, if the +// PTRACE_O_TRACEEXIT option was turned on. Then all other threads +// except the thread group leader report death as if they exited via +// _exit(2) with exit code 0. +// +// * The execing tracee changes its thread ID while it is in the +// execve(2). (Remember, under ptrace, the "pid" returned from +// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.) +// That is, the tracee's thread ID is reset to be the same as its +// process ID, which is the same as the thread group leader's thread +// ID. 
+// +// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC +// option was turned on. +// +// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop +// by this time, it appears to the tracer that the dead thread leader +// "reappears from nowhere". (Note: the thread group leader does not +// report death via WIFEXITED(status) until there is at least one +// other live thread. This eliminates the possibility that the +// tracer will see it dying and then reappearing.) If the thread +// group leader was still alive, for the tracer this may look as if +// thread group leader returns from a different system call than it +// entered, or even "returned from a system call even though it was +// not in any system call". If the thread group leader was not +// traced (or was traced by a different tracer), then during +// execve(2) it will appear as if it has become a tracee of the +// tracer of the execing tracee. +// +// All of the above effects are the artifacts of the thread ID change in +// the tracee. +// """ + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// execStop is a TaskStop that a task sets on itself when it wants to execve +// and is waiting for the other tasks in its thread group to exit first. +// +// +stateify savable +type execStop struct{} + +// Killable implements TaskStop.Killable. +func (*execStop) Killable() bool { return true } + +// Execve implements the execve(2) syscall by killing all other tasks in its +// thread group and switching to newTC. Execve always takes ownership of newTC. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + if t.tg.exiting || t.tg.execing != nil { + // We lost to a racing group-exit, kill, or exec from another thread + // and should just exit. + newTC.release() + return nil, syserror.EINTR + } + + // Cancel any racing group stops. + t.tg.endGroupStopLocked(false) + + // If the task has any siblings, they have to exit before the exec can + // continue. + t.tg.execing = t + if t.tg.tasks.Front() != t.tg.tasks.Back() { + // "[All] other threads except the thread group leader report death as + // if they exited via _exit(2) with exit code 0." - ptrace(2) + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if t != sibling { + sibling.killLocked() + } + } + // The last sibling to exit will wake t. + t.beginInternalStopLocked((*execStop)(nil)) + } + + return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil +} + +// The runSyscallAfterExecStop state continues execve(2) after all siblings of +// a thread in the execve syscall have exited. +// +// +stateify savable +type runSyscallAfterExecStop struct { + tc *TaskContext +} + +func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.traceExecEvent(r.tc) + t.tg.pidns.owner.mu.Lock() + t.tg.execing = nil + if t.killed() { + t.tg.pidns.owner.mu.Unlock() + r.tc.release() + return (*runInterrupt)(nil) + } + // We are the thread group leader now. Save our old thread ID for + // PTRACE_EVENT_EXEC. 
This is racy in that if a tracer attaches after this + // point it will get a PID of 0, but this is consistent with Linux. + oldTID := ThreadID(0) + if tracer := t.Tracer(); tracer != nil { + oldTID = tracer.tg.pidns.tids[t] + } + t.promoteLocked() + // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle + // this first since POSIX timers are protected by the signal mutex, which + // we're about to change. Note that we have to stop and destroy timers + // without holding any mutexes to avoid circular lock ordering. + var its []*IntervalTimer + t.tg.signalHandlers.mu.Lock() + for _, it := range t.tg.timers { + its = append(its, it) + } + t.tg.timers = make(map[linux.TimerID]*IntervalTimer) + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } + t.tg.pidns.owner.mu.Lock() + // "During an execve(2), the dispositions of handled signals are reset to + // the default; the dispositions of ignored signals are left unchanged. ... + // [The] signal mask is preserved across execve(2). ... [The] pending + // signal set is preserved across an execve(2)." - signal(7) + // + // Details: + // + // - If the thread group is sharing its signal handlers with another thread + // group via CLONE_SIGHAND, execve forces the signal handlers to be copied + // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal + // handlers, so we always make a copy. + // + // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, + // restorer (if present), and mask are always reset. (See Linux's + // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) + t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() + t.endStopCond.L = &t.tg.signalHandlers.mu + // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) + t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable} + // "The termination signal is reset to SIGCHLD (see clone(2))." + t.tg.terminationSignal = linux.SIGCHLD + // execed indicates that the process can no longer join a process group + // in some scenarios (namely, the parent call setpgid(2) on the child). + // See the JoinProcessGroup function in sessions.go for more context. + t.tg.execed = true + // Maximum RSS is preserved across execve(2). + t.updateRSSLocked() + // Restartable sequence state is discarded. + t.rseqPreempted = false + t.rseqCPU = -1 + t.rseqAddr = 0 + t.rseqSignature = 0 + t.oldRSeqCPUAddr = 0 + t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) + t.tg.pidns.owner.mu.Unlock() + + oldFDTable := t.fdTable + t.fdTable = t.fdTable.Fork() + oldFDTable.DecRef() + + // Remove FDs with the CloseOnExec flag set. + t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + return flags.CloseOnExec + }) + + // NOTE(b/30815691): We currently do not implement privileged + // executables (set-user/group-ID bits and file capabilities). This + // allows us to unconditionally enable user dumpability on the new mm. + // See fs/exec.c:setup_new_exec. + r.tc.MemoryManager.SetDumpability(mm.UserDumpable) + + // Switch to the new process. + t.MemoryManager().Deactivate() + t.mu.Lock() + // Update credentials to reflect the execve. This should precede switching + // MMs to ensure that dumpability has been reset first, if needed. + t.updateCredsForExecLocked() + t.tc.release() + t.tc = *r.tc + t.mu.Unlock() + t.unstopVforkParent() + // NOTE(b/30316266): All locks must be dropped prior to calling Activate. 
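The RemoveIf call above is the exec-time enforcement of the close-on-exec flag. Stripped of the dual fs/vfs plumbing, the filtering amounts to the following (editorial sketch; fdFlags is a local mirror of the kernel's FDFlags). Note that the table is forked before filtering, so the execing task ends up with a private descriptor table.

package sketch

// fdFlags mirrors the per-descriptor flags kept by the FD table; only
// CloseOnExec matters here.
type fdFlags struct {
    CloseOnExec bool
}

// dropCloseOnExec removes every descriptor whose close-on-exec flag is set,
// which is the effect execve has on the (freshly forked) FD table.
func dropCloseOnExec(fds map[int]fdFlags) {
    for fd, flags := range fds {
        if flags.CloseOnExec {
            delete(fds, fd)
        }
    }
}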
+ t.MemoryManager().Activate(t) + + t.ptraceExec(oldTID) + return (*runSyscallExit)(nil) +} + +// promoteLocked makes t the leader of its thread group. If t is already the +// thread group leader, promoteLocked is a no-op. +// +// Preconditions: All other tasks in t's thread group, including the existing +// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must +// be locked for writing. +func (t *Task) promoteLocked() { + oldLeader := t.tg.leader + if t == oldLeader { + return + } + // Swap the leader's TIDs with the execing task's. The latter will be + // released when the old leader is reaped below. + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] + ns.tids[oldLeader] = oldTID + ns.tids[t] = leaderTID + ns.tasks[oldTID] = oldLeader + ns.tasks[leaderTID] = t + // Neither the ThreadGroup nor TGID change, so no need to + // update ns.tgids. + } + + // Inherit the old leader's start time. + oldStartTime := oldLeader.StartTime() + t.mu.Lock() + t.startTime = oldStartTime + t.mu.Unlock() + + t.tg.leader = t + t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) + t.updateInfoLocked() + // Reap the original leader. If it has a tracer, detach it instead of + // waiting for it to acknowledge the original leader's death. + oldLeader.exitParentNotified = true + oldLeader.exitParentAcked = true + if tracer := oldLeader.Tracer(); tracer != nil { + delete(tracer.ptraceTracees, oldLeader) + oldLeader.forgetTracerLocked() + // Notify the tracer that it will no longer be receiving these events + // from the tracee. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) + } + oldLeader.exitNotifyLocked(false) +} diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go new file mode 100644 index 000000000..c4ade6e8e --- /dev/null +++ b/pkg/sentry/kernel/task_exit.go @@ -0,0 +1,1167 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the task exit cycle: +// +// - Tasks are asynchronously requested to exit with Task.Kill. +// +// - When able, the task goroutine enters the exit path starting from state +// runExit. +// +// - Other tasks observe completed exits with Task.Wait (which implements the +// wait*() family of syscalls). + +import ( + "errors" + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// An ExitStatus is a value communicated from an exiting task or thread group +// to the party that reaps it. +// +// +stateify savable +type ExitStatus struct { + // Code is the numeric value passed to the call to exit or exit_group that + // caused the exit. If the exit was not caused by such a call, Code is 0. + Code int + + // Signo is the signal that caused the exit. 
If the exit was not caused by + // a signal, Signo is 0. + Signo int +} + +// Signaled returns true if the ExitStatus indicates that the exiting task or +// thread group was killed by a signal. +func (es ExitStatus) Signaled() bool { + return es.Signo != 0 +} + +// Status returns the numeric representation of the ExitStatus returned by e.g. +// the wait4() system call. +func (es ExitStatus) Status() uint32 { + return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) +} + +// ShellExitCode returns the numeric exit code that Bash would return for an +// exit status of es. +func (es ExitStatus) ShellExitCode() int { + if es.Signaled() { + return 128 + es.Signo + } + return es.Code +} + +// TaskExitState represents a step in the task exit path. +// +// "Exiting" and "exited" are often ambiguous; prefer to name specific states. +type TaskExitState int + +const ( + // TaskExitNone indicates that the task has not begun exiting. + TaskExitNone TaskExitState = iota + + // TaskExitInitiated indicates that the task goroutine has entered the exit + // path, and the task is no longer eligible to participate in group stops + // or group signal handling. TaskExitInitiated is analogous to Linux's + // PF_EXITING. + TaskExitInitiated + + // TaskExitZombie indicates that the task has released its resources, and + // the task no longer prevents a sibling thread from completing execve. + TaskExitZombie + + // TaskExitDead indicates that the task's thread IDs have been released, + // and the task no longer prevents its thread group leader from being + // reaped. ("Reaping" refers to the transitioning of a task from + // TaskExitZombie to TaskExitDead.) + TaskExitDead +) + +// String implements fmt.Stringer. +func (t TaskExitState) String() string { + switch t { + case TaskExitNone: + return "TaskExitNone" + case TaskExitInitiated: + return "TaskExitInitiated" + case TaskExitZombie: + return "TaskExitZombie" + case TaskExitDead: + return "TaskExitDead" + default: + return strconv.Itoa(int(t)) + } +} + +// killLocked marks t as killed by enqueueing a SIGKILL, without causing the +// thread-group-affecting side effects SIGKILL usually has. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) killLocked() { + // Clear killable stops. + if t.stop != nil && t.stop.Killable() { + t.endInternalStopLocked() + } + t.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + // Linux just sets SIGKILL in the pending signal bitmask without + // enqueueing an actual siginfo, such that + // kernel/signal.c:collect_signal() initializes si_code to SI_USER. + Code: arch.SignalInfoUser, + }, nil) + t.interrupt() +} + +// killed returns true if t has a SIGKILL pending. killed is analogous to +// Linux's fatal_signal_pending(). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) killed() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.killedLocked() +} + +func (t *Task) killedLocked() bool { + return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 +} + +// PrepareExit indicates an exit with status es. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) PrepareExit(es ExitStatus) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.exitStatus = es +} + +// PrepareGroupExit indicates a group exit with status es to t's thread group. 
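The Status() packing above is the classic wait-status layout, so the Go standard library's syscall.WaitStatus decodes it directly. A small self-contained check (editorial, not part of this change):

package main

import (
    "fmt"
    "syscall"
)

func main() {
    // ExitStatus{Code: 2}: (2&0xff)<<8 | 0.
    exited := syscall.WaitStatus(uint32(2&0xff) << 8)
    fmt.Println(exited.Exited(), exited.ExitStatus()) // true 2

    // ExitStatus{Signo: 9} (killed by SIGKILL): 0<<8 | 9.
    killed := syscall.WaitStatus(uint32(9 & 0xff))
    fmt.Println(killed.Signaled(), killed.Signal()) // true killed

    // ShellExitCode for the signaled case would be 128+9 = 137, matching
    // shell convention.
}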
+// +// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it +// does not tail-call do_exit(), except that it *does* set Task.exitStatus. +// (Linux does not do so until within do_exit(), since it reuses exit_code for +// ptrace.) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) PrepareGroupExit(es ExitStatus) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.tg.exiting || t.tg.execing != nil { + // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e. + // this "group exit" is being executed by the killed sibling of an + // execing task, then Task.Execve never set t.tg.exitStatus, so it's + // still the zero value. This is consistent with Linux, both in intent + // ("all other threads ... report death as if they exited via _exit(2) + // with exit code 0" - ptrace(2), "execve under ptrace") and in + // implementation (compare fs/exec.c:de_thread() => + // kernel/signal.c:zap_other_threads() and + // kernel/exit.c:do_group_exit() => + // include/linux/sched.h:signal_group_exit()). + t.exitStatus = t.tg.exitStatus + return + } + t.tg.exiting = true + t.tg.exitStatus = es + t.exitStatus = es + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if sibling != t { + sibling.killLocked() + } + } +} + +// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill does not wait for tasks to exit. +// +// Kill has no analogue in Linux; it's provided for save/restore only. +func (ts *TaskSet) Kill(es ExitStatus) { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.Root.exiting = true + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + if !t.tg.exiting { + t.tg.exiting = true + t.tg.exitStatus = es + } + t.killLocked() + t.tg.signalHandlers.mu.Unlock() + } +} + +// advanceExitStateLocked checks that t's current exit state is oldExit, then +// sets it to newExit. If t's current exit state is not oldExit, +// advanceExitStateLocked panics. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { + if t.exitState != oldExit { + panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) + } + t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) + t.exitState = newExit +} + +// runExit is the entry point into the task exit path. +// +// +stateify savable +type runExit struct{} + +func (*runExit) execute(t *Task) taskRunState { + t.ptraceExit() + return (*runExitMain)(nil) +} + +// +stateify savable +type runExitMain struct{} + +func (*runExitMain) execute(t *Task) taskRunState { + t.traceExitEvent() + lastExiter := t.exitThreadGroup() + + // If the task has a cleartid, and the thread group wasn't killed by a + // signal, handle that before releasing the MM. + if t.cleartid != 0 { + t.tg.signalHandlers.mu.Lock() + signaled := t.tg.exiting && t.tg.exitStatus.Signaled() + t.tg.signalHandlers.mu.Unlock() + if !signaled { + if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) + } + // If the CopyOut fails, there's nothing we can do. + } + } + + // Deactivate the address space and update max RSS before releasing the + // task's MM. + t.Deactivate() + t.tg.pidns.owner.mu.Lock() + t.updateRSSLocked() + t.tg.pidns.owner.mu.Unlock() + t.mu.Lock() + t.tc.release() + t.mu.Unlock() + + // Releasing the MM unblocks a blocked CLONE_VFORK parent. 
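The vfork handshake completed by this exit path (and by the exec path) can be pictured as a one-shot gate: the parent parks in vforkStop after clone(CLONE_VFORK) and is released the first time the child stops using the shared TaskContext. A channel-based analogy (editorial; this is not how the sentry implements it):

package sketch

// vforkGate is a one-shot gate. The parent waits on it after spawning a
// vfork child; the child releases it exactly once, either when it execve()s
// or when it exits.
type vforkGate struct {
    released chan struct{}
}

func newVforkGate() *vforkGate {
    return &vforkGate{released: make(chan struct{})}
}

// parentWait corresponds to the parent entering vforkStop.
func (g *vforkGate) parentWait() { <-g.released }

// childRelease corresponds to unstopVforkParent; it must be called at most
// once.
func (g *vforkGate) childRelease() { close(g.released) }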
+ t.unstopVforkParent() + + t.fsContext.DecRef() + t.fdTable.DecRef() + + t.mu.Lock() + if t.mountNamespaceVFS2 != nil { + t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2 = nil + } + t.mu.Unlock() + + // If this is the last task to exit from the thread group, release the + // thread group's resources. + if lastExiter { + t.tg.release() + } + + // Detach tracees. + t.exitPtrace() + + // Reparent the task's children. + t.exitChildren() + + // Don't tail-call runExitNotify, as exitChildren may have initiated a stop + // to wait for a PID namespace to die. + return (*runExitNotify)(nil) +} + +// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread +// group that it is no longer eligible to participate in group activities. It +// returns true if t is the last task in its thread group to call +// exitThreadGroup. +func (t *Task) exitThreadGroup() bool { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + // Can't defer unlock: see below. + + t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated) + t.tg.activeTasks-- + last := t.tg.activeTasks == 0 + + // Ensure that someone will handle the signals we can't. + t.setSignalMaskLocked(^linux.SignalSet(0)) + + // Check if this task's exit interacts with an initiated group stop. + if !t.groupStopPending { + t.tg.signalHandlers.mu.Unlock() + return last + } + t.groupStopPending = false + sig := t.tg.groupStopSignal + notifyParent := t.participateGroupStopLocked() + // signalStop must be called with t's signal mutex unlocked. + t.tg.signalHandlers.mu.Unlock() + if notifyParent && t.tg.leader.parent != nil { + t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + return last +} + +func (t *Task) exitChildren() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + newParent := t.findReparentTargetLocked() + if newParent == nil { + // "If the init process of a PID namespace terminates, the kernel + // terminates all of the processes in the namespace via a SIGKILL + // signal." - pid_namespaces(7) + t.Debugf("Init process terminating, killing namespace") + t.tg.pidns.exiting = true + for other := range t.tg.pidns.tgids { + if other == t.tg { + continue + } + other.signalHandlers.mu.Lock() + other.leader.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, true /* group */) + other.signalHandlers.mu.Unlock() + } + // TODO(b/37722272): The init process waits for all processes in the + // namespace to exit before completing its own exit + // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all + // other tasks in the namespace are dead, except possibly for this + // thread group's leader (which can't be reaped until this task exits). + } + // This is correct even if newParent is nil (it ensures that children don't + // wait for a parent to reap them.) + for c := range t.children { + if sig := c.ParentDeathSignal(); sig != 0 { + siginfo := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + siginfo.SetPid(int32(c.tg.pidns.tids[t])) + siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) + c.tg.signalHandlers.mu.Lock() + c.sendSignalLocked(siginfo, true /* group */) + c.tg.signalHandlers.mu.Unlock() + } + c.reparentLocked(newParent) + if newParent != nil { + newParent.children[c] = struct{}{} + } + } +} + +// findReparentTargetLocked returns the task to which t's children should be +// reparented. 
If no such task exists, findNewParentLocked returns nil. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) findReparentTargetLocked() *Task { + // Reparent to any sibling in the same thread group that hasn't begun + // exiting. + if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { + return t2 + } + // "A child process that is orphaned within the namespace will be + // reparented to [the init process for the namespace] ..." - + // pid_namespaces(7) + if init := t.tg.pidns.tasks[InitTID]; init != nil { + return init.tg.anyNonExitingTaskLocked() + } + return nil +} + +func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.exitState == TaskExitNone { + return t + } + } + return nil +} + +// reparentLocked changes t's parent. The new parent may be nil. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) reparentLocked(parent *Task) { + oldParent := t.parent + t.parent = parent + // If a thread group leader's parent changes, reset the thread group's + // termination signal to SIGCHLD and re-check exit notification. (Compare + // kernel/exit.c:reparent_leader().) + if t != t.tg.leader { + return + } + if oldParent == nil && parent == nil { + return + } + if oldParent != nil && parent != nil && oldParent.tg == parent.tg { + return + } + t.tg.terminationSignal = linux.SIGCHLD + if t.exitParentNotified && !t.exitParentAcked { + t.exitParentNotified = false + t.exitNotifyLocked(false) + } +} + +// When a task exits, other tasks in the system, notably the task's parent and +// ptracer, may want to be notified. The exit notification system ensures that +// interested tasks receive signals and/or are woken from blocking calls to +// wait*() syscalls; these notifications must be resolved before exiting tasks +// can be reaped and disappear from the system. +// +// Each task may have a parent task and/or a tracer task. If both a parent and +// a tracer exist, they may be the same task, different tasks in the same +// thread group, or tasks in different thread groups. (In the last case, Linux +// refers to the task as being ptrace-reparented due to an implementation +// detail; we avoid this terminology to avoid confusion.) +// +// A thread group is *empty* if all non-leader tasks in the thread group are +// dead, and the leader is either a zombie or dead. The exit of a thread group +// leader is never waitable - by either the parent or tracer - until the thread +// group is empty. +// +// There are a few ways for an exit notification to be resolved: +// +// - The exit notification may be acknowledged by a call to Task.Wait with +// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). +// +// - If the notified party is the parent, and the parent thread group is not +// also the tracer thread group, and the notification signal is SIGCHLD, the +// parent may explicitly ignore the notification (see quote in exitNotify). +// Note that it's possible for the notified party to ignore the signal in other +// cases, but the notification is only resolved under the above conditions. +// (Actually, there is one exception; see the last paragraph of the "leader, +// has tracer, tracer thread group is parent thread group" case below.) +// +// - If the notified party is the parent, and the parent does not exist, the +// notification is resolved as if ignored. (This is only possible in the +// sentry. 
In Linux, the only task / thread group without a parent is global +// init, and killing global init causes a kernel panic.) +// +// - If the notified party is a tracer, the tracer may detach the traced task. +// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) +// +// In addition, if the notified party is the parent, the parent may exit and +// cause the notifying task to be reparented to another thread group. This does +// not resolve the notification; instead, the notification must be resent to +// the new parent. +// +// The series of notifications generated for a given task's exit depend on +// whether it is a thread group leader; whether the task is ptraced; and, if +// so, whether the tracer thread group is the same as the parent thread group. +// +// - Non-leader, no tracer: No notification is generated; the task is reaped +// immediately. +// +// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer +// notification is resolved (by waiting or detaching), the task is reaped. (For +// non-leaders, whether the tracer and parent thread groups are the same is +// irrelevant.) +// +// - Leader, no tracer: The task remains a zombie, with no notification sent, +// until all other tasks in the thread group are dead. (In Linux terms, this +// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks +// are removed from their thread_group list in kernel/exit.c:release_task() => +// __exit_signal() => __unhash_process().) Then the thread group's termination +// signal is sent to the parent. When the parent notification is resolved (by +// waiting or ignoring), the task is reaped. +// +// - Leader, has tracer, tracer thread group is not parent thread group: +// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by +// waiting or detaching), and all other tasks in the thread group are dead, the +// thread group's termination signal is sent to the parent. (Note that the +// tracer cannot resolve the exit notification by waiting until the thread +// group is empty.) When the parent notification is resolved, the task is +// reaped. +// +// - Leader, has tracer, tracer thread group is parent thread group: +// +// If all other tasks in the thread group are dead, the thread group's +// termination signal is sent to the parent. At this point, the notification +// can only be resolved by waiting. If the parent detaches from the task as a +// tracer, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// If at least one task in the thread group is not dead, SIGCHLD is sent to the +// parent. At this point, the notification cannot be resolved at all; once the +// thread group becomes empty, it can be resolved only by waiting. If the +// parent detaches from the task as a tracer before all remaining tasks die, +// then exit notification proceeds as in the case where the leader never had a +// tracer. If the parent detaches from the task as a tracer after all remaining +// tasks die, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. 
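The cases above condense into a small decision table. The sketch below is an editorial distillation, deliberately simplified: it ignores the empty-versus-non-empty thread group timing in the last case and records only who is notified once a task reaches TaskExitZombie.

package sketch

// exitNotifications summarizes which notifications are generated for an
// exiting task, per the cases described in the surrounding comment.
type exitNotifications struct {
    tracerSIGCHLD     bool // SIGCHLD to the tracer thread group
    parentTermSignal  bool // termination signal to the parent once the group is empty
    reapedImmediately bool // nothing to resolve; the task is reaped right away
}

func notificationsFor(leader, traced, tracerIsParent bool) exitNotifications {
    switch {
    case !leader && !traced:
        return exitNotifications{reapedImmediately: true}
    case !leader && traced:
        return exitNotifications{tracerSIGCHLD: true}
    case leader && !traced:
        return exitNotifications{parentTermSignal: true}
    case leader && !tracerIsParent:
        return exitNotifications{tracerSIGCHLD: true, parentTermSignal: true}
    default: // leader whose tracer is in the parent thread group
        return exitNotifications{parentTermSignal: true}
    }
}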
+// +// In both of the above cases, when the parent detaches from the task as a +// tracer while the thread group is empty, whether or not the parent resolves +// the notification by ignoring it is based on the parent's SIGCHLD signal +// action, whether or not the thread group's termination signal is SIGCHLD +// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). +// +// There is one final wrinkle: A leader can become a non-leader due to a +// sibling execve. In this case, the execing thread detaches the leader's +// tracer (if one exists) and reaps the leader immediately. In Linux, this is +// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). + +// +stateify savable +type runExitNotify struct{} + +func (*runExitNotify) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie) + t.tg.liveTasks-- + // Check if this completes a sibling's execve. + if t.tg.execing != nil && t.tg.liveTasks == 1 { + // execing blocks the addition of new tasks to the thread group, so + // the sole living task must be the execing one. + e := t.tg.execing + e.tg.signalHandlers.mu.Lock() + if _, ok := e.stop.(*execStop); ok { + e.endInternalStopLocked() + } + e.tg.signalHandlers.mu.Unlock() + } + t.exitNotifyLocked(false) + // The task goroutine will now exit. + return nil +} + +// exitNotifyLocked is called after changes to t's state that affect exit +// notification. +// +// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace; +// thanks to Linux's haphazard implementation of this functionality, such cases +// determine whether parent notifications are ignored based on the parent's +// handling of SIGCHLD, regardless of what the exited task's thread group's +// termination signal is. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { + if t.exitState != TaskExitZombie { + return + } + if !t.exitTracerNotified { + t.exitTracerNotified = true + tracer := t.Tracer() + if tracer == nil { + t.exitTracerAcked = true + } else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg { + // Don't set exitParentNotified if t is non-leader, even if the + // tracer is in the parent thread group, so that if the parent + // detaches the following call to exitNotifyLocked passes through + // the !exitParentNotified case below and causes t to be reaped + // immediately. + // + // Tracer notification doesn't care about about + // SIG_IGN/SA_NOCLDWAIT. + tracer.tg.signalHandlers.mu.Lock() + tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */) + tracer.tg.signalHandlers.mu.Unlock() + // Wake EventTraceeStop waiters as well since this task will never + // ptrace-stop again. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop) + } else { + // t is a leader and the tracer is in the parent thread group. + t.exitParentNotified = true + sig := linux.SIGCHLD + if t.tg.tasksCount == 1 { + sig = t.tg.terminationSignal + } + // This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either + // (in Linux, the check in do_notify_parent() is gated by + // !tsk->ptrace.) + t.parent.tg.signalHandlers.mu.Lock() + t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */) + t.parent.tg.signalHandlers.mu.Unlock() + // See below for rationale for this event mask. 
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + if t.exitTracerAcked && !t.exitParentNotified { + if t != t.tg.leader { + t.exitParentNotified = true + t.exitParentAcked = true + } else if t.tg.tasksCount == 1 { + t.exitParentNotified = true + if t.parent == nil { + t.exitParentAcked = true + } else { + // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is + // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see + // sigaction(2)), then children that terminate do not become + // zombies and a call to wait() or waitpid() will block until all + // children have terminated, and then fail with errno set to + // ECHILD. (The original POSIX standard left the behavior of + // setting SIGCHLD to SIG_IGN unspecified. Note that even though + // the default disposition of SIGCHLD is "ignore", explicitly + // setting the disposition to SIG_IGN results in different + // treatment of zombie process children.) Linux 2.6 conforms to + // this specification." - wait(2) + // + // Some undocumented Linux-specific details: + // + // - All of the above is ignored if the termination signal isn't + // SIGCHLD. + // + // - SA_NOCLDWAIT causes the leader to be immediately reaped, but + // does not suppress the SIGCHLD. + signalParent := t.tg.terminationSignal.IsValid() + t.parent.tg.signalHandlers.mu.Lock() + if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { + if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { + if act.Handler == arch.SignalActIgnore { + t.exitParentAcked = true + signalParent = false + } else if act.Flags&arch.SignalFlagNoCldWait != 0 { + t.exitParentAcked = true + } + } + } + if signalParent { + t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */) + } + t.parent.tg.signalHandlers.mu.Unlock() + // If a task in the parent was waiting for a child group stop + // or continue, it needs to be notified of the exit, because + // there may be no remaining eligible tasks (so that wait + // should return ECHILD). + t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + } + if t.exitTracerAcked && t.exitParentAcked { + t.advanceExitStateLocked(TaskExitZombie, TaskExitDead) + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid := ns.tids[t] + delete(ns.tasks, tid) + delete(ns.tids, t) + if t == t.tg.leader { + delete(ns.tgids, t.tg) + } + } + t.tg.exitedCPUStats.Accumulate(t.CPUStats()) + t.tg.ioUsage.Accumulate(t.ioUsage) + t.tg.signalHandlers.mu.Lock() + t.tg.tasks.Remove(t) + t.tg.tasksCount-- + tc := t.tg.tasksCount + t.tg.signalHandlers.mu.Unlock() + if tc == 1 && t != t.tg.leader { + // Our fromPtraceDetach doesn't matter here (in Linux terms, this + // is via a call to release_task()). + t.tg.leader.exitNotifyLocked(false) + } else if tc == 0 { + t.tg.processGroup.decRefWithParent(t.tg.parentPG()) + } + if t.parent != nil { + delete(t.parent.children, t) + t.parent = nil + } + } +} + +// Preconditions: The TaskSet mutex must be locked. 
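The SIG_IGN/SA_NOCLDWAIT handling above decides two independent things: whether the parent is signaled at all, and whether the zombie is reaped automatically. Distilled (editorial sketch; the real code also folds in the ptrace-detach and invalid-termination-signal cases):

package sketch

// parentExitDisposition reports whether the parent should be signaled and
// whether the child is auto-reaped, given the parent's SIGCHLD action.
func parentExitDisposition(sigchldIgnored, noCldWait, termSigIsSIGCHLD bool) (signalParent, autoReap bool) {
    if !termSigIsSIGCHLD {
        // The special casing only applies when the termination signal is
        // SIGCHLD (or the notification comes from a ptrace detach).
        return true, false
    }
    if sigchldIgnored {
        // Explicit SIG_IGN: no signal, and the child never becomes a
        // waitable zombie.
        return false, true
    }
    if noCldWait {
        // SA_NOCLDWAIT: SIGCHLD is still delivered, but the child is
        // reaped without a wait().
        return true, true
    }
    return true, false
}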
+func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + } + info.SetPid(int32(receiver.tg.pidns.tids[t])) + info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + if t.exitStatus.Signaled() { + info.Code = arch.CLD_KILLED + info.SetStatus(int32(t.exitStatus.Signo)) + } else { + info.Code = arch.CLD_EXITED + info.SetStatus(int32(t.exitStatus.Code)) + } + // TODO(b/72102453): Set utime, stime. + return info +} + +// ExitStatus returns t's exit status, which is only guaranteed to be +// meaningful if t.ExitState() != TaskExitNone. +func (t *Task) ExitStatus() ExitStatus { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.exitStatus +} + +// ExitStatus returns the exit status that would be returned by a consuming +// wait*() on tg. +func (tg *ThreadGroup) ExitStatus() ExitStatus { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting { + return tg.exitStatus + } + return tg.leader.exitStatus +} + +// TerminationSignal returns the thread group's termination signal. +func (tg *ThreadGroup) TerminationSignal() linux.Signal { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.terminationSignal +} + +// Task events that can be waited for. +const ( + // EventExit represents an exit notification generated for a child thread + // group leader or a tracee under the conditions specified in the comment + // above runExitNotify. + EventExit waiter.EventMask = 1 << iota + + // EventChildGroupStop occurs when a child thread group completes a group + // stop (i.e. all tasks in the child thread group have entered a stopped + // state as a result of a group stop). + EventChildGroupStop + + // EventTraceeStop occurs when a task that is ptraced by a task in the + // notified thread group enters a ptrace stop (see ptrace(2)). + EventTraceeStop + + // EventGroupContinue occurs when a child thread group, or a thread group + // whose leader is ptraced by a task in the notified thread group, that had + // initiated or completed a group stop leaves the group stop, due to the + // child thread group or any task in the child thread group being sent + // SIGCONT. + EventGroupContinue +) + +// WaitOptions controls the behavior of Task.Wait. +type WaitOptions struct { + // If SpecificTID is non-zero, only events from the task with thread ID + // SpecificTID are eligible to be waited for. SpecificTID is resolved in + // the PID namespace of the waiter (the method receiver of Task.Wait). If + // no such task exists, or that task would not otherwise be eligible to be + // waited for by the waiting task, then there are no waitable tasks and + // Wait will return ECHILD. + SpecificTID ThreadID + + // If SpecificPGID is non-zero, only events from ThreadGroups with a + // matching ProcessGroupID are eligible to be waited for. (Same + // constraints as SpecificTID apply.) + SpecificPGID ProcessGroupID + + // Terminology note: Per waitpid(2), "a clone child is one which delivers + // no signal, or a signal other than SIGCHLD to its parent upon + // termination." In Linux, termination signal is technically a per-task + // property rather than a per-thread-group property. 
However, clone() + // forces no termination signal for tasks created with CLONE_THREAD, and + // execve() resets the termination signal to SIGCHLD, so all + // non-group-leader threads have no termination signal and are therefore + // "clone tasks". + + // If NonCloneTasks is true, events from non-clone tasks are eligible to be + // waited for. + NonCloneTasks bool + + // If CloneTasks is true, events from clone tasks are eligible to be waited + // for. + CloneTasks bool + + // If SiblingChildren is true, events from children tasks of any task + // in the thread group of the waiter are eligible to be waited for. + SiblingChildren bool + + // Events is a bitwise combination of the events defined above that specify + // what events are of interest to the call to Wait. + Events waiter.EventMask + + // If ConsumeEvent is true, the Wait should consume the event such that it + // cannot be returned by a future Wait. Note that if a task exit is + // consumed in this way, in most cases the task will be reaped. + ConsumeEvent bool + + // If BlockInterruptErr is not nil, Wait will block until either an event + // is available or there are no tasks that could produce a waitable event; + // if that blocking is interrupted, Wait returns BlockInterruptErr. If + // BlockInterruptErr is nil, Wait will not block. + BlockInterruptErr error +} + +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool { + if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { + return false + } + if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { + return false + } + // Tracees are always eligible. + if tracee { + return true + } + if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { + return o.NonCloneTasks + } + return o.CloneTasks +} + +// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. +// waitpid(WNOHANG)) that find no waitable events, but determine that waitable +// events may exist in the future. (In contrast, if a non-blocking or blocking +// Wait determines that there are no tasks that can produce a waitable event, +// Task.Wait returns ECHILD.) +var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") + +// WaitResult contains information about a waited-for event. +type WaitResult struct { + // Task is the task that reported the event. + Task *Task + + // TID is the thread ID of Task in the PID namespace of the task that + // called Wait (that is, the method receiver of the call to Task.Wait). TID + // is provided because consuming exit waits cause the thread ID to be + // deallocated. + TID ThreadID + + // UID is the real UID of Task in the user namespace of the task that + // called Wait. + UID auth.UID + + // Event is exactly one of the events defined above. + Event waiter.EventMask + + // Status is the numeric status associated with the event. + Status uint32 +} + +// Wait waits for an event from a thread group that is a child of t's thread +// group, or a task in such a thread group, or a task that is ptraced by t, +// subject to the options specified in opts. 
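To show how these fields compose, the following hypothetical helper (not part of this change, assumed to sit in this package, and only handling the pid > 0 case of a wait4-style call) sketches how classic wait flags might be translated into a WaitOptions. The lowercase flag constants are the standard <sys/wait.h> values; how the blocking error is chosen is left to the caller.

// Standard wait flag values from <sys/wait.h>; lowercase names are local to
// this sketch.
const (
	wNOHANG    = 0x1
	wUNTRACED  = 0x2
	wCONTINUED = 0x8
	wALL       = 0x40000000
	wCLONE     = 0x80000000
)

// waitOptionsFromFlags is a hypothetical helper showing one way a wait4-style
// caller could populate WaitOptions for a specific child TID.
func waitOptionsFromFlags(pid ThreadID, flags uint32, blockErr error) *WaitOptions {
	opts := &WaitOptions{
		SpecificTID:  pid,
		Events:       EventExit,
		ConsumeEvent: true, // a WNOWAIT-style caller would leave this false
	}
	switch {
	case flags&wALL != 0:
		opts.NonCloneTasks, opts.CloneTasks = true, true
	case flags&wCLONE != 0:
		opts.CloneTasks = true
	default:
		opts.NonCloneTasks = true
	}
	if flags&wUNTRACED != 0 {
		opts.Events |= EventChildGroupStop
	}
	if flags&wCONTINUED != 0 {
		opts.Events |= EventGroupContinue
	}
	if flags&wNOHANG == 0 {
		opts.BlockInterruptErr = blockErr // e.g. an ERESTARTSYS-style error in a syscall path
	}
	return opts
}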
+func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { + if opts.BlockInterruptErr == nil { + return t.waitOnce(opts) + } + w, ch := waiter.NewChannelEntry(nil) + t.tg.eventQueue.EventRegister(&w, opts.Events) + defer t.tg.eventQueue.EventUnregister(&w) + for { + wr, err := t.waitOnce(opts) + if err != ErrNoWaitableEvent { + // This includes err == nil. + return wr, err + } + if err := t.Block(ch); err != nil { + return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + } + } +} + +func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { + anyWaitableTasks := false + + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + + if opts.SiblingChildren { + // We can wait on the children and tracees of any task in the + // same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + wr, any := t.waitParentLocked(opts, parent) + if wr != nil { + return wr, nil + } + anyWaitableTasks = anyWaitableTasks || any + } + } else { + // We can only wait on this task. + var wr *WaitResult + wr, anyWaitableTasks = t.waitParentLocked(opts, t) + if wr != nil { + return wr, nil + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { + anyWaitableTasks := false + + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns, false) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, anyWaitableTasks + } + } + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. (Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, anyWaitableTasks + } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns, true) { + continue + } + // Non-leaders do notify tracers on exit. 
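The Status values carried in WaitResult use the classic wait(2) encodings: the collectors below build group-stop statuses as (sig&0xff)<<8|0x7f and ptrace-stop statuses as code<<8|0x7f, and report continues as 0xffff. A short standalone sketch of the standard glibc-style decode (helper names invented here):

package main

import "fmt"

// Standard wait status predicates, as defined by glibc's <sys/wait.h>.
func wIfStopped(s uint32) bool    { return s&0xff == 0x7f }
func wStopSig(s uint32) uint32    { return (s >> 8) & 0xff }
func wIfContinued(s uint32) bool  { return s == 0xffff }
func wIfExited(s uint32) bool     { return s&0x7f == 0 }
func wExitStatus(s uint32) uint32 { return (s >> 8) & 0xff }

func main() {
	const sigstop = 19 // SIGSTOP on x86/ARM Linux
	stop := (uint32(sigstop)&0xff)<<8 | 0x7f
	fmt.Println(wIfStopped(stop), wStopSig(stop)) // true 19
	fmt.Println(wIfContinued(0xffff))             // true
	exit := uint32(2) << 8                        // exit status 2, standard encoding
	fmt.Println(wIfExited(exit), wExitStatus(exit)) // true 2
}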
+ if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + } + + return nil, anyWaitableTasks +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { + if asPtracer && !target.exitTracerNotified { + return nil + } + if !asPtracer && !target.exitParentNotified { + return nil + } + // Zombied thread group leaders are never waitable until their thread group + // is otherwise empty. Usually this is caught by the + // target.exitParentNotified check above, but if t is both (in the thread + // group of) target's tracer and parent, asPtracer may be true. + if target == target.tg.leader && target.tg.tasksCount != 1 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + status := target.exitStatus.Status() + if !opts.ConsumeEvent { + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } + } + // Surprisingly, the exit status reported by a non-consuming wait can + // differ from that reported by a consuming wait; the latter will return + // the group exit code if one is available. + if target.tg.exiting { + status = target.tg.exitStatus.Status() + } + // t may be (in the thread group of) target's parent, tracer, or both. We + // don't need to check for !exitTracerAcked because tracees are detached + // here, and we don't need to check for !exitParentAcked because zombies + // will be reaped here. + if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { + target.exitTracerAcked = true + target.ptraceTracer.Store((*Task)(nil)) + delete(t.ptraceTracees, target) + } + if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { + target.exitParentAcked = true + if target == target.tg.leader { + // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, + // and won't until after target.exitNotifyLocked() (maybe). Include + // target.CPUStats() explicitly. This is consistent with Linux, + // which accounts an exited task's cputime to its thread group in + // kernel/exit.c:release_task() => __exit_signal(), and uses + // thread_group_cputime_adjusted() in wait_task_zombie(). + t.tg.childCPUStats.Accumulate(target.CPUStats()) + t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) + t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) + // Update t's child max resident set size. The size will be the maximum + // of this thread's size and all its childrens' sizes. 
+ if t.tg.childMaxRSS < target.tg.maxRSS { + t.tg.childMaxRSS = target.tg.maxRSS + } + if t.tg.childMaxRSS < target.tg.childMaxRSS { + t.tg.childMaxRSS = target.tg.childMaxRSS + } + } + } + target.exitNotifyLocked(false) + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } +} + +// updateRSSLocked updates t.tg.maxRSS. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) updateRSSLocked() { + if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { + t.tg.maxRSS = mmMaxRSS + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupStopWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + sig := target.tg.groupStopSignal + if opts.ConsumeEvent { + target.tg.groupStopWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + // There is no name for these status constants. + Status: (uint32(sig)&0xff)<<8 | 0x7f, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupContWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + if opts.ConsumeEvent { + target.tg.groupContWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventGroupContinue, + Status: 0xffff, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + if target.ptraceCode == 0 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + code := target.ptraceCode + if opts.ConsumeEvent { + target.ptraceCode = 0 + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventTraceeStop, + Status: uint32(code)<<8 | 0x7f, + } +} + +// ExitState returns t's current progress through the exit path. +func (t *Task) ExitState() TaskExitState { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.exitState +} + +// ParentDeathSignal returns t's parent death signal. +func (t *Task) ParentDeathSignal() linux.Signal { + t.mu.Lock() + defer t.mu.Unlock() + return t.parentDeathSignal +} + +// SetParentDeathSignal sets t's parent death signal. +func (t *Task) SetParentDeathSignal(sig linux.Signal) { + t.mu.Lock() + defer t.mu.Unlock() + t.parentDeathSignal = sig +} diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go new file mode 100644 index 000000000..a53e77c9f --- /dev/null +++ b/pkg/sentry/kernel/task_futex.go @@ -0,0 +1,54 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Futex returns t's futex manager. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Futex() *futex.Manager { + return t.tc.fu +} + +// SwapUint32 implements futex.Target.SwapUint32. +func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. +func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// LoadUint32 implements futex.Target.LoadUint32. +func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { + return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// GetSharedKey implements futex.Target.GetSharedKey. +func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { + return t.MemoryManager().GetSharedFutexKey(t, addr) +} diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go new file mode 100644 index 000000000..0325967e4 --- /dev/null +++ b/pkg/sentry/kernel/task_identity.go @@ -0,0 +1,606 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Credentials returns t's credentials. +// +// This value must be considered immutable. +func (t *Task) Credentials() *auth.Credentials { + return t.creds.Load() +} + +// UserNamespace returns the user namespace associated with the task. +func (t *Task) UserNamespace() *auth.UserNamespace { + return t.Credentials().UserNamespace +} + +// HasCapabilityIn checks if the task has capability cp in user namespace ns. +func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { + return t.Credentials().HasCapabilityIn(cp, ns) +} + +// HasCapability checks if the task has capability cp in its user namespace. +func (t *Task) HasCapability(cp linux.Capability) bool { + return t.Credentials().HasCapability(cp) +} + +// SetUID implements the semantics of setuid(2). +func (t *Task) SetUID(uid auth.UID) error { + // setuid considers -1 to be invalid. 
+ if !uid.Ok() { + return syserror.EINVAL + } + + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + kuid := creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + // "setuid() sets the effective user ID of the calling process. If the + // effective UID of the caller is root (more precisely: if the caller has + // the CAP_SETUID capability), the real UID and saved set-user-ID are also + // set." - setuid(2) + if creds.HasCapability(linux.CAP_SETUID) { + t.setKUIDsUncheckedLocked(kuid, kuid, kuid) + return nil + } + // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID + // capability) and uid does not match the real UID or saved set-user-ID of + // the calling process." + if kuid != creds.RealKUID && kuid != creds.SavedKUID { + return syserror.EPERM + } + t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) + return nil +} + +// SetREUID implements the semantics of setreuid(2). +func (t *Task) SetREUID(r, e auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Supplying a value of -1 for either the real or effective user ID forces + // the system to leave that ID unchanged." - setreuid(2) + creds := t.Credentials() + newR := creds.RealKUID + if r.Ok() { + newR = creds.UserNamespace.MapToKUID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := creds.EffectiveKUID + if e.Ok() { + newE = creds.UserNamespace.MapToKUID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !creds.HasCapability(linux.CAP_SETUID) { + // "Unprivileged processes may only set the effective user ID to the + // real user ID, the effective user ID, or the saved set-user-ID." + if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { + return syserror.EPERM + } + // "Unprivileged users may only set the real user ID to the real user + // ID or the effective user ID." + if newR != creds.RealKUID && newR != creds.EffectiveKUID { + return syserror.EPERM + } + } + // "If the real user ID is set (i.e., ruid is not -1) or the effective user + // ID is set to a value not equal to the previous real user ID, the saved + // set-user-ID will be set to the new effective user ID." + newS := creds.SavedKUID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { + newS = newE + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESUID implements the semantics of the setresuid(2) syscall. +func (t *Task) SetRESUID(r, e, s auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Unprivileged user processes may change the real UID, effective UID, and + // saved set-user-ID, each to one of: the current real UID, the current + // effective UID or the current saved set-user-ID. Privileged processes (on + // Linux, those having the CAP_SETUID capability) may set the real UID, + // effective UID, and saved set-user-ID to arbitrary values. If one of the + // arguments equals -1, the corresponding value is not changed." - + // setresuid(2) + var err error + creds := t.Credentials() + newR := creds.RealKUID + if r.Ok() { + newR, err = creds.UseUID(r) + if err != nil { + return err + } + } + newE := creds.EffectiveKUID + if e.Ok() { + newE, err = creds.UseUID(e) + if err != nil { + return err + } + } + newS := creds.SavedKUID + if s.Ok() { + newS, err = creds.UseUID(s) + if err != nil { + return err + } + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// Preconditions: t.mu must be locked. 
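The saved set-user-ID rule quoted from setreuid(2) above is the subtle part, so here is a standalone sketch of just that rule, using plain ints with -1 meaning "leave unchanged" and omitting all capability and permission checks (names invented for illustration):

package main

import "fmt"

// applySetREUID mimics the setreuid(2) rule quoted above: -1 leaves an ID
// unchanged, and the saved set-user-ID is replaced with the new effective UID
// whenever the real UID is set, or the effective UID is set to something
// other than the previous real UID.
func applySetREUID(realUID, effUID, savedUID, r, e int) (int, int, int) {
	newR, newE, newS := realUID, effUID, savedUID
	if r != -1 {
		newR = r
	}
	if e != -1 {
		newE = e
	}
	if r != -1 || (e != -1 && newE != realUID) {
		newS = newE
	}
	return newR, newE, newS
}

func main() {
	// Temporary drop: effective becomes 1000, saved stays 0, so the process
	// can later restore root with setreuid(-1, 0).
	fmt.Println(applySetREUID(1000, 0, 0, -1, 1000)) // 1000 1000 0
	// Permanent drop: setting the real UID also rewrites the saved set-user-ID.
	fmt.Println(applySetREUID(1000, 0, 0, 1000, 1000)) // 1000 1000 1000
}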
+func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + root := creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID + creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS + + // "1. If one or more of the real, effective or saved set user IDs was + // previously 0, and as a result of the UID changes all of these IDs have a + // nonzero value, then all capabilities are cleared from the permitted and + // effective capability sets." - capabilities(7) + if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { + // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's + // "keep capabilities" flag, which determines whether the thread's permitted + // capability set is cleared when a change is made to the + // thread's user IDs such that the thread's real UID, effective + // UID, and saved set-user-ID all become nonzero when at least + // one of them previously had the value 0. By default, the + // permitted capability set is cleared when such a change is + // made; setting the "keep capabilities" flag prevents it from + // being cleared." (A thread's effective capability set is always + // cleared when such a credential change is made, + // regardless of the setting of the "keep capabilities" flag.) + if !creds.KeepCaps { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 + } + } + // """ + // 2. If the effective user ID is changed from 0 to nonzero, then all + // capabilities are cleared from the effective set. + // + // 3. If the effective user ID is changed from nonzero to 0, then the + // permitted set is copied to the effective set. + // """ + if oldE == root && newE != root { + creds.EffectiveCaps = 0 + } else if oldE != root && newE == root { + creds.EffectiveCaps = creds.PermittedCaps + } + // "4. If the filesystem user ID is changed from 0 to nonzero (see + // setfsuid(2)), then the following capabilities are cleared from the + // effective set: ..." + // (filesystem UIDs aren't implemented, nor are any of the capabilities in + // question) + + if oldE != newE { + // "[dumpability] is reset to the current value contained in + // the file /proc/sys/fs/suid_dumpable (which by default has + // the value 0), in the following circumstances: The process's + // effective user or group ID is changed." - prctl(2) + // + // (suid_dumpable isn't implemented, so we just use the + // default. + t.MemoryManager().SetDumpability(mm.NotDumpable) + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + t.parentDeathSignal = 0 + } + t.creds.Store(creds) +} + +// SetGID implements the semantics of setgid(2). +func (t *Task) SetGID(gid auth.GID) error { + if !gid.Ok() { + return syserror.EINVAL + } + + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + if creds.HasCapability(linux.CAP_SETGID) { + t.setKGIDsUncheckedLocked(kgid, kgid, kgid) + return nil + } + if kgid != creds.RealKGID && kgid != creds.SavedKGID { + return syserror.EPERM + } + t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) + return nil +} + +// SetREGID implements the semantics of setregid(2). 
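The UID-change rules quoted from capabilities(7) above can be made concrete with a standalone sketch that treats capability sets as plain bitmasks (these are not the sentry's auth types; root is modeled as UID 0, and the capability bit values are only illustrative):

package main

import "fmt"

// capTransitionOnUIDChange applies the quoted rules: rule 1 clears permitted
// and effective (unless "keep capabilities" is set) when root is lost from
// all of real/effective/saved, and rules 2 and 3 adjust the effective set
// when the effective UID leaves or becomes root.
func capTransitionOnUIDChange(permitted, effective uint64, keepCaps bool, oldR, oldE, oldS, newR, newE, newS uint32) (uint64, uint64) {
	const root = 0
	if (oldR == root || oldE == root || oldS == root) && newR != root && newE != root && newS != root {
		if !keepCaps {
			permitted = 0
			effective = 0
		}
	}
	if oldE == root && newE != root {
		effective = 0
	} else if oldE != root && newE == root {
		effective = permitted
	}
	return permitted, effective
}

func main() {
	const someCaps = 0x1 | 0x200000 // CAP_CHOWN (bit 0) and CAP_SYS_ADMIN (bit 21)
	// setuid(1000) from root without PR_SET_KEEPCAPS: everything is cleared.
	fmt.Println(capTransitionOnUIDChange(someCaps, someCaps, false, 0, 0, 0, 1000, 1000, 1000)) // 0 0
	// The same transition with "keep capabilities": permitted survives, but
	// the effective set is still cleared by rule 2.
	fmt.Println(capTransitionOnUIDChange(someCaps, someCaps, true, 0, 0, 0, 1000, 1000, 1000)) // 2097153 0
}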
+func (t *Task) SetREGID(r, e auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + newR := creds.RealKGID + if r.Ok() { + newR = creds.UserNamespace.MapToKGID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := creds.EffectiveKGID + if e.Ok() { + newE = creds.UserNamespace.MapToKGID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !creds.HasCapability(linux.CAP_SETGID) { + if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { + return syserror.EPERM + } + if newR != creds.RealKGID && newR != creds.EffectiveKGID { + return syserror.EPERM + } + } + newS := creds.SavedKGID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { + newS = newE + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESGID implements the semantics of the setresgid(2) syscall. +func (t *Task) SetRESGID(r, e, s auth.GID) error { + var err error + + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + newR := creds.RealKGID + if r.Ok() { + newR, err = creds.UseGID(r) + if err != nil { + return err + } + } + newE := creds.EffectiveKGID + if e.Ok() { + newE, err = creds.UseGID(e) + if err != nil { + return err + } + } + newS := creds.SavedKGID + if s.Ok() { + newS, err = creds.UseGID(s) + if err != nil { + return err + } + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + oldE := creds.EffectiveKGID + creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS + + if oldE != newE { + // "[dumpability] is reset to the current value contained in + // the file /proc/sys/fs/suid_dumpable (which by default has + // the value 0), in the following circumstances: The process's + // effective user or group ID is changed." - prctl(2) + // + // (suid_dumpable isn't implemented, so we just use the + // default. + t.MemoryManager().SetDumpability(mm.NotDumpable) + + // Not documented, but compare Linux's + // kernel/cred.c:commit_creds(). + t.parentDeathSignal = 0 + } + t.creds.Store(creds) +} + +// SetExtraGIDs attempts to change t's supplemental groups. All IDs are +// interpreted as being in t's user namespace. +func (t *Task) SetExtraGIDs(gids []auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETGID) { + return syserror.EPERM + } + kgids := make([]auth.KGID, len(gids)) + for i, gid := range gids { + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + kgids[i] = kgid + } + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.ExtraKGIDs = kgids + t.creds.Store(creds) + return nil +} + +// SetCapabilitySets attempts to change t's permitted, inheritable, and +// effective capability sets. +func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Permitted: This is a limiting superset for the effective capabilities + // that the thread may assume." - capabilities(7) + if effective & ^permitted != 0 { + return syserror.EPERM + } + creds := t.Credentials() + // "It is also a limiting superset for the capabilities that may be added + // to the inheritable set by a thread that does not have the CAP_SETPCAP + // capability in its effective set." 
+ if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { + return syserror.EPERM + } + // "If a thread drops a capability from its permitted set, it can never + // reacquire that capability (unless it execve(2)s ..." + if permitted & ^creds.PermittedCaps != 0 { + return syserror.EPERM + } + // "... if a capability is not in the bounding set, then a thread can't add + // this capability to its inheritable set, even if it was in its permitted + // capabilities ..." + if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { + return syserror.EPERM + } + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.PermittedCaps = permitted + creds.InheritableCaps = inheritable + creds.EffectiveCaps = effective + t.creds.Store(creds) + return nil +} + +// DropBoundingCapability attempts to drop capability cp from t's capability +// bounding set. +func (t *Task) DropBoundingCapability(cp linux.Capability) error { + t.mu.Lock() + defer t.mu.Unlock() + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETPCAP) { + return syserror.EPERM + } + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + t.creds.Store(creds) + return nil +} + +// SetUserNamespace attempts to move c into ns. +func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + // "A process reassociating itself with a user namespace must have the + // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) + // + // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN + // in ns (by rule 3 in auth.Credentials.HasCapability). + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + return syserror.EPERM + } + + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.UserNamespace = ns + // "The child process created by clone(2) with the CLONE_NEWUSER flag + // starts out with a complete set of capabilities in the new user + // namespace. Likewise, a process that creates a new user namespace using + // unshare(2) or joins an existing user namespace using setns(2) gains a + // full set of capabilities in that namespace." + creds.PermittedCaps = auth.AllCapabilities + creds.InheritableCaps = 0 + creds.EffectiveCaps = auth.AllCapabilities + creds.BoundingCaps = auth.AllCapabilities + // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER + // flag sets the "securebits" flags (see capabilities(7)) to their default + // values (all flags disabled) in the child (for clone(2)) or caller (for + // unshare(2), or setns(2)." - user_namespaces(7) + creds.KeepCaps = false + t.creds.Store(creds) + + return nil +} + +// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. +func (t *Task) SetKeepCaps(k bool) { + t.mu.Lock() + defer t.mu.Unlock() + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + creds.KeepCaps = k + t.creds.Store(creds) +} + +// updateCredsForExecLocked updates t.creds to reflect an execve(). +// +// NOTE(b/30815691): We currently do not implement privileged executables +// (set-user/group-ID bits and file capabilities). This allows us to make a lot +// of simplifying assumptions: +// +// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which +// disables the features we don't support anyway, is always set. 
This +// drastically simplifies this function. +// +// - We don't set AT_SECURE = 1, because no_new_privs always being set means +// that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's +// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) +// +// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since +// seccomp-bpf is also allowed if the task has no_new_privs set. +// +// - Task.ptraceAttach does not serialize with execve as it does in Linux, +// since no_new_privs being set has the same effect as the presence of an +// unprivileged tracer. +// +// Preconditions: t.mu must be locked. +func (t *Task) updateCredsForExecLocked() { + // """ + // During an execve(2), the kernel calculates the new capabilities of + // the process using the following algorithm: + // + // P'(permitted) = (P(inheritable) & F(inheritable)) | + // (F(permitted) & cap_bset) + // + // P'(effective) = F(effective) ? P'(permitted) : 0 + // + // P'(inheritable) = P(inheritable) [i.e., unchanged] + // + // where: + // + // P denotes the value of a thread capability set before the + // execve(2) + // + // P' denotes the value of a thread capability set after the + // execve(2) + // + // F denotes a file capability set + // + // cap_bset is the value of the capability bounding set + // + // ... + // + // In order to provide an all-powerful root using capability sets, during + // an execve(2): + // + // 1. If a set-user-ID-root program is being executed, or the real user ID + // of the process is 0 (root) then the file inheritable and permitted sets + // are defined to be all ones (i.e. all capabilities enabled). + // + // 2. If a set-user-ID-root program is being executed, then the file + // effective bit is defined to be one (enabled). + // + // The upshot of the above rules, combined with the capabilities + // transformations described above, is that when a process execve(2)s a + // set-user-ID-root program, or when a process with an effective UID of 0 + // execve(2)s a program, it gains all capabilities in its permitted and + // effective capability sets, except those masked out by the capability + // bounding set. + // """ - capabilities(7) + // (ambient capability sets omitted) + // + // As the last paragraph implies, the case of "a set-user-ID root program + // is being executed" also includes the case where (namespace) root is + // executing a non-set-user-ID program; the actual check is just based on + // the effective user ID. + var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 + fileEffective := false + creds := t.Credentials() + root := creds.UserNamespace.MapToKUID(auth.RootUID) + if creds.EffectiveKUID == root || creds.RealKUID == root { + newPermitted = creds.InheritableCaps | creds.BoundingCaps + if creds.EffectiveKUID == root { + fileEffective = true + } + } + + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + + // Now we enter poorly-documented, somewhat confusing territory. (The + // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds + // is not very helpful.) My reading of it is: + // + // If at least one of the following is true: + // + // A1. The execing task is ptraced, and the tracer did not have + // CAP_SYS_PTRACE in the execing task's user namespace at the time of + // PTRACE_ATTACH. + // + // A2. The execing task shares its FS context with at least one task in + // another thread group. + // + // A3. The execing task has no_new_privs set. 
+ // + // AND at least one of the following is true: + // + // B1. The new effective user ID (which may come from set-user-ID, or be the + // execing task's existing effective user ID) is not equal to the task's + // real UID. + // + // B2. The new effective group ID (which may come from set-group-ID, or be + // the execing task's existing effective group ID) is not equal to the + // task's real GID. + // + // B3. The new permitted capability set contains capabilities not in the + // task's permitted capability set. + // + // Then: + // + // C1. Limit the new permitted capability set to the task's permitted + // capability set. + // + // C2. If either the task does not have CAP_SETUID in its user namespace, or + // the task has no_new_privs set, force the new effective UID and GID to + // the task's real UID and GID. + // + // But since no_new_privs is always set (A3 is always true), this becomes + // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 + // is a no-op. So we can just do C1 and C2 unconditionally. + if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID + t.parentDeathSignal = 0 + } + // (Saved set-user-ID is always set to the new effective user ID, and saved + // set-group-ID is always set to the new effective group ID, regardless of + // the above.) + creds.SavedKUID = creds.RealKUID + creds.SavedKGID = creds.RealKGID + creds.PermittedCaps &= newPermitted + if fileEffective { + creds.EffectiveCaps = creds.PermittedCaps + } else { + creds.EffectiveCaps = 0 + } + + // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent + // calls to execve(2). + creds.KeepCaps = false + + // "The bounding set is inherited at fork(2) from the thread's parent, and + // is preserved across an execve(2)". So we're done. + t.creds.Store(creds) +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go new file mode 100644 index 000000000..eeccaa197 --- /dev/null +++ b/pkg/sentry/kernel/task_log.go @@ -0,0 +1,208 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "runtime/trace" + "sort" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + // maxStackDebugBytes is the maximum number of user stack bytes that may be + // printed by debugDumpStack. + maxStackDebugBytes = 1024 +) + +// Infof logs an formatted info message by calling log.Infof. +func (t *Task) Infof(fmt string, v ...interface{}) { + if log.IsLogging(log.Info) { + log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) + } +} + +// Warningf logs a warning string by calling log.Warningf. +func (t *Task) Warningf(fmt string, v ...interface{}) { + if log.IsLogging(log.Warning) { + log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) + } +} + +// Debugf creates a debug string that includes the task ID. 
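The execve() capability calculation in updateCredsForExecLocked above reduces to a few lines once file capabilities are assumed absent. The following standalone sketch (invented names, illustrative bounding set) shows that transformation, including the no_new_privs limiting step:

package main

import "fmt"

// execCaps mirrors the transformation above when F(permitted) and
// F(inheritable) are zero: only a root real or effective UID yields a nonzero
// permitted set, the file effective bit follows the effective UID, and the
// result is additionally limited to the pre-exec permitted set because
// no_new_privs is assumed set.
func execCaps(oldPermitted, inheritable, bounding uint64, realIsRoot, effIsRoot bool) (permitted, effective uint64) {
	var newPermitted uint64
	fileEffective := false
	if realIsRoot || effIsRoot {
		newPermitted = inheritable | bounding
		fileEffective = effIsRoot
	}
	permitted = oldPermitted & newPermitted
	if fileEffective {
		effective = permitted
	}
	return permitted, effective
}

func main() {
	const fullSet = 0x1fffffffff // an illustrative "all capabilities" mask
	fmt.Println(execCaps(fullSet, 0, fullSet, true, true))   // root keeps everything
	fmt.Println(execCaps(fullSet, 0, fullSet, false, false)) // everyone else gets nothing
}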
+func (t *Task) Debugf(fmt string, v ...interface{}) { + if log.IsLogging(log.Debug) { + log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) + } +} + +// IsLogging returns true iff this level is being logged. +func (t *Task) IsLogging(level log.Level) bool { + return log.IsLogging(level) +} + +// DebugDumpState logs task state at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) DebugDumpState() { + t.debugDumpRegisters() + t.debugDumpStack() + if mm := t.MemoryManager(); mm != nil { + t.Debugf("Mappings:\n%s", mm) + } + t.Debugf("FDTable:\n%s", t.fdTable) +} + +// debugDumpRegisters logs register state at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpRegisters() { + if !t.IsLogging(log.Debug) { + return + } + regmap, err := t.Arch().RegisterMap() + if err != nil { + t.Debugf("Registers: %v", err) + } else { + t.Debugf("Registers:") + var regs []string + for reg := range regmap { + regs = append(regs, reg) + } + sort.Strings(regs) + for _, reg := range regs { + t.Debugf("%-8s = %016x", reg, regmap[reg]) + } + } +} + +// debugDumpStack logs user stack contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpStack() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application stack dump.") + return + } + t.Debugf("Stack:") + start := usermem.Addr(t.Arch().Stack()) + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + +// trace definitions. +// +// Note that all region names are prefixed by ':' in order to ensure that they +// are lexically ordered before all system calls, which use the naked system +// call name (e.g. "read") for maximum clarity. +const ( + traceCategory = "task" + runRegion = ":run" + blockRegion = ":block" + cpuidRegion = ":cpuid" + faultRegion = ":fault" +) + +// updateInfoLocked updates the task's cached log prefix and tracing +// information to reflect its current thread ID. +// +// Preconditions: The task's owning TaskSet.mu must be locked. +func (t *Task) updateInfoLocked() { + // Use the task's TID in the root PID namespace for logging. + tid := t.tg.pidns.owner.Root.tids[t] + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.rebuildTraceContext(tid) +} + +// rebuildTraceContext rebuilds the trace context. +// +// Precondition: the passed tid must be the tid in the root namespace. +func (t *Task) rebuildTraceContext(tid ThreadID) { + // Re-initialize the trace context. + if t.traceTask != nil { + t.traceTask.End() + } + + // Note that we define the "task type" to be the dynamic TID. This does + // not align perfectly with the documentation for "tasks" in the + // tracing package. Tasks may be assumed to be bounded by analysis + // tools. 
However, if we just use a generic "task" type here, then the + // "user-defined tasks" page on the tracing dashboard becomes nearly + // unusable, as it loads all traces from all tasks. + // + // We can assume that the number of tasks in the system is not + // arbitrarily large (in general it won't be, especially for cases + // where we're collecting a brief profile), so using the TID is a + // reasonable compromise in this case. + t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) +} + +// traceCloneEvent is called when a new task is spawned. +// +// ntid must be the new task's ThreadID in the root namespace. +func (t *Task) traceCloneEvent(ntid ThreadID) { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) +} + +// traceExitEvent is called when a task exits. +func (t *Task) traceExitEvent() { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) +} + +// traceExecEvent is called when a task calls exec. +func (t *Task) traceExecEvent(tc *TaskContext) { + if !trace.IsEnabled() { + return + } + file := tc.MemoryManager.Executable() + if file == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") + return + } + defer file.DecRef() + trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) +} diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go new file mode 100644 index 000000000..f7711232c --- /dev/null +++ b/pkg/sentry/kernel/task_net.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/inet" +) + +// IsNetworkNamespaced returns true if t is in a non-root network namespace. +func (t *Task) IsNetworkNamespaced() bool { + t.mu.Lock() + defer t.mu.Unlock() + return !t.netns.IsRoot() +} + +// NetworkContext returns the network stack used by the task. NetworkContext +// may return nil if no network stack is available. +// +// TODO(gvisor.dev/issue/1833): Migrate callers of this method to +// NetworkNamespace(). +func (t *Task) NetworkContext() inet.Stack { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns.Stack() +} + +// NetworkNamespace returns the network namespace observed by the task. +func (t *Task) NetworkNamespace() *inet.Namespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns +} diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go new file mode 100644 index 000000000..d654dd997 --- /dev/null +++ b/pkg/sentry/kernel/task_run.go @@ -0,0 +1,380 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"bytes"
+	"runtime"
+	"runtime/trace"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils to avoid unnecessary allocation.
+type taskRunState interface {
+	// execute executes the code associated with this state over the given task
+	// and returns the following state. If execute returns nil, the task
+	// goroutine should exit.
+	//
+	// It is valid to tail-call a following state's execute to avoid the
+	// overhead of converting the following state to an interface object and
+	// checking for stops, provided that the tail-call cannot recurse.
+	execute(*Task) taskRunState
+}
+
+// run runs the task goroutine.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace
+// to make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
+func (t *Task) run(threadID uintptr) {
+	// Construct t.blockingTimer here. We do this here because we can't
+	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
+	// kernel.timekeeper.SetClocks() hasn't been called yet.
+	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+	defer t.blockingTimer.Destroy()
+	t.blockingTimerChan = blockingTimerChan
+
+	// Activate our address space.
+	t.Activate()
+	// The corresponding t.Deactivate occurs in the exit path
+	// (runExitMain.execute) so that when
+	// Platform.CooperativelySharesAddressSpace() == true, we give up the
+	// AddressSpace before the task goroutine finishes executing.
+
+	// If this is a newly-started task, it should check for participation in
+	// group stops. If this is a task resuming after restore, it was
+	// interrupted by saving. In either case, the task is initially
+	// interrupted.
+	t.interruptSelf()
+
+	for {
+		// Explanation for this ordering:
+		//
+		// - A freshly-started task that is stopped should not do anything
+		// before it enters the stop.
+		//
+		// - If taskRunState.execute returns nil, the task goroutine should
+		// exit without checking for a stop.
+		//
+		// - Task.Start won't start Task.run if t.runState is nil, so this
+		// ordering is safe.
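The reified-state pattern described above, including the "typecast nils" trick, can be illustrated with a self-contained toy loop (unrelated to the sentry's actual states; names invented here):

package main

import "fmt"

// runState is a toy version of taskRunState: each state runs and returns the
// next state, and a nil return ends the loop.
type runState interface {
	execute() runState
}

type stateA struct{}
type stateB struct{}

// The states carry no data, so typed nil pointers are enough; the methods
// never dereference their receivers.
func (*stateA) execute() runState { fmt.Println("A"); return (*stateB)(nil) }
func (*stateB) execute() runState { fmt.Println("B"); return nil }

func main() {
	var s runState = (*stateA)(nil)
	for s != nil {
		s = s.execute()
	}
}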
+ t.doStop() + t.runState = t.runState.execute(t) + if t.runState == nil { + t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) + t.goroutineStopped.Done() + t.tg.liveGoroutines.Done() + t.tg.pidns.owner.liveGoroutines.Done() + t.tg.pidns.owner.runningGoroutines.Done() + t.p.Release() + + // Keep argument alive because stack trace for dead variables may not be correct. + runtime.KeepAlive(threadID) + return + } + } +} + +// doStop is called by Task.run to block until the task is not stopped. +func (t *Task) doStop() { + if atomic.LoadInt32(&t.stopCount) == 0 { + return + } + t.Deactivate() + // NOTE(b/30316266): t.Activate() must be called without any locks held, so + // this defer must precede the defer for unlocking the signal mutex. + defer t.Activate() + t.accountTaskGoroutineEnter(TaskGoroutineStopped) + defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.runningGoroutines.Add(-1) + defer t.tg.pidns.owner.runningGoroutines.Add(1) + t.goroutineStopped.Add(-1) + defer t.goroutineStopped.Add(1) + for t.stopCount > 0 { + t.endStopCond.Wait() + } +} + +func (*runApp) handleCPUIDInstruction(t *Task) error { + if len(arch.CPUIDInstruction) == 0 { + // CPUID emulation isn't supported, but this code can be + // executed, because the ptrace platform returns + // ErrContextSignalCPUID on page faults too. Look at + // pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more + // details. + return platform.ErrContextSignal + } + // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) + expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() + + return nil + } + region.End() // Not an actual CPUID, but required copy-in. + return platform.ErrContextSignal +} + +// The runApp state checks for interrupts before executing untrusted +// application code. +// +// +stateify savable +type runApp struct{} + +func (app *runApp) execute(t *Task) taskRunState { + if t.interrupted() { + // Checkpointing instructs tasks to stop by sending an interrupt, so we + // must check for stops before entering runInterrupt (instead of + // tail-calling it). + return (*runInterrupt)(nil) + } + + // We're about to switch to the application again. If there's still a + // unhandled SyscallRestartErrno that wasn't translated to an EINTR, + // restart the syscall that was interrupted. If there's a saved signal + // mask, restore it. (Note that restoring the saved signal mask may unblock + // a pending signal, causing another interruption, but that signal should + // not interact with the interrupted syscall.) 
+ if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == ERESTART_RESTARTBLOCK { + t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscallWithRestartBlock() + } else { + t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscall() + } + } + t.haveSyscallReturn = false + } + if t.haveSavedSignalMask { + t.SetSignalMask(t.savedSignalMask) + t.haveSavedSignalMask = false + if t.interrupted() { + return (*runInterrupt)(nil) + } + } + + // Apply restartable sequences. + if t.rseqPreempted { + t.rseqPreempted = false + if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 { + // Linux writes the CPU on every preemption. We only do + // so if it changed. Thus we may delay delivery of + // SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid. + cpu := int32(hostcpu.GetCPU()) + if t.rseqCPU != cpu { + t.rseqCPU = cpu + if err := t.rseqCopyOutCPU(); err != nil { + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + if err := t.oldRSeqCopyOutCPU(); err != nil { + t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + } + } + t.rseqInterrupt() + } + + // Check if we need to enable single-stepping. Tracers expect that the + // kernel preserves the value of the single-step flag set by PTRACE_SETREGS + // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this + // includes our ptrace platform, by the way), so we should only clear the + // single-step flag if we're responsible for setting it. (clearSinglestep + // is therefore analogous to Linux's TIF_FORCED_TF.) + // + // Strictly speaking, we should also not clear the single-step flag if we + // single-step through an instruction that sets the single-step flag + // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their + // own TF. (Famous last words, I know.) + clearSinglestep := false + if t.hasTracer() { + t.tg.pidns.owner.mu.RLock() + if t.ptraceSinglestep { + clearSinglestep = !t.Arch().SingleStep() + t.Arch().SetSingleStep() + } + t.tg.pidns.owner.mu.RUnlock() + } + + region := trace.StartRegion(t.traceContext, runRegion) + t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) + info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + region.End() + + if clearSinglestep { + t.Arch().ClearSingleStep() + } + + switch err { + case nil: + // Handle application system call. + return t.doSyscall() + + case platform.ErrContextInterrupt: + // Interrupted by platform.Context.Interrupt(). Re-enter the run + // loop to figure out why. + return (*runApp)(nil) + + case platform.ErrContextSignalCPUID: + if err := app.handleCPUIDInstruction(t); err == nil { + // Resume execution. + return (*runApp)(nil) + } + + // The instruction at the given RIP was not a CPUID, and we + // fallthrough to the default signal deliver behavior below. + fallthrough + + case platform.ErrContextSignal: + // Looks like a signal has been delivered to us. 
If it's a synchronous + // signal (SEGV, SIGBUS, etc.), it should be sent to the application + // thread that received it. + sig := linux.Signal(info.Signo) + + // Was it a fault that we should handle internally? If so, this wasn't + // an application-generated signal and we should continue execution + // normally. + if at.Any() { + region := trace.StartRegion(t.traceContext, faultRegion) + addr := usermem.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + region.End() + if err == nil { + // The fault was handled appropriately. + // We can resume running the application. + return (*runApp)(nil) + } + + // Is this a vsyscall that we need emulate? + // + // Note that we don't track vsyscalls as part of a + // specific trace region. This is because regions don't + // stack, and the actual system call will count as a + // region. We should be able to easily identify + // vsyscalls by having a <fault><syscall> pair. + if at.Execute { + if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + return t.doVsyscall(addr, sysno) + } + } + + // Faults are common, log only at debug level. + t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + t.DebugDumpState() + + // Continue to signal handling. + // + // Convert a BusError error to a SIGBUS from a SIGSEGV. All + // other info bits stay the same (address, etc.). + if _, ok := err.(*memmap.BusError); ok { + sig = linux.SIGBUS + info.Signo = int32(linux.SIGBUS) + } + } + + switch sig { + case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: + // Synchronous signal. Send it to ourselves. Assume the signal is + // legitimate and force it (work around the signal being ignored or + // blocked) like Linux does. Conveniently, this is even the correct + // behavior for SIGTRAP from single-stepping. + t.forceSignal(linux.Signal(sig), false /* unconditional */) + t.SendSignal(info) + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case linux.SIGPROF: + // It's a profiling interrupt: there's not much + // we can do. We've already paid a decent cost + // by intercepting the signal, at this point we + // simply ignore it. + + default: + // Asynchronous signal. Let the system deal with it. + t.k.sendExternalSignal(info, "application") + } + + return (*runApp)(nil) + + case platform.ErrContextCPUPreempted: + // Ensure that rseq critical sections are interrupted and per-thread + // CPU values are updated before the next platform.Context.Switch(). + t.rseqPreempted = true + return (*runApp)(nil) + + default: + // What happened? Can't continue. + t.Warningf("Unexpected SwitchToApp error: %v", err) + t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)}) + return (*runExit)(nil) + } +} + +// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. +func (t *Task) waitGoroutineStoppedOrExited() { + t.goroutineStopped.Wait() +} + +// WaitExited blocks until all task goroutines in tg have exited. +// +// WaitExited does not correspond to anything in Linux; it's provided so that +// external callers of Kernel.CreateProcess can wait for the created thread +// group to terminate. +func (tg *ThreadGroup) WaitExited() { + tg.liveGoroutines.Wait() +} + +// Yield yields the processor for the calling task. 
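+//
+// The yield is recorded as a voluntary context switch in the task's CPU stats
+// (see CPUStats), and the underlying runtime.Gosched() call gives other task
+// goroutines a chance to run; sched_yield(2) can be implemented as
+// essentially just a call to t.Yield().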
+func (t *Task) Yield() { + atomic.AddUint64(&t.yieldCount, 1) + runtime.Gosched() +} diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go new file mode 100644 index 000000000..09366b60c --- /dev/null +++ b/pkg/sentry/kernel/task_sched.go @@ -0,0 +1,668 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// CPU scheduling, real and fake. + +import ( + "fmt" + "math/rand" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TaskGoroutineState is a coarse representation of the current execution +// status of a kernel.Task goroutine. +type TaskGoroutineState int + +const ( + // TaskGoroutineNonexistent indicates that the task goroutine has either + // not yet been created by Task.Start() or has returned from Task.run(). + // This must be the zero value for TaskGoroutineState. + TaskGoroutineNonexistent TaskGoroutineState = iota + + // TaskGoroutineRunningSys indicates that the task goroutine is executing + // sentry code. + TaskGoroutineRunningSys + + // TaskGoroutineRunningApp indicates that the task goroutine is executing + // application code. + TaskGoroutineRunningApp + + // TaskGoroutineBlockedInterruptible indicates that the task goroutine is + // blocked in Task.block(), and hence may be woken by Task.interrupt() + // (e.g. due to signal delivery). + TaskGoroutineBlockedInterruptible + + // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is + // stopped outside of Task.block() and Task.doStop(), and hence cannot be + // woken by Task.interrupt(). + TaskGoroutineBlockedUninterruptible + + // TaskGoroutineStopped indicates that the task goroutine is blocked in + // Task.doStop(). TaskGoroutineStopped is similar to + // TaskGoroutineBlockedUninterruptible, but is a separate state to make it + // possible to determine when Task.stop is meaningful. + TaskGoroutineStopped +) + +// TaskGoroutineSchedInfo contains task goroutine scheduling state which must +// be read and updated atomically. +// +// +stateify savable +type TaskGoroutineSchedInfo struct { + // Timestamp was the value of Kernel.cpuClock when this + // TaskGoroutineSchedInfo was last updated. + Timestamp uint64 + + // State is the current state of the task goroutine. + State TaskGoroutineState + + // UserTicks is the amount of time the task goroutine has spent executing + // its associated Task's application code, in units of linux.ClockTick. + UserTicks uint64 + + // SysTicks is the amount of time the task goroutine has spent executing in + // the sentry, in units of linux.ClockTick. + SysTicks uint64 +} + +// userTicksAt returns the extrapolated value of ts.UserTicks after +// Kernel.CPUClockNow() indicates a time of now. 
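+//
+// For example, with UserTicks=10, Timestamp=95, State=TaskGoroutineRunningApp
+// and now=100, userTicksAt(now) returns 15: the 5 ticks elapsed since the
+// last update are attributed to application time without mutating ts.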
+// +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause userTicksAt to adjust stats by too much, +// making the observed stats non-monotonic. +func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { + // Update stats to reflect execution since the last update. + return ts.UserTicks + (now - ts.Timestamp) + } + return ts.UserTicks +} + +// sysTicksAt returns the extrapolated value of ts.SysTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: As for userTicksAt. +func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { + return ts.SysTicks + (now - ts.Timestamp) + } + return ts.SysTicks +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.gosched.State = state + t.goschedSeq.EndWrite() + + if state != TaskGoroutineRunningApp { + // Task is blocking/stopping. + t.k.decRunningTasks() + } +} + +// Preconditions: The caller must be running on the task goroutine, and leaving +// a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). +func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + if state != TaskGoroutineRunningApp { + // Task is unblocking/continuing. + t.k.incRunningTasks() + } + + now := t.k.CPUClockNow() + if t.gosched.State != state { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + if state == TaskGoroutineRunningApp { + t.gosched.UserTicks += now - t.gosched.Timestamp + } + t.gosched.Timestamp = now + t.gosched.State = TaskGoroutineRunningSys + t.goschedSeq.EndWrite() +} + +// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. +// Most clients should use t.CPUStats() instead. +func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { + return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) +} + +// CPUStats returns the CPU usage statistics of t. +func (t *Task) CPUStats() usage.CPUStats { + return t.cpuStatsAt(t.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. +func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { + tsched := t.TaskGoroutineSchedInfo() + return usage.CPUStats{ + UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), + VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), + } +} + +// CPUStats returns the combined CPU usage statistics of all past and present +// threads in tg. +func (tg *ThreadGroup) CPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + // Hack to get a pointer to the Kernel. 
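+ // (The thread group itself does not hold a Kernel reference, so it is read
+ // through the leader task; a nil leader means no task has ever run in this
+ // thread group and all of its CPU stats are zero.)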
+ if tg.leader == nil { + // Per comment on tg.leader, this is only possible if nothing in the + // ThreadGroup has ever executed anyway. + return usage.CPUStats{} + } + return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex +// must be locked. +func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { + stats := tg.exitedCPUStats + // Account for live tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + stats.Accumulate(t.cpuStatsAt(now)) + } + return stats +} + +// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return +// resource usage statistics for all children of [tg] that have terminated and +// been waited for. These statistics will include the resources used by +// grandchildren, and further removed descendants, if all of the intervening +// descendants waited on their terminated children." +func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.childCPUStats +} + +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. +// +// +stateify savable +type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. +// +// +stateify savable +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// Now implements ktime.Clock.Now. 
+func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // Thread group CPU time should not exceed wall time * live tasks, since + // task goroutines exit after the transition to TaskExitZombie in + // runExitNotify. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.liveTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. +func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: false} +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: true} +} + +type kernelCPUClockTicker struct { + k *Kernel + + // These are essentially kernelCPUClockTicker.Notify local variables that + // are cached between calls to reduce allocations. + rng *rand.Rand + tgs []*ThreadGroup +} + +func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { + return &kernelCPUClockTicker{ + k: k, + rng: rand.New(rand.NewSource(rand.Int63())), + } +} + +// Notify implements ktime.TimerListener.Notify. +func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + // Only increment cpuClock by 1 regardless of the number of expirations. + // This approximately compensates for cases where thread throttling or bad + // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and + // presumably task goroutines as well, from executing for a long period of + // time. It's also necessary to prevent CPU clocks from seeing large + // discontinuous jumps. + now := atomic.AddUint64(&ticker.k.cpuClock, 1) + + // Check thread group CPU timers. + tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs) + for _, tg := range tgs { + if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 { + continue + } + + ticker.k.tasks.mu.RLock() + if tg.leader == nil { + // No tasks have ever run in this thread group. + ticker.k.tasks.mu.RUnlock() + continue + } + // Accumulate thread group CPU stats, and randomly select running tasks + // using reservoir sampling to receive CPU timer signals. 
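+ // Reservoir sampling of size one: candidate i replaces the current pick
+ // with probability 1/i, so after the loop every eligible task has been
+ // chosen with probability 1/N without needing to know N in advance. A
+ // minimal sketch of the same idea (illustrative names):
+ //
+ //	pick, seen := (*Task)(nil), int32(0)
+ //	for _, c := range candidates {
+ //		seen++
+ //		if randInt31n(rng, seen) == 0 {
+ //			pick = c
+ //		}
+ //	}
+ //
+ // The loop below runs this selection twice in one pass: once for
+ // ITIMER_VIRTUAL candidates and once for ITIMER_PROF/RLIMIT_CPU candidates.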
+ var virtReceiver *Task + nrVirtCandidates := 0 + var profReceiver *Task + nrProfCandidates := 0 + tgUserTime := tg.exitedCPUStats.UserTime + tgSysTime := tg.exitedCPUStats.SysTime + for t := tg.tasks.Front(); t != nil; t = t.Next() { + tsched := t.TaskGoroutineSchedInfo() + tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) + tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) + switch tsched.State { + case TaskGoroutineRunningApp: + // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU + // timers. + nrVirtCandidates++ + if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 { + virtReceiver = t + } + fallthrough + case TaskGoroutineRunningSys: + // Considered by ITIMER_PROF and RLIMIT_CPU timers. + nrProfCandidates++ + if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 { + profReceiver = t + } + } + } + tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) + tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) + + // All of the following are standard (not real-time) signals, which are + // automatically deduplicated, so we ignore the number of expirations. + tg.signalHandlers.mu.Lock() + // It should only be possible for these timers to advance if we found + // at least one running task. + if virtReceiver != nil { + // ITIMER_VIRTUAL + newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) + tg.itimerVirtSetting = newItimerVirtSetting + if exp != 0 { + virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true) + } + } + if profReceiver != nil { + // ITIMER_PROF + newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) + tg.itimerProfSetting = newItimerProfSetting + if exp != 0 { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true) + } + // RLIMIT_CPU soft limit + newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) + tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting + if exp != 0 { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true) + } + // RLIMIT_CPU hard limit + rlimitCPUMax := tg.limits.Get(limits.CPU).Max + if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) + } + } + tg.signalHandlers.mu.Unlock() + + ticker.k.tasks.mu.RUnlock() + } + + // Retain tgs between calls to Notify to reduce allocations. + for i := range tgs { + tgs[i] = nil + } + ticker.tgs = tgs[:0] + + // If nothing is running, we can disable the timer. + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks == 0 { + ticker.k.runningTasksMu.Lock() + defer ticker.k.runningTasksMu.Unlock() + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks != 0 { + // Raced with a 0 -> 1 transition. + return setting, false + } + + // Stop the timer. We must cache the current setting so the + // kernel can access it without violating the lock order. + ticker.k.cpuClockTickerSetting = setting + ticker.k.cpuClockTickerDisabled = true + setting.Enabled = false + return setting, true + } + + return setting, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (ticker *kernelCPUClockTicker) Destroy() { +} + +// randInt31n returns a random integer in [0, n). +// +// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. +// See that function for details. 
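+//
+// In short: the high 32 bits of the 64-bit product v*n land in [0, n); draws
+// whose low 32 bits fall below 2^32 mod n are resampled to remove the bias,
+// and the initial low < n test merely avoids computing that modulus in the
+// common case.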
+func randInt31n(rng *rand.Rand, n int32) int32 { + v := rng.Uint32() + prod := uint64(v) * uint64(n) + low := uint32(prod) + if low < uint32(n) { + thresh := uint32(-n) % uint32(n) + for low < thresh { + v = rng.Uint32() + prod = uint64(v) * uint64(n) + low = uint32(prod) + } + } + return int32(prod >> 32) +} + +// NotifyRlimitCPUUpdated is called by setrlimit. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) NotifyRlimitCPUUpdated() { + t.k.cpuClockTicker.Atomically(func() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + rlimitCPU := t.tg.limits.Get(limits.CPU) + t.tg.rlimitCPUSoftSetting = ktime.Setting{ + Enabled: rlimitCPU.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + } + if rlimitCPU.Max != limits.Infinity { + // Check if tg is already over the hard limit. + tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) + tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) + if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { + t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) + } + } + t.tg.updateCPUTimersEnabledLocked() + }) +} + +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { + rlimitCPU := tg.limits.Get(limits.CPU) + if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { + atomic.StoreUint32(&tg.cpuTimersEnabled, 1) + } else { + atomic.StoreUint32(&tg.cpuTimersEnabled, 0) + } +} + +// StateStatus returns a string representation of the task's current state, +// appropriate for /proc/[pid]/status. +func (t *Task) StateStatus() string { + switch s := t.TaskGoroutineSchedInfo().State; s { + case TaskGoroutineNonexistent: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.exitState { + case TaskExitZombie: + return "Z (zombie)" + case TaskExitDead: + return "X (dead)" + default: + // The task goroutine can't exit before passing through + // runExitNotify, so this indicates that the task has been created, + // but the task goroutine hasn't yet started. The Linux equivalent + // is struct task_struct::state == TASK_NEW + // (kernel/fork.c:copy_process() => + // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is + // masked out by TASK_REPORT for /proc/[pid]/status, leaving only + // TASK_RUNNING. + return "R (running)" + } + case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + return "R (running)" + case TaskGoroutineBlockedInterruptible: + return "S (sleeping)" + case TaskGoroutineStopped: + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + switch t.stop.(type) { + case *groupStop: + return "T (stopped)" + case *ptraceStop: + return "t (tracing stop)" + } + fallthrough + case TaskGoroutineBlockedUninterruptible: + // This is the name Linux uses for TASK_UNINTERRUPTIBLE and + // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): + // fs/proc/array.c:task_state_array. + return "D (disk sleep)" + default: + panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) + } +} + +// CPUMask returns a copy of t's allowed CPU mask. +func (t *Task) CPUMask() sched.CPUSet { + t.mu.Lock() + defer t.mu.Unlock() + return t.allowedCPUMask.Copy() +} + +// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of +// mask. 
+// +// Preconditions: mask.Size() == +// sched.CPUSetSize(t.Kernel().ApplicationCores()). +func (t *Task) SetCPUMask(mask sched.CPUSet) error { + if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { + panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) + } + + // Remove CPUs in mask above Kernel.applicationCores. + mask.ClearAbove(t.k.applicationCores) + + // Ensure that at least 1 CPU is still allowed. + if mask.NumCPUs() == 0 { + return syserror.EINVAL + } + + if t.k.useHostCores { + // No-op; pretend the mask was immediately changed back. + return nil + } + + t.tg.pidns.owner.mu.RLock() + rootTID := t.tg.pidns.owner.Root.tids[t] + t.tg.pidns.owner.mu.RUnlock() + + t.mu.Lock() + defer t.mu.Unlock() + t.allowedCPUMask = mask + atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) + return nil +} + +// CPU returns the cpu id for a given task. +func (t *Task) CPU() int32 { + if t.k.useHostCores { + return int32(hostcpu.GetCPU()) + } + + return atomic.LoadInt32(&t.cpu) +} + +// assignCPU returns the virtualized CPU number for the task with global TID +// tid and allowedCPUMask allowed. +func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { + // To pretend that threads are evenly distributed to allowed CPUs, choose n + // to be less than the number of CPUs in allowed ... + n := int(tid) % int(allowed.NumCPUs()) + // ... then pick the nth CPU in allowed. + allowed.ForEachCPU(func(c uint) { + if n--; n == 0 { + cpu = int32(c) + } + }) + return cpu +} + +// Niceness returns t's niceness. +func (t *Task) Niceness() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness +} + +// Priority returns t's priority. +func (t *Task) Priority() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness + 20 +} + +// SetNiceness sets t's niceness to n. +func (t *Task) SetNiceness(n int) { + t.mu.Lock() + defer t.mu.Unlock() + t.niceness = n +} + +// NumaPolicy returns t's current numa policy. +func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) { + t.mu.Lock() + defer t.mu.Unlock() + return t.numaPolicy, t.numaNodeMask +} + +// SetNumaPolicy sets t's numa policy. +func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) { + t.mu.Lock() + defer t.mu.Unlock() + t.numaPolicy = policy + t.numaNodeMask = nodeMask +} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go new file mode 100644 index 000000000..79766cafe --- /dev/null +++ b/pkg/sentry/kernel/task_signals.go @@ -0,0 +1,1139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file defines the behavior of task signal handling. 
+ +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalAction is an internal signal action. +type SignalAction int + +// Available signal actions. +// Note that although we refer the complete set internally, +// the application is only capable of using the Default and +// Ignore actions from the system call interface. +const ( + SignalActionTerm SignalAction = iota + SignalActionCore + SignalActionStop + SignalActionIgnore + SignalActionHandler +) + +// Default signal handler actions. Note that for most signals, +// (except SIGKILL and SIGSTOP) these can be overridden by the app. +var defaultActions = map[linux.Signal]SignalAction{ + // POSIX.1-1990 standard. + linux.SIGHUP: SignalActionTerm, + linux.SIGINT: SignalActionTerm, + linux.SIGQUIT: SignalActionCore, + linux.SIGILL: SignalActionCore, + linux.SIGABRT: SignalActionCore, + linux.SIGFPE: SignalActionCore, + linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects + linux.SIGSEGV: SignalActionCore, + linux.SIGPIPE: SignalActionTerm, + linux.SIGALRM: SignalActionTerm, + linux.SIGTERM: SignalActionTerm, + linux.SIGUSR1: SignalActionTerm, + linux.SIGUSR2: SignalActionTerm, + linux.SIGCHLD: SignalActionIgnore, + linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects + linux.SIGSTOP: SignalActionStop, + linux.SIGTSTP: SignalActionStop, + linux.SIGTTIN: SignalActionStop, + linux.SIGTTOU: SignalActionStop, + // POSIX.1-2001 standard. + linux.SIGBUS: SignalActionCore, + linux.SIGPROF: SignalActionTerm, + linux.SIGSYS: SignalActionCore, + linux.SIGTRAP: SignalActionCore, + linux.SIGURG: SignalActionIgnore, + linux.SIGVTALRM: SignalActionTerm, + linux.SIGXCPU: SignalActionCore, + linux.SIGXFSZ: SignalActionCore, + // The rest on linux. + linux.SIGSTKFLT: SignalActionTerm, + linux.SIGIO: SignalActionTerm, + linux.SIGPWR: SignalActionTerm, + linux.SIGWINCH: SignalActionIgnore, +} + +// computeAction figures out what to do given a signal number +// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and SIGKILL always results in a SignalActionTerm. +// Signal 0 is always ignored as many programs use it for various internal functions +// and don't expect it to do anything. +// +// In the event the signal is not one of these, act.Handler determines what +// happens next. +// If act.Handler is: +// 0, the default action is taken; +// 1, the signal is ignored; +// anything else, the function returns SignalActionHandler. +func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { + switch sig { + case linux.SIGSTOP: + return SignalActionStop + case linux.SIGKILL: + return SignalActionTerm + case linux.Signal(0): + return SignalActionIgnore + } + + switch act.Handler { + case arch.SignalActDefault: + return defaultActions[sig] + case arch.SignalActIgnore: + return SignalActionIgnore + default: + return SignalActionHandler + } +} + +// UnblockableSignals contains the set of signals which cannot be blocked. +var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) + +// StopSignals is the set of signals whose default action is SignalActionStop. 
+var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) + +// dequeueSignalLocked returns a pending signal that is *not* included in mask. +// If there are no pending unmasked signals, dequeueSignalLocked returns nil. +// +// Preconditions: t.tg.signalHandlers.mu must be locked. +func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo { + if info := t.pendingSignals.dequeue(mask); info != nil { + return info + } + return t.tg.pendingSignals.dequeue(mask) +} + +// discardSpecificLocked removes all instances of the given signal from all +// signal queues in tg. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { + tg.pendingSignals.discardSpecific(sig) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.pendingSignals.discardSpecific(sig) + } +} + +// PendingSignals returns the set of pending signals. +func (t *Task) PendingSignals() linux.SignalSet { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet +} + +// deliverSignal delivers the given signal and returns the following run state. +func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { + sigact := computeAction(linux.Signal(info.Signo), act) + + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + // Signals that are ignored, cause a thread group stop, or + // terminate the thread group do not interact with interrupted + // syscalls; in Linux terms, they are never returned to the signal + // handling path from get_signal => get_signal_to_deliver. The + // behavior of an interrupted syscall is determined by the first + // signal that is actually handled (by userspace). + if sigact == SignalActionHandler { + switch { + case sre == ERESTARTNOHAND: + fallthrough + case sre == ERESTART_RESTARTBLOCK: + fallthrough + case (sre == ERESTARTSYS && !act.IsRestart()): + t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) + default: + t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().RestartSyscall() + } + } + } + } + + switch sigact { + case SignalActionTerm, SignalActionCore: + // "Default action is to terminate the process." - signal(7) + t.Debugf("Signal %d: terminating thread group", info.Signo) + + // Emit an event channel messages related to this uncaught signal. + ucs := &ucspb.UncaughtSignal{ + Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)), + Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())), + Registers: t.Arch().StateData().Proto(), + SignalNumber: info.Signo, + } + + // Attach an fault address if appropriate. + switch linux.Signal(info.Signo) { + case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: + ucs.FaultAddr = info.Addr() + } + + eventchannel.Emit(ucs) + + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + return (*runExit)(nil) + + case SignalActionStop: + // "Default action is to stop the process." + t.initiateGroupStop(info) + + case SignalActionIgnore: + // "Default action is to ignore the signal." 
+ t.Debugf("Signal %d: ignored", info.Signo) + + case SignalActionHandler: + // Try to deliver the signal to the user-configured handler. + t.Debugf("Signal %d: delivering to handler", info.Signo) + if err := t.deliverSignalToHandler(info, act); err != nil { + // This is not a warning, it can occur during normal operation. + t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err) + + // Send a forced SIGSEGV. If the signal that couldn't be delivered + // was a SIGSEGV, force the handler to SIG_DFL. + t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + } + + default: + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + } + return (*runInterrupt)(nil) +} + +// deliverSignalToHandler changes the task's userspace state to enter the given +// user-configured handler for the given signal. +func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { + // Signal delivery to an application handler interrupts restartable + // sequences. + t.rseqInterrupt() + + // Are executing on the main stack, + // or the provided alternate stack? + sp := usermem.Addr(t.Arch().Stack()) + + // N.B. This is a *copy* of the alternate stack that the user's signal + // handler expects to see in its ucontext (even if it's not in use). + alt := t.signalStack + if act.IsOnStack() && alt.IsEnabled() { + alt.SetOnStack() + if !alt.Contains(sp) { + sp = usermem.Addr(alt.Top()) + } + } + + // Set up the signal handler. If we have a saved signal mask, the signal + // handler should run with the current mask, but sigreturn should restore + // the saved one. + st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + mask := t.signalMask + if t.haveSavedSignalMask { + mask = t.savedSignalMask + } + + // Set up the restorer. + // x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms. + // Please see the linux code as reference: + // linux/arch/x86/kernel/signal.c:__setup_rt_frame() + // If SA_RESTORER is not configured, we can use the sigreturn trampolines + // the vdso provides instead. + // Please see the linux code as reference: + // linux/arch/arm64/kernel/signal.c:setup_return() + if act.Flags&linux.SA_RESTORER == 0 { + act.Restorer = t.MemoryManager().VDSOSigReturn() + } + + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { + return err + } + t.haveSavedSignalMask = false + + // Add our signal mask. + newMask := t.signalMask | act.Mask + if !act.IsNoDefer() { + newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) + } + t.SetSignalMask(newMask) + + return nil +} + +var ctrlResume = &SyscallControl{ignoreReturn: true} + +// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if +// rt is true). +func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { + st := t.Stack() + sigset, alt, err := t.Arch().SignalRestore(st, rt) + if err != nil { + return nil, err + } + + // Attempt to record the given signal stack. Note that we silently + // ignore failures here, as does Linux. Only an EFAULT may be + // generated, but SignalRestore has already deserialized the entire + // frame successfully. + t.SetSignalStack(alt) + + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. + t.SetSignalMask(sigset &^ UnblockableSignals) + + return ctrlResume, nil +} + +// Sigtimedwait implements the semantics of sigtimedwait(2). 
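+//
+// A typical caller converts the user-provided sigset_t and timeout and then
+// maps syserror.EAGAIN to "no signal arrived in time", e.g. (sketch):
+//
+//	info, err := t.Sigtimedwait(set, timeout)
+//	if err == syserror.EAGAIN {
+//		// Timed out (or timeout was 0) with no matching signal pending.
+//	}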
+// +// Preconditions: The caller must be running on the task goroutine. t.exitState +// < TaskExitZombie. +func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { + // set is the set of signals we're interested in; invert it to get the set + // of signals to block. + mask := ^(set &^ UnblockableSignals) + + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + + if timeout == 0 { + return nil, syserror.EAGAIN + } + + // Unblock signals we're waiting for. Remember the original signal mask so + // that Task.sendSignalTimerLocked doesn't discard ignored signals that + // we're temporarily unblocking. + t.realSignalMask = t.signalMask + t.setSignalMaskLocked(t.signalMask & mask) + + // Wait for a timeout or new signal. + t.tg.signalHandlers.mu.Unlock() + _, err := t.BlockWithTimeout(nil, true, timeout) + t.tg.signalHandlers.mu.Lock() + + // Restore the original signal mask. + t.setSignalMaskLocked(t.realSignalMask) + t.realSignalMask = 0 + + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + if err == syserror.ETIMEDOUT { + return nil, syserror.EAGAIN + } + return nil, err +} + +// SendSignal sends the given signal to t. +// +// The following errors may be returned: +// +// syserror.ESRCH - The task has exited. +// syserror.EINVAL - The signal is not valid. +// syserror.EAGAIN - THe signal is realtime, and cannot be queued. +// +func (t *Task) SendSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, false /* group */) +} + +// SendGroupSignal sends the given signal to t's thread group. +func (t *Task) SendGroupSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, true /* group */) +} + +// SendSignal sends the given signal to tg, using tg's leader to determine if +// the signal is blocked. +func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.leader.sendSignalLocked(info, true /* group */) +} + +func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + return t.sendSignalTimerLocked(info, group, nil) +} + +func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error { + if t.exitState == TaskExitDead { + return syserror.ESRCH + } + sig := linux.Signal(info.Signo) + if sig == 0 { + return nil + } + if !sig.IsValid() { + return syserror.EINVAL + } + + // Signal side effects apply even if the signal is ultimately discarded. + t.tg.applySignalSideEffectsLocked(sig) + + // TODO: "Only signals for which the "init" process has established a + // signal handler can be sent to the "init" process by other members of the + // PID namespace. This restriction applies even to privileged processes, + // and prevents other members of the PID namespace from accidentally + // killing the "init" process." - pid_namespaces(7). 
We don't currently do + // this for child namespaces, though we should; we also don't do this for + // the root namespace (the same restriction applies to global init on + // Linux), where whether or not we should is much murkier. In practice, + // most sandboxed applications are not prepared to function as an init + // process. + + // Unmasked, ignored signals are discarded without being queued, unless + // they will be visible to a tracer. Even for group signals, it's the + // originally-targeted task's signal mask and tracer that matter; compare + // Linux's kernel/signal.c:__send_signal() => prepare_signal() => + // sig_ignored(). + ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore + if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() { + t.Debugf("Discarding ignored signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } + return nil + } + + q := &t.pendingSignals + if group { + q = &t.tg.pendingSignals + } + if !q.enqueue(info, timer) { + if sig.IsRealtime() { + return syserror.EAGAIN + } + t.Debugf("Discarding duplicate signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } + return nil + } + + // Find a receiver to notify. Note that the task we choose to notify, if + // any, may not be the task that actually dequeues and handles the signal; + // e.g. a racing signal mask change may cause the notified task to become + // ineligible, or a racing sibling task may dequeue the signal first. + if t.canReceiveSignalLocked(sig) { + t.Debugf("Notified of signal %d", sig) + t.interrupt() + return nil + } + if group { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.Debugf("Notified of group signal %d", sig) + nt.interrupt() + return nil + } + } + t.Debugf("No task notified of signal %d", sig) + return nil +} + +func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { + switch { + case linux.SignalSetOf(sig)&StopSignals != 0: + // Stop signals cause all prior SIGCONT to be discarded. (This is + // despite the fact this has little effect since SIGCONT's most + // important effect is applied when the signal is sent in the branch + // below, not when the signal is delivered.) + tg.discardSpecificLocked(linux.SIGCONT) + case sig == linux.SIGCONT: + // "The SIGCONT signal has a side effect of waking up (all threads of) + // a group-stopped process. This side effect happens before + // signal-delivery-stop. The tracer can't suppress this side effect (it + // can only suppress signal injection, which only causes the SIGCONT + // handler to not be executed in the tracee, if such a handler is + // installed." - ptrace(2) + tg.endGroupStopLocked(true) + case sig == linux.SIGKILL: + // "SIGKILL does not generate signal-delivery-stop and therefore the + // tracer can't suppress it. SIGKILL kills even within system calls + // (syscall-exit-stop is not generated prior to death by SIGKILL)." - + // ptrace(2) + // + // Note that this differs from ThreadGroup.requestExit in that it + // ignores tg.execing. + if !tg.exiting { + tg.exiting = true + tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + } + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.killLocked() + } + } +} + +// canReceiveSignalLocked returns true if t should be interrupted to receive +// the given signal. canReceiveSignalLocked is analogous to Linux's +// kernel/signal.c:wants_signal(), but see below for divergences. 
+// +// Preconditions: The signal mutex must be locked. +func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // Notify that the signal is queued. + t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig))) + + // - Do not choose tasks that are blocking the signal. + if linux.SignalSetOf(sig)&t.signalMask != 0 { + return false + } + // - No need to check Task.exitState, as the exit path sets every bit in the + // signal mask when it transitions from TaskExitNone to TaskExitInitiated. + // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the + // task group via applySignalSideEffects => killLocked. + // - Do not choose stopped tasks, which cannot handle signals. + if t.stop != nil { + return false + } + // - Do not choose tasks that have already been interrupted, as they may be + // busy handling another signal. + if len(t.interruptChan) != 0 { + return false + } + return true +} + +// findSignalReceiverLocked returns a task in tg that should be interrupted to +// receive the given signal. If no such task exists, findSignalReceiverLocked +// returns nil. +// +// Linux actually records curr_target to balance the group signal targets. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.canReceiveSignalLocked(sig) { + return t + } + } + return nil +} + +// forceSignal ensures that the task is not ignoring or blocking the given +// signal. If unconditional is true, forceSignal takes action even if the +// signal isn't being ignored or blocked. +func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.forceSignalLocked(sig, unconditional) +} + +func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { + blocked := linux.SignalSetOf(sig)&t.signalMask != 0 + act := t.tg.signalHandlers.actions[sig] + ignored := act.Handler == arch.SignalActIgnore + if blocked || ignored || unconditional { + act.Handler = arch.SignalActDefault + t.tg.signalHandlers.actions[sig] = act + if blocked { + t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig)) + } + } +} + +// SignalMask returns a copy of t's signal mask. +func (t *Task) SignalMask() linux.SignalSet { + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask))) +} + +// SetSignalMask sets t's signal mask. +// +// Preconditions: SetSignalMask can only be called by the task goroutine. +// t.exitState < TaskExitZombie. +func (t *Task) SetSignalMask(mask linux.SignalSet) { + // By precondition, t prevents t.tg from completing an execve and mutating + // t.tg.signalHandlers, so we can skip the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + t.setSignalMaskLocked(mask) + t.tg.signalHandlers.mu.Unlock() +} + +// Preconditions: The signal mutex must be locked. +func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { + oldMask := t.signalMask + atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask)) + + // If the new mask blocks any signals that were not blocked by the old + // mask, and at least one such signal is pending in tg.pendingSignals, and + // t has been woken, it could be the case that t was woken to handle that + // signal, but will no longer do so as a result of its new signal mask, so + // we have to pick a replacement. 
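+ // (&^ is Go's bit-clear operator, so "blocked" below is exactly the set of
+ // signals newly blocked by this change: e.g. going from {SIGUSR1} to
+ // {SIGUSR1, SIGCHLD} gives blocked = {SIGCHLD}.)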
+ blocked := mask &^ oldMask + blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet + if blockedGroupPending != 0 && t.interrupted() { + linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.interrupt() + return + } + }) + // We have to re-issue the interrupt consumed by t.interrupted() since + // it might have been for a different reason. + t.interruptSelf() + } + + // Conversely, if the new mask unblocks any signals that were blocked by + // the old mask, and at least one such signal is pending, we may now need + // to handle that signal. + unblocked := oldMask &^ mask + unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) + if unblockedPending != 0 { + t.interruptSelf() + } +} + +// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's +// comment). +// +// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { + t.savedSignalMask = mask + t.haveSavedSignalMask = true +} + +// SignalStack returns the task-private signal stack. +func (t *Task) SignalStack() arch.SignalStack { + alt := t.signalStack + if t.onSignalStack(alt) { + alt.Flags |= arch.SignalStackFlagOnStack + } + return alt +} + +// onSignalStack returns true if the task is executing on the given signal stack. +func (t *Task) onSignalStack(alt arch.SignalStack) bool { + sp := usermem.Addr(t.Arch().Stack()) + return alt.Contains(sp) +} + +// SetSignalStack sets the task-private signal stack. +// +// This value may not be changed if the task is currently executing on the +// signal stack, i.e. if t.onSignalStack returns true. In this case, this +// function will return false. Otherwise, true is returned. +func (t *Task) SetSignalStack(alt arch.SignalStack) bool { + // Check that we're not executing on the stack. + if t.onSignalStack(t.signalStack) { + return false + } + + if alt.Flags&arch.SignalStackFlagDisable != 0 { + // Don't record anything beyond the flags. + t.signalStack = arch.SignalStack{ + Flags: arch.SignalStackFlagDisable, + } + } else { + // Mask out irrelevant parts: only disable matters. + alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + } + return true +} + +// SetSignalAct atomically sets the thread group's signal action for signal sig +// to *actptr (if actptr is not nil) and returns the old signal action. +func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { + if !sig.IsValid() { + return arch.SignalAct{}, syserror.EINVAL + } + + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + sh := tg.signalHandlers + sh.mu.Lock() + defer sh.mu.Unlock() + oldact := sh.actions[sig] + if actptr != nil { + if sig == linux.SIGKILL || sig == linux.SIGSTOP { + return oldact, syserror.EINVAL + } + + act := *actptr + act.Mask &^= UnblockableSignals + sh.actions[sig] = act + // From POSIX, by way of Linux: + // + // "Setting a signal action to SIG_IGN for a signal that is pending + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + // + // "Setting a signal action to SIG_DFL for a signal that is pending and + // whose default action is to ignore the signal (for example, SIGCHLD), + // shall cause the pending signal to be discarded, whether or not it is + // blocked." 
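+ // Both rules reduce to the check below: if the new action computes to
+ // SignalActionIgnore, pending instances of sig are dropped from the thread
+ // group's queue and from every member task's queue.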
+ if computeAction(sig, act) == SignalActionIgnore { + tg.discardSpecificLocked(sig) + } + } + return oldact, nil +} + +// CopyOutSignalAct converts the given SignalAct into an architecture-specific +// type and then copies it out to task memory. +func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { + n := t.Arch().NewSignalAct() + n.SerializeFrom(s) + _, err := n.CopyOut(t, addr) + return err +} + +// CopyInSignalAct copies an architecture-specific sigaction type from task +// memory and then converts it into a SignalAct. +func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { + n := t.Arch().NewSignalAct() + var s arch.SignalAct + if _, err := n.CopyIn(t, addr); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// CopyOutSignalStack converts the given SignalStack into an +// architecture-specific type and then copies it out to task memory. +func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { + n := t.Arch().NewSignalStack() + n.SerializeFrom(s) + _, err := n.CopyOut(t, addr) + return err +} + +// CopyInSignalStack copies an architecture-specific stack_t from task memory +// and then converts it into a SignalStack. +func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { + n := t.Arch().NewSignalStack() + var s arch.SignalStack + if _, err := n.CopyIn(t, addr); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// groupStop is a TaskStop placed on tasks that have received a stop signal +// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from +// the ptrace man page.) +// +// +stateify savable +type groupStop struct{} + +// Killable implements TaskStop.Killable. +func (*groupStop) Killable() bool { return true } + +// initiateGroupStop attempts to initiate a group stop based on a +// previously-dequeued stop signal. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.groupStopPending { + t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) + return + } + if !t.tg.groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) + return + } + if t.tg.exiting { + t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) + return + } + if t.tg.execing != nil { + t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) + return + } + if !t.tg.groupStopComplete { + t.tg.groupStopSignal = linux.Signal(info.Signo) + } + t.tg.groupStopPendingCount = 0 + for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { + if t2.killedLocked() || t2.exitState >= TaskExitInitiated { + t2.groupStopPending = false + continue + } + t2.groupStopPending = true + t2.groupStopAcknowledged = false + if t2.ptraceSeized { + t2.trapNotifyPending = true + if s, ok := t2.stop.(*ptraceStop); ok && s.listen { + t2.endInternalStopLocked() + } + } + t2.interrupt() + t.tg.groupStopPendingCount++ + } + t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) +} + +// endGroupStopLocked ensures that all prior stop signals received by tg are +// not stopping tg and will not stop tg in the future. 
If broadcast is true, +// parent and tracer notification will be scheduled if appropriate. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { + // Discard all previously-queued stop signals. + linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) + + if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { + return + } + + completeStr := "incomplete" + if tg.groupStopComplete { + completeStr = "complete" + } + tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.groupStopPending = false + if t.ptraceSeized { + t.trapNotifyPending = true + if s, ok := t.stop.(*ptraceStop); ok && s.listen { + t.endInternalStopLocked() + } + } else { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() + } + } + } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so that + // one of the continuing tasks does so. (Linux does something similar.) + // The reason we do this is to keep locking sane. In order to send a + // signal to the parent, we need to lock its signal mutex, but we're + // already holding tg's signal mutex, and the TaskSet mutex must be + // locked for writing for us to hold two signal mutexes. Since we don't + // want to require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by releasing + // the mutexes we're already holding, just let the continuing thread + // group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = !tg.groupStopComplete + tg.groupContWaitable = true + } + // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop + // to recognize that the group stop has been cancelled. + tg.groupStopDequeued = false + tg.groupStopSignal = 0 + tg.groupStopPendingCount = 0 + tg.groupStopComplete = false + tg.groupStopWaitable = false +} + +// participateGroupStopLocked is called to handle thread group side effects +// after t unsets t.groupStopPending. The caller must handle task side effects +// (e.g. placing the task goroutine into the group stop). It returns true if +// the caller must notify t.tg.leader's parent of a completed group stop (which +// participateGroupStopLocked cannot do due to holding the wrong locks). +// +// Preconditions: The signal mutex must be locked. +func (t *Task) participateGroupStopLocked() bool { + if t.groupStopAcknowledged { + return false + } + t.groupStopAcknowledged = true + t.tg.groupStopPendingCount-- + if t.tg.groupStopPendingCount != 0 { + return false + } + if t.tg.groupStopComplete { + return false + } + t.Debugf("Completing group stop") + t.tg.groupStopComplete = true + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + return true +} + +// signalStop sends a signal to t's thread group of a new group stop, group +// continue, or ptrace stop, if appropriate. code and status are set in the +// signal sent to tg, if any. +// +// Preconditions: The TaskSet mutex must be locked (for reading or writing). 
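+//
+// code is a SIGCHLD si_code such as arch.CLD_STOPPED or arch.CLD_CONTINUED,
+// and status carries the number of the stop or continue signal being
+// reported.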
+func (t *Task) signalStop(target *Task, code int32, status int32) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] + if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { + sigchld := &arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + Code: code, + } + sigchld.SetPid(int32(t.tg.pidns.tids[target])) + sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetStatus(status) + // TODO(b/72102453): Set utime, stime. + t.sendSignalLocked(sigchld, true /* group */) + } +} + +// The runInterrupt state handles conditions indicated by interrupts. +// +// +stateify savable +type runInterrupt struct{} + +func (*runInterrupt) execute(t *Task) taskRunState { + // Interrupts are de-duplicated (if t is interrupted twice before + // t.interrupted() is called, t.interrupted() will only return true once), + // so early exits from this function must re-enter the runInterrupt state + // to check for more interrupt-signaled conditions. + + t.tg.signalHandlers.mu.Lock() + + // Did we just leave a group stop? + if t.tg.groupContNotify { + t.tg.groupContNotify = false + sig := t.tg.groupStopSignal + intr := t.tg.groupContInterrupted + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.RLock() + // For consistency with Linux, if the parent and (thread group + // leader's) tracer are in the same thread group, deduplicate + // notifications. + notifyParent := t.tg.leader.parent != nil + if tracer := t.tg.leader.Tracer(); tracer != nil { + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + } + // Sending CLD_STOPPED to the tracer doesn't really make any sense; + // the thread group leader may have already entered the stop and + // notified its tracer accordingly. But it's consistent with + // Linux... + if intr { + tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + if !notifyParent { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) + } else { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) + } + } else { + tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.tg.eventQueue.Notify(EventGroupContinue) + } + } + if notifyParent { + // If groupContInterrupted, do as Linux does and pretend the group + // stop completed just before it ended. The theoretical behavior in + // this case would be to send a SIGCHLD indicating the completed + // stop, followed by a SIGCHLD indicating the continue. However, + // SIGCHLD is a standard signal, so the latter would always be + // dropped. Hence sending only the former is equivalent. + if intr { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) + } else { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) + } + } + t.tg.pidns.owner.mu.RUnlock() + return (*runInterrupt)(nil) + } + + // Do we need to enter a group stop or related ptrace stop? This path is + // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() + // (with ptrace enabled) and do_jobctl_trap(). 
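+	// Here:
+	//
+	// - groupStopPending indicates that a dequeued stop signal requires this
+	// task to participate in a group stop (see initiateGroupStop above).
+	//
+	// - trapStopPending indicates that a ptrace-requested stop (e.g. from
+	// PTRACE_INTERRUPT) is pending for this task.
+	//
+	// - trapNotifyPending indicates that a PTRACE_SEIZE'd tracee must
+	// re-trap to notify its tracer of a change in group stop state.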
+ if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { + sig := t.tg.groupStopSignal + notifyParent := false + if t.groupStopPending { + t.groupStopPending = false + // We care about t.tg.groupStopSignal (for tracer notification) + // even if this doesn't complete a group stop, so keep the + // value of sig we've already read. + notifyParent = t.participateGroupStopLocked() + } + t.trapStopPending = false + t.trapNotifyPending = false + // Drop the signal mutex so we can take the TaskSet mutex. + t.tg.signalHandlers.mu.Unlock() + + t.tg.pidns.owner.mu.RLock() + if t.tg.leader.parent == nil { + notifyParent = false + } + if tracer := t.Tracer(); tracer != nil { + if t.ptraceSeized { + if sig == 0 { + sig = linux.SIGTRAP + } + // "If tracee was attached using PTRACE_SEIZE, group-stop is + // indicated by PTRACE_EVENT_STOP: status>>16 == + // PTRACE_EVENT_STOP. This allows detection of group-stops + // without requiring an extra PTRACE_GETSIGINFO call." - + // "Group-stop", ptrace(2) + t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(sig), + Code: t.ptraceCode, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } else { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + } + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + // For consistency with Linux, if the parent and tracer are in the + // same thread group, deduplicate notification signals. + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) + } else { + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + } + } else { + t.tg.signalHandlers.mu.Lock() + if !t.killedLocked() { + t.beginInternalStopLocked((*groupStop)(nil)) + } + t.tg.signalHandlers.mu.Unlock() + } + if notifyParent { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + t.tg.pidns.owner.mu.RUnlock() + + return (*runInterrupt)(nil) + } + + // Are there signals pending? + if info := t.dequeueSignalLocked(t.signalMask); info != nil { + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { + // Indicate that we've dequeued a stop signal before unlocking the + // signal mutex; initiateGroupStop will check for races with + // endGroupStopLocked after relocking it. + t.tg.groupStopDequeued = true + } + if t.ptraceSignalLocked(info) { + // Dequeueing the signal action must wait until after the + // signal-delivery-stop ends since the tracer can change or + // suppress the signal. + t.tg.signalHandlers.mu.Unlock() + return (*runInterruptAfterSignalDeliveryStop)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) + } + + t.tg.signalHandlers.mu.Unlock() + return (*runApp)(nil) +} + +// +stateify savable +type runInterruptAfterSignalDeliveryStop struct{} + +func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + // Can't defer unlock: deliverSignal must be called without holding TaskSet + // mutex. 
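+	// The tracer may have changed or suppressed the signal while the tracee
+	// was in signal-delivery-stop (e.g. via the signal argument to a ptrace
+	// restart command), so the signal to deliver is re-read from
+	// t.ptraceCode.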
+ sig := linux.Signal(t.ptraceCode) + defer func() { + t.ptraceSiginfo = nil + }() + if !sig.IsValid() { + t.tg.pidns.owner.mu.Unlock() + return (*runInterrupt)(nil) + } + info := t.ptraceSiginfo + if sig != linux.Signal(info.Signo) { + info.Signo = int32(sig) + info.Errno = 0 + info.Code = arch.SignalInfoUser + // pid isn't a valid field for all signal numbers, but Linux + // doesn't care (kernel/signal.c:ptrace_signal()). + // + // Linux uses t->parent for the tid and uid here, which is the tracer + // if it hasn't detached or the real parent otherwise. + parent := t.parent + if tracer := t.Tracer(); tracer != nil { + parent = tracer + } + if parent == nil { + // Tracer has detached and t was created by Kernel.CreateProcess(). + // Pretend the parent is in an ancestor PID + user namespace. + info.SetPid(0) + info.SetUid(int32(auth.OverflowUID)) + } else { + info.SetPid(int32(t.tg.pidns.tids[parent])) + info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } + } + t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.Unlock() + // If the signal is masked, re-queue it. + if linux.SignalSetOf(sig)&t.signalMask != 0 { + t.sendSignalLocked(info, false /* group */) + t.tg.signalHandlers.mu.Unlock() + return (*runInterrupt)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) +} + +// SignalRegister registers a waiter for pending signals. +func (t *Task) SignalRegister(e *waiter.Entry, mask waiter.EventMask) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventRegister(e, mask) + t.tg.signalHandlers.mu.Unlock() +} + +// SignalUnregister unregisters a waiter for pending signals. +func (t *Task) SignalUnregister(e *waiter.Entry) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventUnregister(e) + t.tg.signalHandlers.mu.Unlock() +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go new file mode 100644 index 000000000..8485fb4b6 --- /dev/null +++ b/pkg/sentry/kernel/task_start.go @@ -0,0 +1,319 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// TaskConfig defines the configuration of a new Task (see below). +type TaskConfig struct { + // Kernel is the owning Kernel. + Kernel *Kernel + + // Parent is the new task's parent. Parent may be nil. + Parent *Task + + // If InheritParent is not nil, use InheritParent's parent as the new + // task's parent. + InheritParent *Task + + // ThreadGroup is the ThreadGroup the new task belongs to. 
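+	// If ThreadGroup does not yet have a leader (i.e. it is newly created),
+	// the new task becomes its leader.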
+ ThreadGroup *ThreadGroup + + // SignalMask is the new task's initial signal mask. + SignalMask linux.SignalSet + + // TaskContext is the TaskContext of the new task. Ownership of the + // TaskContext is transferred to TaskSet.NewTask, whether or not it + // succeeds. + TaskContext *TaskContext + + // FSContext is the FSContext of the new task. A reference must be held on + // FSContext, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FSContext *FSContext + + // FDTable is the FDTableof the new task. A reference must be held on + // FDMap, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FDTable *FDTable + + // Credentials is the Credentials of the new task. + Credentials *auth.Credentials + + // Niceness is the niceness of the new task. + Niceness int + + // NetworkNamespace is the network namespace to be used for the new task. + NetworkNamespace *inet.Namespace + + // AllowedCPUMask contains the cpus that this task can run on. + AllowedCPUMask sched.CPUSet + + // UTSNamespace is the UTSNamespace of the new task. + UTSNamespace *UTSNamespace + + // IPCNamespace is the IPCNamespace of the new task. + IPCNamespace *IPCNamespace + + // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. + AbstractSocketNamespace *AbstractSocketNamespace + + // MountNamespaceVFS2 is the MountNamespace of the new task. + MountNamespaceVFS2 *vfs.MountNamespace + + // RSeqAddr is a pointer to the the userspace linux.RSeq structure. + RSeqAddr usermem.Addr + + // RSeqSignature is the signature that the rseq abort IP must be signed + // with. + RSeqSignature uint32 + + // ContainerID is the container the new task belongs to. + ContainerID string +} + +// NewTask creates a new task defined by cfg. +// +// NewTask does not start the returned task; the caller must call Task.Start. +func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { + t, err := ts.newTask(cfg) + if err != nil { + cfg.TaskContext.release() + cfg.FSContext.DecRef() + cfg.FDTable.DecRef() + if cfg.MountNamespaceVFS2 != nil { + cfg.MountNamespaceVFS2.DecRef() + } + return nil, err + } + return t, nil +} + +// newTask is a helper for TaskSet.NewTask that only takes ownership of parts +// of cfg if it succeeds. +func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { + tg := cfg.ThreadGroup + tc := cfg.TaskContext + t := &Task{ + taskNode: taskNode{ + tg: tg, + parent: cfg.Parent, + children: make(map[*Task]struct{}), + }, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespace, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + mountNamespaceVFS2: cfg.MountNamespaceVFS2, + rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, + } + t.creds.Store(cfg.Credentials) + t.endStopCond.L = &t.tg.signalHandlers.mu + t.ptraceTracer.Store((*Task)(nil)) + // We don't construct t.blockingTimer until Task.run(); see that function + // for justification. 
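+
+	// t is not yet visible to any other goroutine, so none of the
+	// initialization above requires locking.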
+ + // Make the new task (and possibly thread group) visible to the rest of + // the system atomically. + ts.mu.Lock() + defer ts.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting || tg.execing != nil { + // If the caller is in the same thread group, then what we return + // doesn't matter too much since the caller will exit before it returns + // to userspace. If the caller isn't in the same thread group, then + // we're in uncharted territory and can return whatever we want. + return nil, syserror.EINTR + } + if err := ts.assignTIDsLocked(t); err != nil { + return nil, err + } + // Below this point, newTask is expected not to fail (there is no rollback + // of assignTIDsLocked or any of the following). + + // Logging on t's behalf will panic if t.logPrefix hasn't been + // initialized. This is the earliest point at which we can do so + // (since t now has thread IDs). + t.updateInfoLocked() + + if cfg.InheritParent != nil { + t.parent = cfg.InheritParent.parent + } + if t.parent != nil { + t.parent.children[t] = struct{}{} + } + + if tg.leader == nil { + // New thread group. + tg.leader = t + if parentPG := tg.parentPG(); parentPG == nil { + tg.createSession() + } else { + // Inherit the process group and terminal. + parentPG.incRefWithParent(parentPG) + tg.processGroup = parentPG + tg.tty = t.parent.tg.tty + } + } + tg.tasks.PushBack(t) + tg.tasksCount++ + tg.liveTasks++ + tg.activeTasks++ + + // Propagate external TaskSet stops to the new task. + t.stopCount = ts.stopCount + + t.mu.Lock() + defer t.mu.Unlock() + + t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) + + t.startTime = t.k.RealtimeClock().Now() + + return t, nil +} + +// assignTIDsLocked ensures that new task t is visible in all PID namespaces in +// which it should be visible. +// +// Preconditions: ts.mu must be locked for writing. +func (ts *TaskSet) assignTIDsLocked(t *Task) error { + type allocatedTID struct { + ns *PIDNamespace + tid ThreadID + } + var allocatedTIDs []allocatedTID + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid, err := ns.allocateTID() + if err != nil { + // Failure. Remove the tids we already allocated in descendant + // namespaces. + for _, a := range allocatedTIDs { + delete(a.ns.tasks, a.tid) + delete(a.ns.tids, t) + if t.tg.leader == nil { + delete(a.ns.tgids, t.tg) + } + } + return err + } + ns.tasks[tid] = t + ns.tids[t] = tid + if t.tg.leader == nil { + // New thread group. + ns.tgids[t.tg] = tid + } + allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) + } + return nil +} + +// allocateTID returns an unused ThreadID from ns. +// +// Preconditions: ns.owner.mu must be locked for writing. +func (ns *PIDNamespace) allocateTID() (ThreadID, error) { + if ns.exiting { + // "In this case, a subsequent fork(2) into this PID namespace will + // fail with the error ENOMEM; it is not possible to create a new + // processes [sic] in a PID namespace whose init process has + // terminated." - pid_namespaces(7) + return 0, syserror.ENOMEM + } + tid := ns.last + for { + // Next. + tid++ + if tid > TasksLimit { + tid = InitTID + 1 + } + + // Is it available? + tidInUse := func() bool { + if _, ok := ns.tasks[tid]; ok { + return true + } + if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { + return true + } + if _, ok := ns.sessions[SessionID(tid)]; ok { + return true + } + return false + }() + + if !tidInUse { + ns.last = tid + return tid, nil + } + + // Did we do a full cycle? + if tid == ns.last { + // No tid available. 
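+			// Linux's fork(2) likewise fails with EAGAIN when no PID
+			// can be allocated.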
+ return 0, syserror.EAGAIN + } + } +} + +// Start starts the task goroutine. Start must be called exactly once for each +// task returned by NewTask. +// +// 'tid' must be the task's TID in the root PID namespace and it's used for +// debugging purposes only (set as parameter to Task.run to make it visible +// in stack dumps). +func (t *Task) Start(tid ThreadID) { + // If the task was restored, it may be "starting" after having already exited. + if t.runState == nil { + return + } + t.goroutineStopped.Add(1) + t.tg.liveGoroutines.Add(1) + t.tg.pidns.owner.liveGoroutines.Add(1) + t.tg.pidns.owner.runningGoroutines.Add(1) + + // Task is now running in system mode. + t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) + + // Use the task's TID in the root PID namespace to make it visible in stack dumps. + go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops +} diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go new file mode 100644 index 000000000..10c6e455c --- /dev/null +++ b/pkg/sentry/kernel/task_stop.go @@ -0,0 +1,226 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements task stops, which represent the equivalent of Linux's +// uninterruptible sleep states in a way that is compatible with save/restore. +// Task stops comprise both internal stops (which form part of the task's +// "normal" control flow) and external stops (which do not); see README.md for +// details. +// +// There are multiple interfaces for interacting with stops because there are +// multiple cases to consider: +// +// - A task goroutine can begin a stop on its associated task (e.g. a +// vfork() syscall stopping the calling task until the child task releases its +// MM). In this case, calling Task.interrupt is both unnecessary (the task +// goroutine obviously cannot be blocked in Task.block or executing application +// code) and undesirable (as it may spuriously interrupt a in-progress +// syscall). +// +// Beginning internal stops in this case is implemented by +// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing, +// there are no instances of this case that begin external stops, except for +// autosave; however, autosave terminates the sentry without ending the +// external stop, so the spurious interrupt is moot. +// +// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all +// tasks being stopped in preparation for state checkpointing). If the task +// goroutine may be in Task.block or executing application code, it must be +// interrupted by Task.interrupt for it to actually enter the stop; since, +// strictly speaking, we have no way of determining this, we call +// Task.interrupt unconditionally. +// +// Beginning external stops in this case is implemented by +// Task.BeginExternalStop. As of this writing, there are no instances of this +// case that begin internal stops. 
+// +// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an +// exiting task resuming a sibling task that has been blocked in an execve() +// syscall waiting for other tasks to exit). In this case, Task.endStopCond +// must be notified to kick the task goroutine out of Task.doStop. +// +// Ending internal stops in this case is implemented by +// Task.endInternalStopLocked. Ending external stops in this case is +// implemented by Task.EndExternalStop. +// +// - Hypothetically, a task goroutine can end an internal stop on its +// associated task. As of this writing, there are no instances of this case. +// However, any instances of this case could still use the above functions, +// since notifying Task.endStopCond would be unnecessary but harmless. + +import ( + "fmt" + "sync/atomic" +) + +// A TaskStop is a condition visible to the task control flow graph that +// prevents a task goroutine from running or exiting, i.e. an internal stop. +// +// NOTE(b/30793614): Most TaskStops don't contain any data; they're +// distinguished by their type. The obvious way to implement such a TaskStop +// is: +// +// type groupStop struct{} +// func (groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop(groupStop{}) +// +// However, this doesn't work because the state package can't serialize values, +// only pointers. Furthermore, the correctness of save/restore depends on the +// ability to pass a TaskStop to endInternalStop that will compare equal to the +// TaskStop that was passed to beginInternalStop, even if a save/restore cycle +// occurred between the two. As a result, the current idiom is to always use a +// typecast nil for data-free TaskStops: +// +// type groupStop struct{} +// func (*groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop((*groupStop)(nil)) +// +// This is pretty gross, but the alternatives seem grosser. +type TaskStop interface { + // Killable returns true if Task.Kill should end the stop prematurely. + // Killable is analogous to Linux's TASK_WAKEKILL. + Killable() bool +} + +// beginInternalStop indicates the start of an internal stop that applies to t. +// +// Preconditions: The task must not already be in an internal stop (i.e. t.stop +// == nil). The caller must be running on the task goroutine. +func (t *Task) beginInternalStop(s TaskStop) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginInternalStopLocked(s) +} + +// Preconditions: The signal mutex must be locked. All preconditions for +// Task.beginInternalStop also apply. +func (t *Task) beginInternalStopLocked(s TaskStop) { + if t.stop != nil { + panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) + } + t.Debugf("Entering internal stop %#v", s) + t.stop = s + t.beginStopLocked() +} + +// endInternalStopLocked indicates the end of an internal stop that applies to +// t. endInternalStopLocked does not wait for the task to resume. +// +// The caller is responsible for ensuring that the internal stop they expect +// actually applies to t; this requires holding the signal mutex which protects +// t.stop, which is why there is no endInternalStop that locks the signal mutex +// for you. +// +// Preconditions: The signal mutex must be locked. The task must be in an +// internal stop (i.e. t.stop != nil). 
+func (t *Task) endInternalStopLocked() { + if t.stop == nil { + panic("Attempting to leave non-existent internal stop") + } + t.Debugf("Leaving internal stop %#v", t.stop) + t.stop = nil + t.endStopLocked() +} + +// BeginExternalStop indicates the start of an external stop that applies to t. +// BeginExternalStop does not wait for t's task goroutine to stop. +func (t *Task) BeginExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginStopLocked() + t.interrupt() +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task +// goroutine to resume. +func (t *Task) EndExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endStopLocked() +} + +// beginStopLocked increments t.stopCount to indicate that a new internal or +// external stop applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) beginStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 { + // Most likely overflow. + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } +} + +// endStopLocked decrements t.stopCount to indicate that an existing internal +// or external stop no longer applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) endStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } else if newval == 0 { + t.endStopCond.Signal() + } +} + +// BeginExternalStop indicates the start of an external stop that applies to +// all current and future tasks in ts. BeginExternalStop does not wait for +// task goroutines to stop. +func (ts *TaskSet) BeginExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount++ + if ts.stopCount <= 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.beginStopLocked() + t.tg.signalHandlers.mu.Unlock() + t.interrupt() + } +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task +// goroutines to resume. +func (ts *TaskSet) EndExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount-- + if ts.stopCount < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.endStopLocked() + t.tg.signalHandlers.mu.Unlock() + } +} diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go new file mode 100644 index 000000000..a5903b0b5 --- /dev/null +++ b/pkg/sentry/kernel/task_syscall.go @@ -0,0 +1,469 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "os" + "runtime/trace" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel +// include/linux/errno.h. These errnos are never returned to userspace +// directly, but are used to communicate the expected behavior of an +// interrupted syscall from the syscall to signal handling. +type SyscallRestartErrno int + +// These numeric values are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errnos, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = SyscallRestartErrno(512) + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = SyscallRestartErrno(513) + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = SyscallRestartErrno(514) + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) +) + +var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") + +// Error implements error.Error. +func (e SyscallRestartErrno) Error() string { + // Descriptions are borrowed from strace. + switch e { + case ERESTARTSYS: + return "to be restarted if SA_RESTART is set" + case ERESTARTNOINTR: + return "to be restarted" + case ERESTARTNOHAND: + return "to be restarted if no handler" + case ERESTART_RESTARTBLOCK: + return "interrupted by signal" + default: + return "(unknown interrupt error)" + } +} + +// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { + switch int(rv) { + case -int(ERESTARTSYS): + return ERESTARTSYS, true + case -int(ERESTARTNOINTR): + return ERESTARTNOINTR, true + case -int(ERESTARTNOHAND): + return ERESTARTNOHAND, true + case -int(ERESTART_RESTARTBLOCK): + return ERESTART_RESTARTBLOCK, true + default: + return 0, false + } +} + +// SyscallRestartBlock represents the restart block for a syscall restartable +// with a custom function. It encapsulates the state required to restart a +// syscall across a S/R. +type SyscallRestartBlock interface { + Restart(t *Task) (uintptr, error) +} + +// SyscallControl is returned by syscalls to control the behavior of +// Task.doSyscallInvoke. +type SyscallControl struct { + // next is the state that the task goroutine should switch to. 
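+	// (For example, CtrlDoExit below specifies the runExit state.)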
If next is + // nil, the task goroutine should continue to syscall exit as usual. + next taskRunState + + // If ignoreReturn is true, Task.doSyscallInvoke should not store any value + // in the task's syscall return value register. + ignoreReturn bool +} + +var ( + // CtrlDoExit is returned by the implementations of the exit and exit_group + // syscalls to enter the task exit path directly, skipping syscall exit + // tracing. + CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} + + // ctrlStopAndReinvokeSyscall is returned by syscalls using the external + // feature before syscall execution. This causes Task.doSyscallInvoke + // to return runSyscallReinvoke, allowing Task.run to check for stops + // before immediately re-invoking the syscall (skipping the re-checking + // of seccomp filters and ptrace which would confuse userspace + // tracing). + ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} + + // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at + // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather + // than tail-calling it, allowing stops to be checked before syscall exit. + ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} +) + +func (t *Task) invokeExternal() { + t.BeginExternalStop() + go func() { // S/R-SAFE: External control flow. + defer t.EndExternalStop() + t.SyscallTable().External(t.Kernel()) + }() +} + +func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { + s := t.SyscallTable() + + fe := s.FeatureEnable.Word(sysno) + + var straceContext interface{} + if bits.IsAnyOn32(fe, StraceEnableBits) { + straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) + } + + if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { + t.invokeExternal() + // Ensure we check for stops, then invoke the syscall again. + ctrl = ctrlStopAndReinvokeSyscall + } else { + fn := s.Lookup(sysno) + var region *trace.Region // Only non-nil if tracing == true. + if trace.IsEnabled() { + region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) + } + if fn != nil { + // Call our syscall implementation. + rval, ctrl, err = fn(t, args) + } else { + // Use the missing function if not found. + rval, err = t.SyscallTable().Missing(t, sysno, args) + } + if region != nil { + region.End() + } + } + + if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { + t.invokeExternal() + // Don't reinvoke the syscall. + } + + if bits.IsAnyOn32(fe, StraceEnableBits) { + s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) + } + + return +} + +// doSyscall is the entry point for an invocation of a system call specified by +// the current state of t's registers. +// +// The syscall path is very hot; avoid defer. +func (t *Task) doSyscall() taskRunState { + // Save value of the register which is clobbered in the following + // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64. + // + // On x86, register rax was shared by syscall number and return + // value, and at the entry of the syscall handler, the rax was + // saved to regs.orig_rax which was exposed to userspace. + // But on arm64, syscall number was passed through X8, and the X0 + // was shared by the first syscall argument and return value. 
The + // X0 was saved to regs.orig_x0 which was not exposed to userspace. + // So we have to do the same operation here to save the X0 value + // into the task context. + t.Arch().SyscallSaveOrig() + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + + // Tracers expect to see this between when the task traps into the kernel + // to perform a syscall and when the syscall is actually invoked. + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + + // Check seccomp filters. The nil check is for performance (as seccomp use + // is rare), not needed for correctness. + if t.syscallFilters.Load() != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: + t.Debugf("Syscall %d: denied by seccomp", sysno) + return (*runSyscallExit)(nil) + case linux.SECCOMP_RET_ALLOW: + // ok + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("Syscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + case linux.SECCOMP_RET_TRACE: + t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) + return (*runSyscallAfterPtraceEventSeccomp)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doSyscallEnter(sysno, args) +} + +type runSyscallAfterPtraceEventSeccomp struct{} + +func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - + // ptrace(2) + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "The tracer can skip the system call by changing the syscall number to + // -1." - Documentation/prctl/seccomp_filter.txt + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil).execute(t) + } + args := t.Arch().SyscallArgs() + return t.doSyscallEnter(sysno, args) +} + +func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { + if next, ok := t.ptraceSyscallEnter(); ok { + return next + } + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallAfterSyscallEnterStop struct{} + +func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil) + } + args := t.Arch().SyscallArgs() + + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallAfterSysemuStop struct{} + +func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + return (*runSyscallExit)(nil).execute(t) +} + +func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + + if ctrl != nil { + if !ctrl.ignoreReturn { + t.Arch().SetReturn(rval) + } + if ctrl.next != nil { + return ctrl.next + } + } else if err != nil { + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) + t.haveSyscallReturn = true + } else { + 
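+		// The syscall succeeded with no control override; hand its return
+		// value back to the application.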
t.Arch().SetReturn(rval) + } + + return (*runSyscallExit)(nil).execute(t) +} + +// +stateify savable +type runSyscallReinvoke struct{} + +func (*runSyscallReinvoke) execute(t *Task) taskRunState { + if t.killed() { + // It's possible that since the last execution, the task has + // been forcible killed. Invoking the system call here could + // result in an infinite loop if it is again preempted by an + // external stop and reinvoked. + return (*runInterrupt)(nil) + } + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallExit struct{} + +func (*runSyscallExit) execute(t *Task) taskRunState { + t.ptraceSyscallExit() + return (*runApp)(nil) +} + +// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as +// indicated by an execution fault at address addr. doVsyscall returns the +// task's next run state. +func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + vsyscallCount.Increment() + + // Grab the caller up front, to make sure there's a sensible stack. + caller := t.Arch().Native(uintptr(0)) + if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return (*runApp)(nil) + } + + // For _vsyscalls_, there is no need to translate System V calling convention + // to syscall ABI because they both use RDI, RSI, and RDX for the first three + // arguments and none of the vsyscalls uses more than two arguments. + args := t.Arch().SyscallArgs() + if t.syscallFilters.Load() != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: + t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) + return (*runApp)(nil) + case linux.SECCOMP_RET_ALLOW: + // ok + case linux.SECCOMP_RET_TRACE: + t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) + return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("vsyscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doVsyscallInvoke(sysno, args, caller) +} + +type runVsyscallAfterPtraceEventSeccomp struct { + addr usermem.Addr + sysno uintptr + caller interface{} +} + +func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "... the syscall may not be changed to another system call using the + // orig_rax register. It may only be changed to -1 order [sic] to skip the + // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - + // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip + // causes do_exit(SIGSYS), and changing sp is ignored. 
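+	// Enforce the same rules here: any change other than skipping the
+	// emulated call (sysno == -1) terminates the task with SIGSYS.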
+ if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + } + if sysno == ^uintptr(0) { + return (*runApp)(nil) + } + return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) +} + +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + if ctrl != nil { + t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(0) + } else if err == nil { + t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(uintptr(rval)) + } else { + t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) + if err == syserror.EFAULT { + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // A return is not emulated in this case. + return (*runApp)(nil) + } + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) + } + t.Arch().SetIP(t.Arch().Value(caller)) + t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) + return (*runApp)(nil) +} + +// ExtractErrno extracts an integer error number from the error. +// The syscall number is purely for context in the error case. Use -1 if +// syscall number is unknown. +func ExtractErrno(err error, sysno int) int { + switch err := err.(type) { + case nil: + return 0 + case syscall.Errno: + return int(err) + case SyscallRestartErrno: + return int(err) + case *memmap.BusError: + // Bus errors may generate SIGBUS, but for syscalls they still + // return EFAULT. See case in task_run.go where the fault is + // handled (and the SIGBUS is delivered). + return int(syscall.EFAULT) + case *os.PathError: + return ExtractErrno(err.Err, sysno) + case *os.LinkError: + return ExtractErrno(err.Err, sysno) + case *os.SyscallError: + return ExtractErrno(err.Err, sysno) + default: + if errno, ok := syserror.TranslateError(err); ok { + return int(errno) + } + } + panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) +} diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go new file mode 100644 index 000000000..cfcde9a7a --- /dev/null +++ b/pkg/sentry/kernel/task_test.go @@ -0,0 +1,69 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" +) + +func TestTaskCPU(t *testing.T) { + for _, test := range []struct { + mask sched.CPUSet + tid ThreadID + cpu int32 + }{ + { + mask: []byte{0xff}, + tid: 1, + cpu: 0, + }, + { + mask: []byte{0xff}, + tid: 10, + cpu: 1, + }, + { + // more than 8 cpus. 
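+			// assignCPU distributes TIDs round-robin over the allowed
+			// CPUs, so TID 10 is expected to land on CPU 9 here.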
+ mask: []byte{0xff, 0xff}, + tid: 10, + cpu: 9, + }, + { + // missing the first cpu. + mask: []byte{0xfe}, + tid: 1, + cpu: 1, + }, + { + mask: []byte{0xfe}, + tid: 10, + cpu: 3, + }, + { + // missing the fifth cpu. + mask: []byte{0xef}, + tid: 10, + cpu: 2, + }, + } { + assigned := assignCPU(test.mask, test.tid) + if test.cpu != assigned { + t.Errorf("assignCPU(%v, %v) got %v, want %v", test.mask, test.tid, assigned, test.cpu) + } + } + +} diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go new file mode 100644 index 000000000..b02044ad2 --- /dev/null +++ b/pkg/sentry/kernel/task_usermem.go @@ -0,0 +1,301 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be silently truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) + +// Activate ensures that the task has an active address space. +func (t *Task) Activate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Activate(t); err != nil { + panic("unable to activate mm: " + err.Error()) + } + } +} + +// Deactivate relinquishes the task's active address space. +func (t *Task) Deactivate() { + if mm := t.MemoryManager(); mm != nil { + mm.Deactivate() + } +} + +// CopyIn copies a fixed-size value or slice of fixed-size values in from the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { + return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInBytes is a fast version of CopyIn if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOut copies a fixed-size value or slice of fixed-size values out to the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not writeable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { + return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOutBytes is a fast version of CopyOut if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. 
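+//
+// For example, a syscall implementation that has already serialized a value
+// into a byte slice buf could write it to a user address uaddr (both names
+// illustrative) with:
+//
+//	if _, err := t.CopyOutBytes(uaddr, buf); err != nil {
+//		return 0, nil, err
+//	}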
+func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInString copies a NUL-terminated string of length at most maxlen in from +// the task's memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInVector copies a NULL-terminated vector of strings from the task's +// memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// maxElemSize is the maximum size of each individual element. +// +// maxTotalSize is the maximum total length of all elements plus the total +// number of elements. For example, the following strings correspond to +// the following set of sizes: +// +// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) +// { "abc" } => 4 (3 for length, 1 for elements) +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { + var v []string + for { + argAddr := t.Arch().Native(0) + if _, err := t.CopyIn(addr, argAddr); err != nil { + return v, err + } + if t.Arch().Value(argAddr) == 0 { + break + } + // Each string has a zero terminating byte counted, so copying out a string + // requires at least one byte of space. Also, see the calculation below. + if maxTotalSize <= 0 { + return nil, syserror.ENOMEM + } + thisMax := maxElemSize + if maxTotalSize < thisMax { + thisMax = maxTotalSize + } + arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + if err != nil { + return v, err + } + v = append(v, arg) + addr += usermem.Addr(t.Arch().Width()) + maxTotalSize -= len(arg) + 1 + } + return v, nil +} + +// CopyOutIovecs converts src to an array of struct iovecs and copies it to the +// memory mapped at addr. +// +// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for ; !src.IsEmpty(); src = src.Tail() { + ar := src.Head() + usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + default: + return syserror.ENOSYS + } + + return nil +} + +// CopyInIovecs copies an array of numIovecs struct iovecs from the memory +// mapped at addr, converts them to usermem.AddrRanges, and returns them as a +// usermem.AddrRangeSeq. +// +// CopyInIovecs shares the following properties with Linux's +// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): +// +// - If the length of any AddrRange would exceed the range of an ssize_t, +// CopyInIovecs returns EINVAL. +// +// - If the length of any AddrRange would cause its end to overflow, +// CopyInIovecs returns EFAULT. 
+// +// - If any AddrRange would include addresses outside the application address +// range, CopyInIovecs returns EFAULT. +// +// - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the +// combined length of all AddrRanges would otherwise exceed this amount, ranges +// beyond MAX_RW_COUNT are silently truncated. +// +// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { + if numIovecs == 0 { + return usermem.AddrRangeSeq{}, nil + } + + var dst []usermem.AddrRange + if numIovecs > 1 { + dst = make([]usermem.AddrRange, 0, numIovecs) + } + + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := 0; i < numIovecs; i++ { + if _, err := t.CopyInBytes(addr, b); err != nil { + return usermem.AddrRangeSeq{}, err + } + + base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) + length := usermem.ByteOrder.Uint64(b[8:16]) + if length > math.MaxInt64 { + return usermem.AddrRangeSeq{}, syserror.EINVAL + } + ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) + if !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + if numIovecs == 1 { + // Special case to avoid allocating dst. + return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil + } + dst = append(dst, ar) + + addr += itemLen + } + + default: + return usermem.AddrRangeSeq{}, syserror.ENOSYS + } + + // Truncate to MAX_RW_COUNT. + var total uint64 + for i := range dst { + dstlen := uint64(dst[i].Length()) + if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { + dst[i].End -= usermem.Addr(dstlen - rem) + dstlen = rem + } + total += dstlen + } + + return usermem.AddrRangeSeqFromSlice(dst), nil +} + +// SingleIOSequence returns a usermem.IOSequence representing [addr, +// addr+length) in t's address space. If this contains addresses outside the +// application address range, it returns EFAULT. If length exceeds +// MAX_RW_COUNT, the range is silently truncated. +// +// SingleIOSequence is analogous to Linux's +// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and +// write syscalls in Linux do not use import_single_range(). However they check +// access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address +// ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if length > MAX_RW_COUNT { + length = MAX_RW_COUNT + } + ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) + if !ok { + return usermem.IOSequence{}, syserror.EFAULT + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: usermem.AddrRangeSeqOf(ar), + Opts: opts, + }, nil +} + +// IovecsIOSequence returns a usermem.IOSequence representing the array of +// iovcnt struct iovecs at addr in t's address space. opts applies to the +// returned IOSequence, not the reading of the struct iovec array. +// +// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). +// +// Preconditions: As for Task.CopyInIovecs. 
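+//
+// For example, a writev-style implementation could build its source sequence
+// with (iovAddr and iovcnt illustrative):
+//
+//	src, err := t.IovecsIOSequence(iovAddr, iovcnt, usermem.IOOpts{
+//		AddressSpaceActive: true,
+//	})
+//	if err != nil {
+//		return 0, nil, err
+//	}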
+func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return usermem.IOSequence{}, syserror.EINVAL + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return usermem.IOSequence{}, err + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: ars, + Opts: opts, + }, nil +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go new file mode 100644 index 000000000..4dfd2c990 --- /dev/null +++ b/pkg/sentry/kernel/thread_group.go @@ -0,0 +1,531 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A ThreadGroup is a logical grouping of tasks that has widespread +// significance to other kernel features (e.g. signal handling). ("Thread +// groups" are usually called "processes" in userspace documentation.) +// +// ThreadGroup is a superset of Linux's struct signal_struct. +// +// +stateify savable +type ThreadGroup struct { + threadGroupNode + + // signalHandlers is the set of signal handlers used by every task in this + // thread group. (signalHandlers may also be shared with other thread + // groups.) + // + // signalHandlers.mu (hereafter "the signal mutex") protects state related + // to signal handling, as well as state that usually needs to be atomic + // with signal handling, for all ThreadGroups and Tasks using + // signalHandlers. (This is analogous to Linux's use of struct + // sighand_struct::siglock.) + // + // The signalHandlers pointer can only be mutated during an execve + // (Task.finishExec). Consequently, when it's possible for a task in the + // thread group to be completing an execve, signalHandlers is protected by + // the owning TaskSet.mu. Otherwise, it is possible to read the + // signalHandlers pointer without synchronization. In particular, + // completing an execve requires that all other tasks in the thread group + // have exited, so task goroutines do not need the owning TaskSet.mu to + // read the signalHandlers pointer of their thread groups. + signalHandlers *SignalHandlers + + // pendingSignals is the set of pending signals that may be handled by any + // task in this thread group. + // + // pendingSignals is protected by the signal mutex. + pendingSignals pendingSignals + + // If groupStopDequeued is true, a task in the thread group has dequeued a + // stop signal, but has not yet initiated the group stop. + // + // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED. + // + // groupStopDequeued is protected by the signal mutex. 
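+	//
+	// groupStopDequeued is set by Task.runInterrupt when a stop signal is
+	// dequeued, and checked by initiateGroupStop to detect races with
+	// SIGCONT (endGroupStopLocked clears it).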
+ groupStopDequeued bool + + // groupStopSignal is the signal that caused a group stop to be initiated. + // + // groupStopSignal is protected by the signal mutex. + groupStopSignal linux.Signal + + // groupStopPendingCount is the number of active tasks in the thread group + // for which Task.groupStopPending is set. + // + // groupStopPendingCount is analogous to Linux's + // signal_struct::group_stop_count. + // + // groupStopPendingCount is protected by the signal mutex. + groupStopPendingCount int + + // If groupStopComplete is true, groupStopPendingCount transitioned from + // non-zero to zero without an intervening SIGCONT. + // + // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED. + // + // groupStopComplete is protected by the signal mutex. + groupStopComplete bool + + // If groupStopWaitable is true, the thread group is indicating a waitable + // group stop event (as defined by EventChildGroupStop). + // + // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set + // and group_exit_code being non-zero. + // + // groupStopWaitable is protected by the signal mutex. + groupStopWaitable bool + + // If groupContNotify is true, then a SIGCONT has recently ended a group + // stop on this thread group, and the first task to observe it should + // notify its parent. groupContInterrupted is true iff SIGCONT ended an + // incomplete group stop. If groupContNotify is false, groupContInterrupted is + // meaningless. + // + // Analogues in Linux: + // + // - groupContNotify && groupContInterrupted is represented by + // SIGNAL_CLD_STOPPED. + // + // - groupContNotify && !groupContInterrupted is represented by + // SIGNAL_CLD_CONTINUED. + // + // - !groupContNotify is represented by neither flag being set. + // + // groupContNotify and groupContInterrupted are protected by the signal + // mutex. + groupContNotify bool + groupContInterrupted bool + + // If groupContWaitable is true, the thread group is indicating a waitable + // continue event (as defined by EventGroupContinue). + // + // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. + // + // groupContWaitable is protected by the signal mutex. + groupContWaitable bool + + // exiting is true if all tasks in the ThreadGroup should exit. exiting is + // analogous to Linux's SIGNAL_GROUP_EXIT. + // + // exiting is protected by the signal mutex. exiting can only transition + // from false to true. + exiting bool + + // exitStatus is the thread group's exit status. + // + // While exiting is false, exitStatus is protected by the signal mutex. + // When exiting becomes true, exitStatus becomes immutable. + exitStatus ExitStatus + + // terminationSignal is the signal that this thread group's leader will + // send to its parent when it exits. + // + // terminationSignal is protected by the TaskSet mutex. + terminationSignal linux.Signal + + // liveGoroutines is the number of non-exited task goroutines in the thread + // group. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + timerMu sync.Mutex `state:"nosave"` + + // itimerRealTimer implements ITIMER_REAL for the thread group. + itimerRealTimer *ktime.Timer + + // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group. + // + // itimerVirtSetting is protected by the signal mutex. + itimerVirtSetting ktime.Setting + + // itimerProfSetting is the ITIMER_PROF setting for the thread group. 
+ // + // itimerProfSetting is protected by the signal mutex. + itimerProfSetting ktime.Setting + + // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit + // notifications for the thread group. + // + // rlimitCPUSoftSetting is protected by the signal mutex. + rlimitCPUSoftSetting ktime.Setting + + // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true, + // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true, + // or limits.Get(CPU) is finite. + // + // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is + // accessed using atomic memory operations. + cpuTimersEnabled uint32 + + // timers is the thread group's POSIX interval timers. nextTimerID is the + // TimerID at which allocation should begin searching for an unused ID. + // + // timers and nextTimerID are protected by timerMu. + timers map[linux.TimerID]*IntervalTimer + nextTimerID linux.TimerID + + // exitedCPUStats is the CPU usage for all exited tasks in the thread + // group. exitedCPUStats is protected by the TaskSet mutex. + exitedCPUStats usage.CPUStats + + // childCPUStats is the CPU usage of all joined descendants of this thread + // group. childCPUStats is protected by the TaskSet mutex. + childCPUStats usage.CPUStats + + // ioUsage is the I/O usage for all exited tasks in the thread group. + // The ioUsage pointer is immutable. + ioUsage *usage.IO + + // maxRSS is the historical maximum resident set size of the thread group, updated when: + // + // - A task in the thread group exits, since after all tasks have + // exited the MemoryManager is no longer reachable. + // + // - The thread group completes an execve, since this changes + // MemoryManagers. + // + // maxRSS is protected by the TaskSet mutex. + maxRSS uint64 + + // childMaxRSS is the maximum resident set size in bytes of all joined + // descendants of this thread group. + // + // childMaxRSS is protected by the TaskSet mutex. + childMaxRSS uint64 + + // Resource limits for this ThreadGroup. The limits pointer is immutable. + limits *limits.LimitSet + + // processGroup is the processGroup for this thread group. + // + // processGroup is protected by the TaskSet mutex. + processGroup *ProcessGroup + + // execed indicates an exec has occurred since creation. This will be + // set by finishExec, and new TheadGroups will have this field cleared. + // When execed is set, the processGroup may no longer be changed. + // + // execed is protected by the TaskSet mutex. + execed bool + + // oldRSeqCritical is the thread group's old rseq critical region. + oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"` + + // mounts is the thread group's mount namespace. This does not really + // correspond to a "mount namespace" in Linux, but is more like a + // complete VFS that need not be shared between processes. See the + // comment in mounts.go for more information. + // + // mounts is immutable. + mounts *fs.MountNamespace + + // tty is the thread group's controlling terminal. If nil, there is no + // controlling terminal. + // + // tty is protected by the signal mutex. + tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 +} + +// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The +// thread group leader will send its parent terminationSignal when it exits. 
+// The new thread group isn't visible to the system until a task has been +// created inside of it by a successful call to TaskSet.NewTask. +func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup { + tg := &ThreadGroup{ + threadGroupNode: threadGroupNode{ + pidns: pidns, + }, + signalHandlers: sh, + terminationSignal: terminationSignal, + ioUsage: &usage.IO{}, + limits: limits, + mounts: mntns, + } + tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) + tg.timers = make(map[linux.TimerID]*IntervalTimer) + tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) + return tg +} + +// saveOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion { + return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) +} + +// loadOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) { + tg.oldRSeqCritical.Store(r) +} + +// SignalHandlers returns the signal handlers used by tg. +// +// Preconditions: The caller must provide the synchronization required to read +// tg.signalHandlers, as described in the field's comment. +func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { + return tg.signalHandlers +} + +// Limits returns tg's limits. +func (tg *ThreadGroup) Limits() *limits.LimitSet { + return tg.limits +} + +// release releases the thread group's resources. +func (tg *ThreadGroup) release() { + // Timers must be destroyed without holding the TaskSet or signal mutexes + // since timers send signals with Timer.mu locked. + tg.itimerRealTimer.Destroy() + var its []*IntervalTimer + tg.pidns.owner.mu.Lock() + tg.signalHandlers.mu.Lock() + for _, it := range tg.timers { + its = append(its, it) + } + tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved + tg.signalHandlers.mu.Unlock() + tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } + if tg.mounts != nil { + tg.mounts.DecRef() + } +} + +// forEachChildThreadGroupLocked indicates over all child ThreadGroups. +// +// Precondition: TaskSet.mu must be held. +func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + for child := range t.children { + if child == child.tg.leader { + fn(child.tg) + } + } + } +} + +// SetControllingTTY sets tty as the controlling terminal of tg. +func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "The calling process must be a session leader and not have a + // controlling terminal already." - tty_ioctl(4) + if tg.processGroup.session.leader != tg || tg.tty != nil { + return syserror.EINVAL + } + + // "If this terminal is already the controlling terminal of a different + // session group, then the ioctl fails with EPERM, unless the caller + // has the CAP_SYS_ADMIN capability and arg equals 1, in which case the + // terminal is stolen, and all processes that had it as controlling + // terminal lose it." 
- tty_ioctl(4) + if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { + // Stealing requires CAP_SYS_ADMIN in the root user namespace. + if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 { + return syserror.EPERM + } + // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. + for othertg := range tg.pidns.owner.Root.tgids { + // This won't deadlock by locking tg.signalHandlers + // because at this point: + // - We only lock signalHandlers if it's in the same + // session as the tty's controlling thread group. + // - We know that the calling thread group is not in + // the same session as the tty's controlling thread + // group. + if othertg.processGroup.session == tty.tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + othertg.signalHandlers.mu.Unlock() + } + } + } + + // Set the controlling terminal and foreground process group. + tg.tty = tty + tg.processGroup.session.foreground = tg.processGroup + // Set this as the controlling process of the terminal. + tty.tg = tg + + return nil +} + +// ReleaseControllingTTY gives up tty as the controlling tty of tg. +func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + // Just below, we may re-lock signalHandlers in order to send signals. + // Thus we can't defer Unlock here. + tg.signalHandlers.mu.Lock() + + if tg.tty == nil || tg.tty != tty { + tg.signalHandlers.mu.Unlock() + return syserror.ENOTTY + } + + // "If the process was session leader, then send SIGHUP and SIGCONT to + // the foreground process group and all processes in the current + // session lose their controlling terminal." - tty_ioctl(4) + // Remove tty as the controlling tty for each process in the session, + // then send them SIGHUP and SIGCONT. + + // If we're not the session leader, we don't have to do much. + if tty.tg != tg { + tg.tty = nil + tg.signalHandlers.mu.Unlock() + return nil + } + + tg.signalHandlers.mu.Unlock() + + // We're the session leader. SIGHUP and SIGCONT the foreground process + // group and remove all controlling terminals in the session. + var lastErr error + for othertg := range tg.pidns.owner.Root.tgids { + if othertg.processGroup.session == tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + if othertg.processGroup == tg.processGroup.session.foreground { + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { + lastErr = err + } + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { + lastErr = err + } + } + othertg.signalHandlers.mu.Unlock() + } + } + + return lastErr +} + +// ForegroundProcessGroup returns the process group ID of the foreground +// process group. 
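A hedged sketch of how an ioctl path might drive the two methods above. handleTIOCSCTTY and handleTIOCNOTTY are illustrative names, not the actual fs/tty wiring (which lives outside this package), and the kernel import is assumed:

// Assumed import: "gvisor.dev/gvisor/pkg/sentry/kernel"

// handleTIOCSCTTY makes tty the controlling terminal of the calling task's
// thread group. SetControllingTTY enforces that the caller is a session
// leader with no controlling terminal; stealing another session's terminal
// additionally requires CAP_SYS_ADMIN in the root user namespace and arg == 1.
func handleTIOCSCTTY(t *kernel.Task, tty *kernel.TTY, arg int32) error {
	return t.ThreadGroup().SetControllingTTY(tty, arg)
}

// handleTIOCNOTTY gives up the controlling terminal. If the caller is the
// session leader, the foreground process group receives SIGHUP and SIGCONT
// and every thread group in the session loses the terminal.
func handleTIOCNOTTY(t *kernel.Task, tty *kernel.TTY) error {
	return t.ThreadGroup().ReleaseControllingTTY(tty)
}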
+func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "When fd does not refer to the controlling terminal of the calling + // process, -1 is returned" - tcgetpgrp(3) + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + return int32(tg.processGroup.session.foreground.id), nil +} + +// SetForegroundProcessGroup sets the foreground process group of tty to pgid. +func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // TODO(b/129283598): "If tcsetpgrp() is called by a member of a + // background process group in its session, and the calling process is + // not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all + // members of this background process group." + + // tty must be the controlling terminal. + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + // pgid must be positive. + if pgid < 0 { + return -1, syserror.EINVAL + } + + // pg must not be empty. Empty process groups are removed from their + // pid namespaces. + pg, ok := tg.pidns.processGroups[pgid] + if !ok { + return -1, syserror.ESRCH + } + + // pg must be part of this process's session. + if tg.processGroup.session != pg.session { + return -1, syserror.EPERM + } + + tg.processGroup.session.foreground.id = pgid + return 0, nil +} + +// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. +// +// +stateify savable +type itimerRealListener struct { + tg *ThreadGroup +} + +// Notify implements ktime.TimerListener.Notify. +func (l *itimerRealListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l *itimerRealListener) Destroy() { +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go new file mode 100644 index 000000000..872e1a82d --- /dev/null +++ b/pkg/sentry/kernel/threads.go @@ -0,0 +1,478 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TasksLimit is the maximum number of threads for untrusted application. +// Linux doesn't really limit this directly, rather it is limited by total +// memory size, stacks allocated and a global maximum. There's no real reason +// for us to limit it either, (esp. since threads are backed by go routines), +// and we would expect to hit resource limits long before hitting this number. 
+// However, for correctness, we still check that the user doesn't exceed this +// number. +// +// Note that because of the way futexes are implemented, there *are* in fact +// serious restrictions on valid thread IDs. They are limited to 2^30 - 1 +// (kernel/fork.c:MAX_THREADS). +const TasksLimit = (1 << 16) + +// ThreadID is a generic thread identifier. +type ThreadID int32 + +// String returns a decimal representation of the ThreadID. +func (tid ThreadID) String() string { + return fmt.Sprintf("%d", tid) +} + +// InitTID is the TID given to the first task added to each PID namespace. The +// thread group led by InitTID is called the namespace's init process. The +// death of a PID namespace's init process causes all tasks visible in that +// namespace to be killed. +const InitTID ThreadID = 1 + +// A TaskSet comprises all tasks in a system. +// +// +stateify savable +type TaskSet struct { + // mu protects all relationships betweens tasks and thread groups in the + // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) + mu sync.RWMutex `state:"nosave"` + + // Root is the root PID namespace, in which all tasks in the TaskSet are + // visible. The Root pointer is immutable. + Root *PIDNamespace + + // sessions is the set of all sessions. + sessions sessionList + + // stopCount is the number of active external stops applicable to all tasks + // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been + // paired with a call to TaskSet.EndExternalStop). stopCount is protected + // by mu. + // + // stopCount is not saved for the same reason as Task.stopCount; it is + // always reset to zero after restore. + stopCount int32 `state:"nosave"` + + // liveGoroutines is the number of non-exited task goroutines in the + // TaskSet. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // runningGoroutines is the number of running task goroutines in the + // TaskSet. + // + // runningGoroutines is not saved; its counter value is required to be zero + // at time of save (but note that this is not necessarily the same thing as + // sync.WaitGroup's zero value). + runningGoroutines sync.WaitGroup `state:"nosave"` + + // aioGoroutines is the number of goroutines running async I/O + // callbacks. + // + // aioGoroutines is not saved but is required to be zero at the time of + // save. + aioGoroutines sync.WaitGroup `state:"nosave"` +} + +// newTaskSet returns a new, empty TaskSet. +func newTaskSet(pidns *PIDNamespace) *TaskSet { + ts := &TaskSet{Root: pidns} + pidns.owner = ts + return ts +} + +// forEachThreadGroupLocked applies f to each thread group in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { + for tg := range ts.Root.tgids { + f(tg) + } +} + +// A PIDNamespace represents a PID namespace, a bimap between thread IDs and +// tasks. See the pid_namespaces(7) man page for further details. +// +// N.B. A task is said to be visible in a PID namespace if the PID namespace +// contains a thread ID that maps to that task. +// +// +stateify savable +type PIDNamespace struct { + // owner is the TaskSet that this PID namespace belongs to. The owner + // pointer is immutable. + owner *TaskSet + + // parent is the PID namespace of the process that created this one. If + // this is the root PID namespace, parent is nil. The parent pointer is + // immutable. 
+ // + // Invariant: All tasks that are visible in this namespace are also visible + // in all ancestor namespaces. + parent *PIDNamespace + + // userns is the user namespace with which this PID namespace is + // associated. Privileged operations on this PID namespace must have + // appropriate capabilities in userns. The userns pointer is immutable. + userns *auth.UserNamespace + + // The following fields are protected by owner.mu. + + // last is the last ThreadID to be allocated in this namespace. + last ThreadID + + // tasks is a mapping from ThreadIDs in this namespace to tasks visible in + // the namespace. + tasks map[ThreadID]*Task + + // tids is a mapping from tasks visible in this namespace to their + // identifiers in this namespace. + tids map[*Task]ThreadID + + // tgids is a mapping from thread groups visible in this namespace to + // their identifiers in this namespace. + // + // The content of tgids is equivalent to tids[tg.leader]. This exists + // primarily as an optimization to quickly find all thread groups. + tgids map[*ThreadGroup]ThreadID + + // sessions is a mapping from SessionIDs in this namespace to sessions + // visible in the namespace. + sessions map[SessionID]*Session + + // sids is a mapping from sessions visible in this namespace to their + // identifiers in this namespace. + sids map[*Session]SessionID + + // processGroups is a mapping from ProcessGroupIDs in this namespace to + // process groups visible in the namespace. + processGroups map[ProcessGroupID]*ProcessGroup + + // pgids is a mapping from process groups visible in this namespace to + // their identifiers in this namespace. + pgids map[*ProcessGroup]ProcessGroupID + + // exiting indicates that the namespace's init process is exiting or has + // exited. + exiting bool +} + +func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { + return &PIDNamespace{ + owner: ts, + parent: parent, + userns: userns, + tasks: make(map[ThreadID]*Task), + tids: make(map[*Task]ThreadID), + tgids: make(map[*ThreadGroup]ThreadID), + sessions: make(map[SessionID]*Session), + sids: make(map[*Session]SessionID), + processGroups: make(map[ProcessGroupID]*ProcessGroup), + pgids: make(map[*ProcessGroup]ProcessGroupID), + } +} + +// NewRootPIDNamespace creates the root PID namespace. 'owner' is not available +// yet when root namespace is created and must be set by caller. +func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(nil, nil, userns) +} + +// NewChild returns a new, empty PID namespace that is a child of ns. Authority +// over the new PID namespace is controlled by userns. +func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(ns.owner, ns, userns) +} + +// TaskWithID returns the task with thread ID tid in PID namespace ns. If no +// task has that TID, TaskWithID returns nil. +func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { + ns.owner.mu.RLock() + t := ns.tasks[tid] + ns.owner.mu.RUnlock() + return t +} + +// ThreadGroupWithID returns the thread group lead by the task with thread ID +// tid in PID namespace ns. If no task has that TID, or if the task with that +// TID is not a thread group leader, ThreadGroupWithID returns nil. 
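A hedged sketch of the TID bimap in use: translating a thread ID from one PID namespace into another. translateTID is an illustrative helper (IDOfTask, used here, is defined just below), and the kernel import is assumed:

// Assumed import: "gvisor.dev/gvisor/pkg/sentry/kernel"

// translateTID returns the ID that the task known as tid in the "from"
// namespace has in the "to" namespace, or 0 if the task is not visible
// there, matching the IDOfTask convention.
func translateTID(tid kernel.ThreadID, from, to *kernel.PIDNamespace) kernel.ThreadID {
	t := from.TaskWithID(tid)
	if t == nil {
		return 0 // no task with this TID in the source namespace
	}
	return to.IDOfTask(t)
}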
+func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + t := ns.tasks[tid] + if t == nil { + return nil + } + if t != t.tg.leader { + return nil + } + return t.tg +} + +// IDOfTask returns the TID assigned to the given task in PID namespace ns. If +// the task is not visible in that namespace, IDOfTask returns 0. (This return +// value is significant in some cases, e.g. getppid() is documented as +// returning 0 if the caller's parent is in an ancestor namespace and +// consequently not visible to the caller.) If the task is nil, IDOfTask returns +// 0. +func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { + ns.owner.mu.RLock() + id := ns.tids[t] + ns.owner.mu.RUnlock() + return id +} + +// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. +// If the task is not visible in that namespace, IDOfThreadGroup returns 0. +func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { + ns.owner.mu.RLock() + id := ns.tgids[tg] + ns.owner.mu.RUnlock() + return id +} + +// Tasks returns a snapshot of the tasks in ns. +func (ns *PIDNamespace) Tasks() []*Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + tasks := make([]*Task, 0, len(ns.tasks)) + for t := range ns.tids { + tasks = append(tasks, t) + } + return tasks +} + +// ThreadGroups returns a snapshot of the thread groups in ns. +func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + return ns.ThreadGroupsAppend(nil) +} + +// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. +func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + for tg := range ns.tgids { + tgs = append(tgs, tg) + } + return tgs +} + +// UserNamespace returns the user namespace associated with PID namespace ns. +func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { + return ns.userns +} + +// A threadGroupNode defines the relationship between a thread group and the +// rest of the system. Conceptually, threadGroupNode is data belonging to the +// owning TaskSet, as if TaskSet contained a field `nodes +// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, +// threadGroupNode is embedded in the ThreadGroup it represents. +// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose +// threadGroupEntry's methods on ThreadGroup to make it implement +// threadGroupLinker.) +// +// +stateify savable +type threadGroupNode struct { + // pidns is the PID namespace containing the thread group and all of its + // member tasks. The pidns pointer is immutable. + pidns *PIDNamespace + + // eventQueue is notified whenever a event of interest to Task.Wait occurs + // in a child of this thread group, or a ptrace tracee of a task in this + // thread group. Events are defined in task_exit.go. + // + // Note that we cannot check and save this wait queue similarly to other + // wait queues, as the queue will not be empty by the time of saving, due + // to the wait sourced from Exec(). + eventQueue waiter.Queue `state:"nosave"` + + // leader is the thread group's leader, which is the oldest task in the + // thread group; usually the last task in the thread group to call + // execve(), or if no such task exists then the first task in the thread + // group, which was created by a call to fork() or clone() without + // CLONE_THREAD. Once a thread group has been made visible to the rest of + // the system by TaskSet.newTask, leader is never nil. 
+ // + // Note that it's possible for the leader to exit without causing the rest + // of the thread group to exit; in such a case, leader will still be valid + // and non-nil, but leader will not be in tasks. + // + // leader is protected by the TaskSet mutex. + leader *Task + + // If execing is not nil, it is a task in the thread group that has killed + // all other tasks so that it can become the thread group leader and + // perform an execve. (execing may already be the thread group leader.) + // + // execing is analogous to Linux's signal_struct::group_exit_task. + // + // execing is protected by the TaskSet mutex. + execing *Task + + // tasks is all tasks in the thread group that have not yet been reaped. + // + // tasks is protected by both the TaskSet mutex and the signal mutex: + // Mutating tasks requires locking the TaskSet mutex for writing *and* + // locking the signal mutex. Reading tasks requires locking the TaskSet + // mutex *or* locking the signal mutex. + tasks taskList + + // tasksCount is the number of tasks in the thread group that have not yet + // been reaped; equivalently, tasksCount is the number of tasks in tasks. + // + // tasksCount is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + tasksCount int + + // liveTasks is the number of tasks in the thread group that have not yet + // reached TaskExitZombie. + // + // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). + liveTasks int + + // activeTasks is the number of tasks in the thread group that have not yet + // reached TaskExitInitiated. + // + // activeTasks is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + activeTasks int +} + +// PIDNamespace returns the PID namespace containing tg. +func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { + return tg.pidns +} + +// TaskSet returns the TaskSet containing tg. +func (tg *ThreadGroup) TaskSet() *TaskSet { + return tg.pidns.owner +} + +// Leader returns tg's leader. +func (tg *ThreadGroup) Leader() *Task { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.leader +} + +// Count returns the number of non-exited threads in the group. +func (tg *ThreadGroup) Count() int { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + var count int + for t := tg.tasks.Front(); t != nil; t = t.Next() { + count++ + } + return count +} + +// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for +// all tasks in tg. +func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + var tasks []ThreadID + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if id, ok := pidns.tids[t]; ok { + tasks = append(tasks, id) + } + } + return tasks +} + +// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader +// is dead, ID returns 0. +func (tg *ThreadGroup) ID() ThreadID { + tg.pidns.owner.mu.RLock() + id := tg.pidns.tgids[tg] + tg.pidns.owner.mu.RUnlock() + return id +} + +// A taskNode defines the relationship between a task and the rest of the +// system. The comments on threadGroupNode also apply to taskNode. +// +// +stateify savable +type taskNode struct { + // tg is the thread group that this task belongs to. The tg pointer is + // immutable. + tg *ThreadGroup `state:"wait"` + + // taskEntry links into tg.tasks. Note that this means that + // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread + // group. 
See threadGroupNode.tasks for synchronization info. + taskEntry + + // parent is the task's parent. parent may be nil. + // + // parent is protected by the TaskSet mutex. + parent *Task + + // children is this task's children. + // + // children is protected by the TaskSet mutex. + children map[*Task]struct{} + + // If childPIDNamespace is not nil, all new tasks created by this task will + // be members of childPIDNamespace rather than this one. (As a corollary, + // this task becomes unable to create sibling tasks in the same thread + // group.) + // + // childPIDNamespace is exclusive to the task goroutine. + childPIDNamespace *PIDNamespace +} + +// ThreadGroup returns the thread group containing t. +func (t *Task) ThreadGroup() *ThreadGroup { + return t.tg +} + +// PIDNamespace returns the PID namespace containing t. +func (t *Task) PIDNamespace() *PIDNamespace { + return t.tg.pidns +} + +// TaskSet returns the TaskSet containing t. +func (t *Task) TaskSet() *TaskSet { + return t.tg.pidns.owner +} + +// Timekeeper returns the system Timekeeper. +func (t *Task) Timekeeper() *Timekeeper { + return t.k.timekeeper +} + +// Parent returns t's parent. +func (t *Task) Parent() *Task { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.parent +} + +// ThreadID returns t's thread ID in its own PID namespace. If the task is +// dead, ThreadID returns 0. +func (t *Task) ThreadID() ThreadID { + return t.tg.pidns.IDOfTask(t) +} diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD new file mode 100644 index 000000000..7ba7dc50c --- /dev/null +++ b/pkg/sentry/kernel/time/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "time", + srcs = [ + "context.go", + "time.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sync", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go new file mode 100644 index 000000000..00b729d88 --- /dev/null +++ b/pkg/sentry/kernel/time/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is the time package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRealtimeClock is a Context.Value key for the current real time. + CtxRealtimeClock contextID = iota +) + +// RealtimeClockFromContext returns the real time clock associated with context +// ctx. +func RealtimeClockFromContext(ctx context.Context) Clock { + if v := ctx.Value(CtxRealtimeClock); v != nil { + return v.(Clock) + } + return nil +} + +// NowFromContext returns the current real time associated with context ctx. 
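A hedged sketch of a consumer of the helpers above. stampNow is an illustrative function that prefers the non-panicking lookup for contexts that may not carry a realtime clock; imports of pkg/context and of this package (as ktime) are assumed:

// stampNow returns the current real time from ctx, or the zero time if the
// context carries no CtxRealtimeClock value (NowFromContext would panic in
// that case).
func stampNow(ctx context.Context) ktime.Time {
	if clk := ktime.RealtimeClockFromContext(ctx); clk != nil {
		return clk.Now()
	}
	return ktime.ZeroTime
}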
+func NowFromContext(ctx context.Context) Time { + if clk := RealtimeClockFromContext(ctx); clk != nil { + return clk.Now() + } + panic("encountered context without RealtimeClock") +} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go new file mode 100644 index 000000000..e959700f2 --- /dev/null +++ b/pkg/sentry/kernel/time/time.go @@ -0,0 +1,709 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package time defines the Timer type, which provides a periodic timer that +// works by sampling a user-provided clock. +package time + +import ( + "fmt" + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Events that may be generated by a Clock. +const ( + // ClockEventSet occurs when a Clock undergoes a discontinuous change. + ClockEventSet waiter.EventMask = 1 << iota + + // ClockEventRateIncrease occurs when the rate at which a Clock advances + // increases significantly, such that values returned by previous calls to + // Clock.WallTimeUntil may be too large. + ClockEventRateIncrease +) + +// Time represents an instant in time with nanosecond precision. +// +// Time may represent time with respect to any clock and may not have any +// meaning in the real world. +// +// +stateify savable +type Time struct { + ns int64 +} + +var ( + // MinTime is the zero time instant, the lowest possible time that can + // be represented by Time. + MinTime = Time{ns: math.MinInt64} + + // MaxTime is the highest possible time that can be represented by + // Time. + MaxTime = Time{ns: math.MaxInt64} + + // ZeroTime represents the zero time in an unspecified Clock's domain. + ZeroTime = Time{ns: 0} +) + +const ( + // MinDuration is the minimum duration representable by time.Duration. + MinDuration = time.Duration(math.MinInt64) + + // MaxDuration is the maximum duration representable by time.Duration. + MaxDuration = time.Duration(math.MaxInt64) +) + +// FromNanoseconds returns a Time representing the point ns nanoseconds after +// an unspecified Clock's zero time. +func FromNanoseconds(ns int64) Time { + return Time{ns} +} + +// FromSeconds returns a Time representing the point s seconds after an +// unspecified Clock's zero time. +func FromSeconds(s int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + return Time{s * 1e9} +} + +// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real +// time Unix clock domain. +func FromUnix(s int64, ns int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + t := s * 1e9 + if t > math.MaxInt64-ns { + return MaxTime + } + return Time{t + ns} +} + +// FromTimespec converts from Linux Timespec to Time. +func FromTimespec(ts linux.Timespec) Time { + return Time{ts.ToNsecCapped()} +} + +// FromTimeval converts a Linux Timeval to Time. 
+func FromTimeval(tv linux.Timeval) Time { + return Time{tv.ToNsecCapped()} +} + +// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock +// domain. If t represents walltime, this is nanoseconds since the Unix epoch. +func (t Time) Nanoseconds() int64 { + return t.ns +} + +// Seconds returns seconds elapsed since the zero time in t's Clock domain. If +// t represents walltime, this is seconds since Unix epoch. +func (t Time) Seconds() int64 { + return t.Nanoseconds() / time.Second.Nanoseconds() +} + +// Timespec converts Time to a Linux timespec. +func (t Time) Timespec() linux.Timespec { + return linux.NsecToTimespec(t.Nanoseconds()) +} + +// Unix returns the (seconds, nanoseconds) representation of t such that +// seconds*1e9 + nanoseconds = t. +func (t Time) Unix() (s int64, ns int64) { + s = t.ns / 1e9 + ns = t.ns % 1e9 + return +} + +// TimeT converts Time to a Linux time_t. +func (t Time) TimeT() linux.TimeT { + return linux.NsecToTimeT(t.Nanoseconds()) +} + +// Timeval converts Time to a Linux timeval. +func (t Time) Timeval() linux.Timeval { + return linux.NsecToTimeval(t.Nanoseconds()) +} + +// StatxTimestamp converts Time to a Linux statx_timestamp. +func (t Time) StatxTimestamp() linux.StatxTimestamp { + return linux.NsecToStatxTimestamp(t.Nanoseconds()) +} + +// Add adds the duration of d to t. +func (t Time) Add(d time.Duration) Time { + if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { + return MaxTime + } + if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { + return MinTime + } + return Time{int64(t.ns) + d.Nanoseconds()} +} + +// AddTime adds the duration of u to t. +func (t Time) AddTime(u Time) Time { + return t.Add(time.Duration(u.ns)) +} + +// Equal reports whether the two times represent the same instant in time. +func (t Time) Equal(u Time) bool { + return t.ns == u.ns +} + +// Before reports whether the instant t is before the instant u. +func (t Time) Before(u Time) bool { + return t.ns < u.ns +} + +// After reports whether the instant t is after the instant u. +func (t Time) After(u Time) bool { + return t.ns > u.ns +} + +// Sub returns the duration of t - u. +// +// N.B. This measure may not make sense for every Time returned by ktime.Clock. +// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to +// estimate that wall time. +func (t Time) Sub(u Time) time.Duration { + dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond + switch { + case u.Add(dur).Equal(t): + return dur + case t.Before(u): + return MinDuration + default: + return MaxDuration + } +} + +// IsMin returns whether t represents the lowest possible time instant. +func (t Time) IsMin() bool { + return t == MinTime +} + +// IsZero returns whether t represents the zero time instant in t's Clock domain. +func (t Time) IsZero() bool { + return t == ZeroTime +} + +// String returns the time represented in nanoseconds as a string. +func (t Time) String() string { + return fmt.Sprintf("%dns", t.Nanoseconds()) +} + +// A Clock is an abstract time source. +type Clock interface { + // Now returns the current time in nanoseconds according to the Clock. + Now() Time + + // WallTimeUntil returns the estimated wall time until Now will return a + // value greater than or equal to t, given that a recent call to Now + // returned now. If t has already passed, WallTimeUntil may return 0 or a + // negative value. + // + // WallTimeUntil must be abstract to support Clocks that do not represent + // wall time (e.g. thread group execution timers). 
Clocks that represent + // wall times may embed the WallRateClock type to obtain an appropriate + // trivial implementation of WallTimeUntil. + // + // WallTimeUntil is used to determine when associated Timers should next + // check for expirations. Returning too small a value may result in + // spurious Timer goroutine wakeups, while returning too large a value may + // result in late expirations. Implementations should usually err on the + // side of underestimating. + WallTimeUntil(t, now Time) time.Duration + + // Waitable methods may be used to subscribe to Clock events. Waiters will + // not be preserved by Save and must be re-established during restore. + // + // Since Clock events are transient, implementations of + // waiter.Waitable.Readiness should return 0. + waiter.Waitable +} + +// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the +// same rate as wall time. +type WallRateClock struct{} + +// WallTimeUntil implements Clock.WallTimeUntil. +func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration { + return t.Sub(now) +} + +// NoClockEvents implements waiter.Waitable for Clocks that do not generate +// events. +type NoClockEvents struct{} + +// Readiness implements waiter.Waitable.Readiness. +func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (*NoClockEvents) EventUnregister(e *waiter.Entry) { +} + +// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and +// defining waiter.Waitable.Readiness as required by Clock. +type ClockEventsQueue struct { + waiter.Queue +} + +// Readiness implements waiter.Waitable.Readiness. +func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// A TimerListener receives expirations from a Timer. +type TimerListener interface { + // Notify is called when its associated Timer expires. exp is the number of + // expirations. setting is the next timer Setting. + // + // Notify is called with the associated Timer's mutex locked, so Notify + // must not take any locks that precede Timer.mu in lock order. + // + // If Notify returns true, the timer will use the returned setting + // rather than the passed one. + // + // Preconditions: exp > 0. + Notify(exp uint64, setting Setting) (newSetting Setting, update bool) + + // Destroy is called when the timer is destroyed. + Destroy() +} + +// Setting contains user-controlled mutable Timer properties. +// +// +stateify savable +type Setting struct { + // Enabled is true if the timer is running. + Enabled bool + + // Next is the time in nanoseconds of the next expiration. + Next Time + + // Period is the time in nanoseconds between expirations. If Period is + // zero, the timer will not automatically restart after expiring. + // + // Invariant: Period >= 0. + Period time.Duration +} + +// SettingFromSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as a time relative to c.Now(). +func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + return SettingFromSpecAt(value, interval, c.Now()) +} + +// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is +// interpreted as a time relative to now. 
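A hedged sketch of the two extension points described above: a Clock assembled from the helper types (it elapses at wall rate and emits no events) and a TimerListener that only counts expirations. hostMonotonicClock and countingListener are illustrative types, with imports of time and of this package (as ktime) assumed:

// hostMonotonicClock satisfies Clock: Now measures time elapsed since start,
// WallTimeUntil is promoted from the embedded WallRateClock, and the waiter
// methods come from NoClockEvents.
type hostMonotonicClock struct {
	ktime.WallRateClock
	ktime.NoClockEvents
	start time.Time
}

func (c *hostMonotonicClock) Now() ktime.Time {
	return ktime.FromNanoseconds(time.Since(c.start).Nanoseconds())
}

// countingListener records how many expirations it has been notified of and
// never overrides the Timer's Setting.
type countingListener struct{ n uint64 }

func (l *countingListener) Notify(exp uint64, _ ktime.Setting) (ktime.Setting, bool) {
	l.n += exp
	return ktime.Setting{}, false
}

func (l *countingListener) Destroy() {}

// Typical wiring: ktime.NewTimer(&hostMonotonicClock{start: time.Now()}, &countingListener{}).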
+func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { + if value < 0 { + return Setting{}, syserror.EINVAL + } + if value == 0 { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: now.Add(value), + Period: interval, + }, nil +} + +// SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is +// interpreted as an absolute time. +func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { + if value.Before(ZeroTime) { + return Setting{}, syserror.EINVAL + } + if value.IsZero() { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: value, + Period: interval, + }, nil +} + +// SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is +// true, its.Value is interpreted as an absolute time. Otherwise, it is +// interpreted as a time relative to c.Now(). +func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) { + if abs { + return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration()) + } + return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c) +} + +// SpecFromSetting converts a timestamp and a Setting to a (relative value, +// interval) pair, as used by most Linux syscalls that return a struct +// itimerval or struct itimerspec. +func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { + if !s.Enabled { + return 0, s.Period + } + return s.Next.Sub(now), s.Period +} + +// ItimerspecFromSetting converts a Setting to a linux.Itimerspec. +func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { + val, iv := SpecFromSetting(now, s) + return linux.Itimerspec{ + Interval: linux.DurationToTimespec(iv), + Value: linux.DurationToTimespec(val), + } +} + +// At returns an updated Setting and a number of expirations after the +// associated Clock indicates a time of now. +// +// Settings may be created by successive calls to At with decreasing +// values of now (i.e. time may appear to go backward). Supporting this is +// required to support non-monotonic clocks, as well as allowing +// Timer.clock.Now() to be called without holding Timer.mu. +func (s Setting) At(now Time) (Setting, uint64) { + if !s.Enabled { + return s, 0 + } + if s.Next.After(now) { + return s, 0 + } + if s.Period == 0 { + s.Enabled = false + return s, 1 + } + exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) + s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) + return s, exp +} + +// Timer is an optionally-periodic timer driven by sampling a user-specified +// Clock. Timer's semantics support the requirements of Linux's interval timers +// (setitimer(2), timer_create(2), timerfd_create(2)). +// +// Timers should be created using NewTimer and must be cleaned up by calling +// Timer.Destroy when no longer used. +// +// +stateify savable +type Timer struct { + // clock is the time source. clock is immutable. + clock Clock + + // listener is notified of expirations. listener is immutable. + listener TimerListener + + // mu protects the following mutable fields. + mu sync.Mutex `state:"nosave"` + + // setting is the timer setting. setting is protected by mu. + setting Setting + + // paused is true if the Timer is paused. paused is protected by mu. + paused bool + + // kicker is used to wake the Timer goroutine. The kicker pointer is + // immutable, but its state is protected by mu. 
+ kicker *time.Timer `state:"nosave"` + + // entry is registered with clock.EventRegister. entry is immutable. + // + // Per comment in Clock, entry must be re-registered after restore; per + // comment in Timer.Load, this is done in Timer.Resume. + entry waiter.Entry `state:"nosave"` + + // events is the channel that will be notified whenever entry receives an + // event. It is also closed by Timer.Destroy to instruct the Timer + // goroutine to exit. + events chan struct{} `state:"nosave"` +} + +// timerTickEvents are Clock events that require the Timer goroutine to Tick +// prematurely. +const timerTickEvents = ClockEventSet | ClockEventRateIncrease + +// NewTimer returns a new Timer that will obtain time from clock and send +// expirations to listener. The Timer is initially stopped and has no first +// expiration or period configured. +func NewTimer(clock Clock, listener TimerListener) *Timer { + t := &Timer{ + clock: clock, + listener: listener, + } + t.init() + return t +} + +// After waits for the duration to elapse according to clock and then sends a +// notification on the returned channel. The timer is started immediately and +// will fire exactly once. The second return value is the start time used with +// the duration. +// +// Callers must call Timer.Destroy. +func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { + notifier, tchan := NewChannelNotifier() + t := NewTimer(clock, notifier) + now := clock.Now() + + t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: now.Add(duration), + }) + return t, now, tchan +} + +// init initializes Timer state that is not preserved across save/restore. If +// init has already been called, calling it again is a no-op. +// +// Preconditions: t.mu must be locked, or the caller must have exclusive access +// to t. +func (t *Timer) init() { + if t.kicker != nil { + return + } + // If t.kicker is nil, the Timer goroutine can't be running, so we can't + // race with it. + t.kicker = time.NewTimer(0) + t.entry, t.events = waiter.NewChannelEntry(nil) + t.clock.EventRegister(&t.entry, timerTickEvents) + go t.runGoroutine() // S/R-SAFE: synchronized by t.mu +} + +// Destroy releases resources owned by the Timer. A Destroyed Timer must not be +// used again; in particular, a Destroyed Timer should not be Saved. +func (t *Timer) Destroy() { + // Stop the Timer, ensuring that the Timer goroutine will not call + // t.kicker.Reset, before calling t.kicker.Stop. + t.mu.Lock() + t.setting.Enabled = false + t.mu.Unlock() + t.kicker.Stop() + // Unregister t.entry, ensuring that the Clock will not send to t.events, + // before closing t.events to instruct the Timer goroutine to exit. + t.clock.EventUnregister(&t.entry) + close(t.events) + t.listener.Destroy() +} + +func (t *Timer) runGoroutine() { + for { + select { + case <-t.kicker.C: + case _, ok := <-t.events: + if !ok { + // Channel closed by Destroy. + return + } + } + t.Tick() + } +} + +// Tick requests that the Timer immediately check for expirations and +// re-evaluate when it should next check for expirations. +func (t *Timer) Tick() { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + return + } + s, exp := t.setting.At(now) + t.setting = s + if exp > 0 { + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } + } + t.resetKickerLocked(now) +} + +// Pause pauses the Timer, ensuring that it does not generate any further +// expirations until Resume is called. If the Timer is already paused, Pause +// has no effect. 
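A hedged sketch of the one-shot helper above in use. sleepOn is an illustrative function, assuming this package is imported as ktime along with time:

// sleepOn blocks until d has elapsed according to clk. After starts the
// Timer immediately and its channel fires exactly once; the caller must
// destroy the Timer when done.
func sleepOn(clk ktime.Clock, d time.Duration) {
	t, _, tchan := ktime.After(clk, d)
	defer t.Destroy()
	<-tchan
}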
+func (t *Timer) Pause() { + t.mu.Lock() + defer t.mu.Unlock() + t.paused = true + // t.kicker may be nil if we were restored but never resumed. + if t.kicker != nil { + t.kicker.Stop() + } +} + +// Resume ends the effect of Pause. If the Timer is not paused, Resume has no +// effect. +func (t *Timer) Resume() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.paused { + return + } + t.paused = false + + // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume + // because save/restore will restore Timers before + // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed + // by a kernel.Timekeeper then the Timer goroutine will panic if it calls + // t.clock.Now(). + t.init() + + // Kick the Timer goroutine in case it was already initialized, but the + // Timer goroutine was sleeping. + t.kicker.Reset(0) +} + +// Get returns a snapshot of the Timer's current Setting and the time +// (according to the Timer's Clock) at which the snapshot was taken. +// +// Preconditions: The Timer must not be paused (since its Setting cannot +// be advanced to the current time while it is paused.) +func (t *Timer) Get() (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) + } + s, exp := t.setting.At(now) + t.setting = s + if exp > 0 { + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } + } + t.resetKickerLocked(now) + return now, s +} + +// Swap atomically changes the Timer's Setting and returns the Timer's previous +// Setting and the time (according to the Timer's Clock) at which the snapshot +// was taken. Setting s.Enabled to true starts the Timer, while setting +// s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. +func (t *Timer) Swap(s Setting) (Time, Setting) { + return t.SwapAnd(s, nil) +} + +// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, +// and returns the Timer's previous Setting and the time (according to the +// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true +// starts the timer, while setting s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. f cannot call any Timer methods +// since it is called with the Timer mutex locked. +func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) + } + oldS, oldExp := t.setting.At(now) + if oldExp > 0 { + t.listener.Notify(oldExp, oldS) + // N.B. The returned Setting doesn't matter because we're about + // to overwrite. + } + if f != nil { + f() + } + newS, newExp := s.At(now) + t.setting = newS + if newExp > 0 { + if newS, ok := t.listener.Notify(newExp, t.setting); ok { + t.setting = newS + } + } + t.resetKickerLocked(now) + return now, oldS +} + +// Atomically invokes f atomically with respect to expirations of t; that is, t +// cannot generate expirations while f is being called. +// +// Preconditions: f cannot call any Timer methods since it is called with the +// Timer mutex locked. +func (t *Timer) Atomically(f func()) { + t.mu.Lock() + defer t.mu.Unlock() + f() +} + +// Preconditions: t.mu must be locked. +func (t *Timer) resetKickerLocked(now Time) { + if t.setting.Enabled { + // Clock.WallTimeUntil may return a negative value. This is fine; + // time.when treats negative Durations as 0. 
+ t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) + } + // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases + // resetKickerLocked will be called from the Timer goroutine itself, in + // which case t.kicker has already fired and t.kicker.Stop will be an + // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer + // => runtime.deltimer). +} + +// Clock returns the Clock used by t. +func (t *Timer) Clock() Clock { + return t.clock +} + +// ChannelNotifier is a TimerListener that sends a message on an empty struct +// channel. +// +// ChannelNotifier cannot be saved or loaded. +type ChannelNotifier struct { + // tchan must be a buffered channel. + tchan chan struct{} +} + +// NewChannelNotifier creates a new channel notifier. +// +// If the notifier is used with a timer, Timer.Destroy will close the channel +// returned here. +func NewChannelNotifier() (TimerListener, <-chan struct{}) { + tchan := make(chan struct{}, 1) + return &ChannelNotifier{tchan}, tchan +} + +// Notify implements ktime.TimerListener.Notify. +func (c *ChannelNotifier) Notify(uint64, Setting) (Setting, bool) { + select { + case c.tchan <- struct{}{}: + default: + } + + return Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy and will close the channel. +func (c *ChannelNotifier) Destroy() { + close(c.tchan) +} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go new file mode 100644 index 000000000..0adf25691 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper.go @@ -0,0 +1,325 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/log" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sync" +) + +// Timekeeper manages all of the kernel clocks. +// +// +stateify savable +type Timekeeper struct { + // clocks are the clock sources. + // + // These are not saved directly, as the new machine's clock may behave + // differently. + // + // It is set only once, by SetClocks. + clocks sentrytime.Clocks `state:"nosave"` + + // bootTime is the realtime when the system "booted". i.e., when + // SetClocks was called in the initial (not restored) run. + bootTime ktime.Time + + // monotonicOffset is the offset to apply to the monotonic clock output + // from clocks. + // + // It is set only once, by SetClocks. + monotonicOffset int64 `state:"nosave"` + + // monotonicLowerBound is the lowerBound for monotonic time. + monotonicLowerBound int64 `state:"nosave"` + + // restored, if non-nil, indicates that this Timekeeper was restored + // from a state file. The clocks are not set until restored is closed. + restored chan struct{} `state:"nosave"` + + // saveMonotonic is the (offset) value of the monotonic clock at the + // time of save. 
+ // + // It is only valid if restored is non-nil. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveMonotonic int64 + + // saveRealtime is the value of the realtime clock at the time of save. + // + // It is only valid if restored is non-nil. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveRealtime int64 + + // params manages the parameter page. + params *VDSOParamPage + + // mu protects destruction with stop and wg. + mu sync.Mutex `state:"nosave"` + + // stop is used to tell the update goroutine to exit. + stop chan struct{} `state:"nosave"` + + // wg is used to indicate that the update goroutine has exited. + wg sync.WaitGroup `state:"nosave"` +} + +// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. +// NewTimekeeper does not take ownership of paramPage. +// +// SetClocks must be called on the returned Timekeeper before it is usable. +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { + return &Timekeeper{ + params: NewVDSOParamPage(mfp, paramPage), + }, nil +} + +// SetClocks the backing clock source. +// +// SetClocks must be called before the Timekeeper is used, and it may not be +// called more than once, as changing the clock source without extra correction +// could cause time discontinuities. +// +// It must also be called after Load. +func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { + // Update the params, marking them "not ready", as we may need to + // restart calibration on this new machine. + if t.restored != nil { + if err := t.params.Write(func() vdsoParams { + return vdsoParams{} + }); err != nil { + panic("unable to reset VDSO params: " + err.Error()) + } + } + + if t.clocks != nil { + panic("SetClocks called on previously-initialized Timekeeper") + } + + t.clocks = c + + // Compute the offset of the monotonic clock from the base Clocks. + // + // In a fresh (not restored) sentry, monotonic time starts at zero. + // + // In a restored sentry, monotonic time jumps forward by approximately + // the same amount as real time. There are no guarantees here, we are + // just making a best-effort attempt to make it appear that the app + // was simply not scheduled for a long period, rather than that the + // real time clock was changed. + // + // If real time went backwards, it remains the same. + wantMonotonic := int64(0) + + nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Unable to get current monotonic time: " + err.Error()) + } + + nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) + if err != nil { + panic("Unable to get current realtime: " + err.Error()) + } + + if t.restored != nil { + wantMonotonic = t.saveMonotonic + elapsed := nowRealtime - t.saveRealtime + if elapsed > 0 { + wantMonotonic += elapsed + } + } + + t.monotonicOffset = wantMonotonic - nowMonotonic + + if t.restored == nil { + // Hold on to the initial "boot" time. + t.bootTime = ktime.FromNanoseconds(nowRealtime) + } + + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() + + if t.restored != nil { + close(t.restored) + } +} + +// startUpdater starts an update goroutine that keeps the clocks updated. +// +// mu must be held. +func (t *Timekeeper) startUpdater() { + if t.stop != nil { + // Timekeeper already started + return + } + t.stop = make(chan struct{}) + + // Keep the clocks up to date. 
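
The restore arithmetic in SetClocks is easier to follow when factored out as a pure function. The sketch below restates it; monotonicOffset is a hypothetical name used only for illustration.

package offsetexample

// monotonicOffset mirrors Timekeeper.SetClocks: a restored sentry advances
// monotonic time by the real time that elapsed across save/restore, but never
// moves it backwards if realtime regressed.
func monotonicOffset(restored bool, saveMonotonic, saveRealtime, nowMonotonic, nowRealtime int64) int64 {
	wantMonotonic := int64(0)
	if restored {
		wantMonotonic = saveMonotonic
		if elapsed := nowRealtime - saveRealtime; elapsed > 0 {
			wantMonotonic += elapsed
		}
	}
	return wantMonotonic - nowMonotonic
}

For example, monotonicOffset(true, 100000, 400000, 900000, 600000) is -600000, so a raw monotonic reading of 900000 is reported as 300000, matching TestTimekeeperMonotonicForward further down in this change.
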
+ // + // Note that the Go runtime uses host CLOCK_MONOTONIC to service the + // timer, so it may run at a *slightly* different rate from the + // application CLOCK_MONOTONIC. That is fine, as we only need to update + // at approximately this rate. + timer := time.NewTicker(sentrytime.ApproxUpdateInterval) + t.wg.Add(1) + go func() { // S/R-SAFE: stopped during save. + defer t.wg.Done() + for { + // Start with an update immediately, so the clocks are + // ready ASAP. + + // Call Update within a Write block to prevent the VDSO + // from using the old params between Update and + // Write. + if err := t.params.Write(func() vdsoParams { + monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() + + var p vdsoParams + if monotonicOk { + p.monotonicReady = 1 + p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) + p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset + p.monotonicFrequency = monotonicParams.Frequency + } + if realtimeOk { + p.realtimeReady = 1 + p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) + p.realtimeBaseRef = int64(realtimeParams.BaseRef) + p.realtimeFrequency = realtimeParams.Frequency + } + + log.Debugf("Updating VDSO parameters: %+v", p) + + return p + }); err != nil { + log.Warningf("Unable to update VDSO parameter page: %v", err) + } + + select { + case <-timer.C: + case <-t.stop: + return + } + } + }() +} + +// stopUpdater stops the update goroutine, blocking until it exits. +// +// mu must be held. +func (t *Timekeeper) stopUpdater() { + if t.stop == nil { + // Updater not running. + return + } + + close(t.stop) + t.wg.Wait() + t.stop = nil +} + +// Destroy destroys the Timekeeper, freeing all associated resources. +func (t *Timekeeper) Destroy() { + t.mu.Lock() + defer t.mu.Unlock() + + t.stopUpdater() +} + +// PauseUpdates stops clock parameter updates. This should only be used when +// Tasks are not running and thus cannot access the clock. +func (t *Timekeeper) PauseUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.stopUpdater() +} + +// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. +func (t *Timekeeper) ResumeUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// GetTime returns the current time in nanoseconds. +func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { + if t.clocks == nil { + if t.restored == nil { + panic("Timekeeper used before initialized with SetClocks") + } + <-t.restored + } + now, err := t.clocks.GetTime(c) + if err == nil && c == sentrytime.Monotonic { + now += t.monotonicOffset + for { + // It's possible that the clock is shaky. This may be due to + // platform issues, e.g. the KVM platform relies on the guest + // TSC and host TSC, which may not be perfectly in sync. To + // work around this issue, ensure that the monotonic time is + // always bounded by the last time read. + oldLowerBound := atomic.LoadInt64(&t.monotonicLowerBound) + if now < oldLowerBound { + now = oldLowerBound + break + } + if atomic.CompareAndSwapInt64(&t.monotonicLowerBound, oldLowerBound, now) { + break + } + } + } + return now, err +} + +// BootTime returns the system boot real time. +func (t *Timekeeper) BootTime() ktime.Time { + return t.bootTime +} + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +// +// +stateify savable +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. 
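
The lower-bound loop in GetTime is a small lock-free idiom: readers never observe the monotonic clock moving backwards even if the underlying clock source is slightly shaky. A standalone restatement, with a package-level variable standing in for monotonicLowerBound, purely for illustration:

package clampexample

import "sync/atomic"

// lowerBound stands in for Timekeeper.monotonicLowerBound.
var lowerBound int64

// clampMonotonic returns now, unless a larger value was already reported, in
// which case that larger value is returned instead.
func clampMonotonic(now int64) int64 {
	for {
		old := atomic.LoadInt64(&lowerBound)
		if now < old {
			return old
		}
		if atomic.CompareAndSwapInt64(&lowerBound, old, now) {
			return now
		}
	}
}
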
+ ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go new file mode 100644 index 000000000..8e961c832 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/time" +) + +// beforeSave is invoked by stateify. +func (t *Timekeeper) beforeSave() { + if t.stop != nil { + panic("pauseUpdates must be called before Save") + } + + // N.B. we want the *offset* monotonic time. + var err error + if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { + panic("unable to get current monotonic time: " + err.Error()) + } + + if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { + panic("unable to get current realtime: " + err.Error()) + } +} + +// afterLoad is invoked by stateify. +func (t *Timekeeper) afterLoad() { + t.restored = make(chan struct{}) +} diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go new file mode 100644 index 000000000..cf2f7ca72 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -0,0 +1,156 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// mockClocks is a sentrytime.Clocks that simply returns the times in the +// struct. +type mockClocks struct { + monotonic int64 + realtime int64 +} + +// Update implements sentrytime.Clocks.Update. It does nothing. +func (*mockClocks) Update() (monotonicParams sentrytime.Parameters, monotonicOk bool, realtimeParam sentrytime.Parameters, realtimeOk bool) { + return +} + +// Update implements sentrytime.Clocks.GetTime. 
+func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { + switch id { + case sentrytime.Monotonic: + return c.monotonic, nil + case sentrytime.Realtime: + return c.realtime, nil + default: + return 0, syserror.EINVAL + } +} + +// stateTestClocklessTimekeeper returns a test Timekeeper which has not had +// SetClocks called. +func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { + ctx := contexttest.Context(tb) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) + if err != nil { + tb.Fatalf("failed to allocate memory: %v", err) + } + return &Timekeeper{ + params: NewVDSOParamPage(mfp, fr), + } +} + +func stateTestTimekeeper(tb testing.TB) *Timekeeper { + t := stateTestClocklessTimekeeper(tb) + t.SetClocks(sentrytime.NewCalibratedClocks()) + return t +} + +// TestTimekeeperMonotonicZero tests that monotonic time starts at zero. +func TestTimekeeperMonotonicZero(t *testing.T) { + c := &mockClocks{ + monotonic: 100000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.SetClocks(c) + defer tk.Destroy() + + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 0 { + t.Errorf("GetTime got %d want 0", now) + } + + c.monotonic += 10 + + now, err = tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 10 { + t.Errorf("GetTime got %d want 10", now) + } +} + +// TestTimekeeperMonotonicJumpForward tests that monotonic time jumps forward +// after restore. +func TestTimekeeperMonotonicForward(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 600000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = make(chan struct{}) + tk.saveMonotonic = 100000 + tk.saveRealtime = 400000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should jump ahead by 200000 to 300000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 300000 { + t.Errorf("GetTime got %d want 300000", now) + } +} + +// TestTimekeeperMonotonicJumpBackwards tests that monotonic time does not jump +// backwards when realtime goes backwards. +func TestTimekeeperMonotonicJumpBackwards(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 400000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = make(chan struct{}) + tk.saveMonotonic = 100000 + tk.saveRealtime = 600000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should remain at 100000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees and we don't want to jump the monotonic clock backwards like + // realtime did. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 100000 { + t.Errorf("GetTime got %d want 100000", now) + } +} diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go new file mode 100644 index 000000000..d0e0810e8 --- /dev/null +++ b/pkg/sentry/kernel/tty.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "gvisor.dev/gvisor/pkg/sync" + +// TTY defines the relationship between a thread group and its controlling +// terminal. +// +// +stateify savable +type TTY struct { + // Index is the terminal index. It is immutable. + Index uint32 + + mu sync.Mutex `state:"nosave"` + + // tg is protected by mu. + tg *ThreadGroup +} + +// TTY returns the thread group's controlling terminal. If nil, there is no +// controlling terminal. +func (tg *ThreadGroup) TTY() *TTY { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.tty +} diff --git a/pkg/sentry/kernel/uncaught_signal.proto b/pkg/sentry/kernel/uncaught_signal.proto new file mode 100644 index 000000000..0bdb062cb --- /dev/null +++ b/pkg/sentry/kernel/uncaught_signal.proto @@ -0,0 +1,37 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UncaughtSignal { + // Thread ID. + int32 tid = 1; + + // Process ID. + int32 pid = 2; + + // Registers at the time of the fault or signal. + Registers registers = 3; + + // Signal number. + int32 signal_number = 4; + + // The memory location which caused the fault (set if applicable, 0 + // otherwise). This will be set for SIGILL, SIGFPE, SIGSEGV, and SIGBUS. + uint64 fault_addr = 5; +} diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go new file mode 100644 index 000000000..8ccf04bd1 --- /dev/null +++ b/pkg/sentry/kernel/uts_namespace.go @@ -0,0 +1,101 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" +) + +// UTSNamespace represents a UTS namespace, a holder of two system identifiers: +// the hostname and domain name. +// +// +stateify savable +type UTSNamespace struct { + // mu protects all fields below. 
+ mu sync.Mutex `state:"nosave"` + hostName string + domainName string + + // userns is the user namespace associated with the UTSNamespace. + // Privileged operations on this UTSNamespace must have appropriate + // capabilities in userns. + // + // userns is immutable. + userns *auth.UserNamespace +} + +// NewUTSNamespace creates a new UTS namespace. +func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { + return &UTSNamespace{ + hostName: hostName, + domainName: domainName, + userns: userns, + } +} + +// UTSNamespace returns the task's UTS namespace. +func (t *Task) UTSNamespace() *UTSNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.utsns +} + +// HostName returns the host name of this UTS namespace. +func (u *UTSNamespace) HostName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.hostName +} + +// SetHostName sets the host name of this UTS namespace. +func (u *UTSNamespace) SetHostName(host string) { + u.mu.Lock() + defer u.mu.Unlock() + u.hostName = host +} + +// DomainName returns the domain name of this UTS namespace. +func (u *UTSNamespace) DomainName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.domainName +} + +// SetDomainName sets the domain name of this UTS namespace. +func (u *UTSNamespace) SetDomainName(domain string) { + u.mu.Lock() + defer u.mu.Unlock() + u.domainName = domain +} + +// UserNamespace returns the user namespace associated with this UTS namespace. +func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return u.userns +} + +// Clone makes a copy of this UTS namespace, associating the given user +// namespace. +func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return &UTSNamespace{ + hostName: u.hostName, + domainName: u.domainName, + userns: userns, + } +} diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go new file mode 100644 index 000000000..f1b3c212c --- /dev/null +++ b/pkg/sentry/kernel/vdso.go @@ -0,0 +1,148 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/usermem" +) + +// vdsoParams are the parameters exposed to the VDSO. +// +// They are exposed to the VDSO via a parameter page managed by VDSOParamPage, +// which also includes a sequence counter. +type vdsoParams struct { + monotonicReady uint64 + monotonicBaseCycles int64 + monotonicBaseRef int64 + monotonicFrequency uint64 + + realtimeReady uint64 + realtimeBaseCycles int64 + realtimeBaseRef int64 + realtimeFrequency uint64 +} + +// VDSOParamPage manages a VDSO parameter page. +// +// Its memory layout looks like: +// +// type page struct { +// // seq is a sequence counter that protects the fields below. 
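
Because the UTSNamespace accessors above are all exported, a caller such as sethostname or unshare emulation can be sketched briefly. newSandboxUTS below is a hypothetical helper, not an API of this package; only Clone, SetHostName, and auth.UserNamespace from the code above are assumed.

package utsexample

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

// newSandboxUTS gives a child its own UTS namespace with a fresh hostname,
// leaving the parent namespace untouched.
func newSandboxUTS(parent *kernel.UTSNamespace, userns *auth.UserNamespace) *kernel.UTSNamespace {
	child := parent.Clone(userns)
	child.SetHostName("sandbox")
	return child
}
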
+// seq uint64 +// vdsoParams +// } +// +// Everything in the struct is 8 bytes for easy alignment. +// +// It must be kept in sync with params in vdso/vdso_time.cc. +// +// +stateify savable +type VDSOParamPage struct { + // The parameter page is fr, allocated from mfp.MemoryFile(). + mfp pgalloc.MemoryFileProvider + fr platform.FileRange + + // seq is the current sequence count written to the page. + // + // A write is in progress if bit 1 of the counter is set. + // + // Timekeeper's updater goroutine may call Write before equality is + // checked in state_test_util tests, causing this field to change across + // save / restore. + seq uint64 +} + +// NewVDSOParamPage returns a VDSOParamPage. +// +// Preconditions: +// +// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does +// not take ownership of fr; it must remain allocated for the lifetime of the +// VDSOParamPage. +// +// * VDSOParamPage must be the only writer to fr. +// +// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{mfp: mfp, fr: fr} +} + +// access returns a mapping of the param page. +func (v *VDSOParamPage) access() (safemem.Block, error) { + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) + if err != nil { + return safemem.Block{}, err + } + if bs.NumBlocks() != 1 { + panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) + } + return bs.Head(), nil +} + +// incrementSeq increments the sequence counter in the param page. +func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { + next := v.seq + 1 + old, err := safemem.SwapUint64(paramPage, next) + if err != nil { + return err + } + + if old != v.seq { + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + } + + v.seq = next + return nil +} + +// Write updates the VDSO parameters. +// +// Write starts a write block, calls f to get the new parameters, writes +// out the new parameters, then ends the write block. +func (v *VDSOParamPage) Write(f func() vdsoParams) error { + paramPage, err := v.access() + if err != nil { + return err + } + + // Write begin. + next := v.seq + 1 + if next%2 != 1 { + panic("Out-of-order sequence count") + } + + err = v.incrementSeq(paramPage) + if err != nil { + return err + } + + // Get the new params. + p := f() + buf := binary.Marshal(nil, usermem.ByteOrder, p) + + // Skip the sequence counter. + if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { + panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) + } + + // Write end. + return v.incrementSeq(paramPage) +} diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go new file mode 100644 index 000000000..5640dd71d --- /dev/null +++ b/pkg/sentry/kernel/version.go @@ -0,0 +1,33 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
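
Write above is the writer half of a sequence-lock protocol: the counter is moved to an odd value before the parameters change and back to an even value afterwards. The real reader is the VDSO itself (vdso/vdso_time.cc); the Go sketch below only makes the reader's side of the protocol concrete and is not code from this change.

package seqexample

import "sync/atomic"

// readConsistent retries until it observes the protected data under an even,
// unchanged sequence count, i.e. with no write in progress and no write having
// raced with the read.
func readConsistent(seq *uint64, read func() []byte) []byte {
	for {
		begin := atomic.LoadUint64(seq)
		if begin%2 != 0 {
			continue // Odd count: a write is in progress.
		}
		data := read()
		if atomic.LoadUint64(seq) == begin {
			return data
		}
	}
}
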
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Version defines the application-visible system version.
+type Version struct {
+	// Operating system name (e.g. "Linux").
+	Sysname string
+
+	// Operating system release (e.g. "4.4-amd64").
+	Release string
+
+	// Operating system version. On Linux this takes the shape
+	// "#VERSION CONFIG_FLAGS TIMESTAMP"
+	// where:
+	// - VERSION is a sequence counter incremented on every successful build
+	// - CONFIG_FLAGS is a space-separated list of major enabled kernel features
+	//   (e.g. "SMP" and "PREEMPT")
+	// - TIMESTAMP is the build timestamp as returned by `date`
+	Version string
+}
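
For completeness, a Version value in the shape documented above might look like the following. The concrete strings are illustrative stand-ins, not necessarily the defaults gVisor reports.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

func main() {
	v := kernel.Version{
		Sysname: "Linux",
		Release: "4.4.0",
		Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016",
	}
	// These are the strings an application would see in the sysname, release,
	// and version fields of uname(2) inside the sandbox.
	fmt.Printf("%s %s %s\n", v.Sysname, v.Release, v.Version)
}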