Diffstat (limited to 'pkg/sentry/kernel')
93 files changed, 19474 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD new file mode 100644 index 000000000..62794cff5 --- /dev/null +++ b/pkg/sentry/kernel/BUILD @@ -0,0 +1,234 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "kernel_state", + srcs = [ + "abstract_socket_namespace.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "pending_signals.go", + "pending_signals_list.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_list.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_syscall.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + out = "kernel_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"], + package = "kernel", +) + +go_template_instance( + name = "pending_signals_list", + out = "pending_signals_list.go", + package = "kernel", + prefix = "pendingSignal", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*pendingSignal", + }, +) + +go_template_instance( + name = "process_group_list", + out = "process_group_list.go", + package = "kernel", + prefix = "processGroup", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*ProcessGroup", + }, +) + +go_template_instance( + name = "seqatomic_taskgoroutineschedinfo", + out = "seqatomic_taskgoroutineschedinfo.go", + package = "kernel", + suffix = "TaskGoroutineSchedInfo", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "TaskGoroutineSchedInfo", + }, +) + +go_template_instance( + name = "session_list", + out = "session_list.go", + package = "kernel", + prefix = "session", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Session", + }, +) + +go_template_instance( + name = "task_list", + out = "task_list.go", + package = "kernel", + prefix = "task", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Task", + }, +) + +go_library( + name = "kernel", + srcs = [ + "abstract_socket_namespace.go", + "context.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "kernel_state.go", + "pending_signals.go", + "pending_signals_list.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "seccomp.go", + "seqatomic_taskgoroutineschedinfo.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_acct.go", + "task_block.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_identity.go", + "task_list.go", + "task_log.go", + "task_net.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_stop.go", + "task_syscall.go", + "task_usermem.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", + visibility = ["//:sandbox"], + deps = [ + 
"//pkg/abi", + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/binary", + "//pkg/bits", + "//pkg/bpf", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/hostcpu", + "//pkg/sentry/inet", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/time", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "kernel_test", + size = "small", + srcs = [ + "fd_map_test.go", + "table_test.go", + "task_test.go", + "timekeeper_test.go", + ], + embed = [":kernel"], + deps = [ + "//pkg/abi", + "//pkg/sentry/arch", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/platform", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md new file mode 100644 index 000000000..3306780d6 --- /dev/null +++ b/pkg/sentry/kernel/README.md @@ -0,0 +1,106 @@ +This package contains: + +- A (partial) emulation of the "core Linux kernel", which governs task + execution and scheduling, system call dispatch, and signal handling. See + below for details. + +- The top-level interface for the sentry's Linux kernel emulation in general, + used by the `main` function of all versions of the sentry. This interface + revolves around the `Env` type (defined in `kernel.go`). + +# Background + +In Linux, each schedulable context is referred to interchangeably as a "task" or +"thread". Tasks can be divided into userspace and kernel tasks. In the sentry, +scheduling is managed by the Go runtime, so each schedulable context is a +goroutine; only "userspace" (application) contexts are referred to as tasks, and +represented by Task objects. (From this point forward, "task" refers to the +sentry's notion of a task unless otherwise specified.) + +At a high level, Linux application threads can be thought of as repeating a "run +loop": + +- Some amount of application code is executed in userspace. + +- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) + causes control flow to switch to the kernel. + +- Some amount of kernel code is executed in kernelspace, e.g. to handle the + cause of the trap. + +- The kernel "returns from the trap" into application code. + +Analogously, each task in the sentry is associated with a *task goroutine* that +executes that task's run loop (`Task.run` in `task_run.go`). However, the +sentry's task run loop differs in structure in order to support saving execution +state to, and resuming execution from, checkpoints. + +While in kernelspace, a Linux thread can be descheduled (cease execution) in a +variety of ways: + +- It can yield or be preempted, becoming temporarily descheduled but still + runnable. 
At present, the sentry delegates scheduling of runnable threads to + the Go runtime. + +- It can exit, becoming permanently descheduled. The sentry's equivalent is + returning from `Task.run`, terminating the task goroutine. + +- It can enter interruptible sleep, a state in which it can be woken by a + caller-defined wakeup or the receipt of a signal. In the sentry, interruptible + sleep (which is ambiguously referred to as *blocking*) is implemented by + making all events that can end blocking (including signal notifications) + communicated via Go channels and using `select` to multiplex wakeup sources; + see `task_block.go`. + +- It can enter uninterruptible sleep, a state in which it can only be woken by a + caller-defined wakeup. Killable sleep is a closely related variant in which + the task can also be woken by SIGKILL. (These definitions also include Linux's + "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" (`TASK_TRACED`) states.) + +To maximize compatibility with Linux, sentry checkpointing appears as a spurious +signal-delivery interrupt on all tasks; interrupted system calls return `EINTR` +or are automatically restarted as usual. However, these semantics require that +uninterruptible and killable sleeps do not appear to be interrupted. In other +words, the state of the task, including its progress through the interrupted +operation, must be preserved by checkpointing. For many such sleeps, the wakeup +condition is application-controlled, making it infeasible to wait for the sleep +to end before checkpointing. Instead, we must support checkpointing progress +through sleeping operations. + +# Implementation + +We break the task's control flow graph into *states*, delimited by: + +1. Points where uninterruptible and killable sleeps may occur. For example, +there exists a state boundary between signal dequeueing and signal delivery +because there may be an intervening ptrace signal-delivery-stop. + +2. Points where sleep-induced branches may "rejoin" normal execution. For +example, the syscall exit state exists because it can be reached immediately +following a synchronous syscall, or after a task that is sleeping in `execve()` +or `vfork()` resumes execution. + +3. Points containing large branches. This is strictly for organizational +purposes. For example, the state that processes interrupt-signaled conditions is +kept separate from the main "app" state to reduce the size of the latter. + +4. `SyscallReinvoke`, which does not correspond to anything in Linux, and exists +solely to serve the autosave feature. + +![dot -Tsvg -Goverlap=false -orun_states.svg run_states.dot](g3doc/run_states.dot "Task control flow graph") + +States before which a stop may occur are represented as implementations of the +`taskRunState` interface named `run(state)`, allowing them to be saved and +restored. States that cannot be immediately preceded by a stop are simply `Task` +methods named `do(state)`. + +Conditions that can require task goroutines to cease execution for unknown +lengths of time are called *stops*. Stops are divided into *internal stops*, +which are stops whose start and end conditions are implemented within the +sentry, and *external stops*, which are stops whose start and end conditions are +not known to the sentry. Hence all uninterruptible and killable sleeps are +internal stops, and the existence of a pending checkpoint operation is an +external stop. Internal stops are reified into instances of the `TaskStop` type, +while external stops are merely counted. 
The task run loop alternates between +checking for stops and advancing the task's state. This allows checkpointing to +hold tasks in a stopped state while waiting for all tasks in the system to stop. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go new file mode 100644 index 000000000..014c4a3bf --- /dev/null +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type abstractEndpoint struct { + ep unix.BoundEndpoint + wr *refs.WeakRef + name string + ns *AbstractSocketNamespace +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (e *abstractEndpoint) WeakRefGone() { + e.ns.mu.Lock() + if e.ns.endpoints[e.name].ep == e.ep { + delete(e.ns.endpoints, e.name) + } + e.ns.mu.Unlock() +} + +// AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +type AbstractSocketNamespace struct { + mu sync.Mutex `state:"nosave"` + + // Keeps mapping from name to endpoint. + endpoints map[string]abstractEndpoint +} + +// NewAbstractSocketNamespace returns a new AbstractSocketNamespace. +func NewAbstractSocketNamespace() *AbstractSocketNamespace { + return &AbstractSocketNamespace{ + endpoints: make(map[string]abstractEndpoint), + } +} + +// A boundEndpoint wraps a unix.BoundEndpoint to maintain a reference on its +// backing object. +type boundEndpoint struct { + unix.BoundEndpoint + rc refs.RefCounter +} + +// Release implements unix.BoundEndpoint.Release. +func (e *boundEndpoint) Release() { + e.rc.DecRef() + e.BoundEndpoint.Release() +} + +// BoundEndpoint retrieves the endpoint bound to the given name. The return +// value is nil if no endpoint was bound. +func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + return nil + } + + rc := ep.wr.Get() + if rc == nil { + delete(a.endpoints, name) + return nil + } + + return &boundEndpoint{ep.ep, rc} +} + +// Bind binds the given socket. +// +// When the last reference managed by rc is dropped, ep may be removed from the +// namespace. 
+func (a *AbstractSocketNamespace) Bind(name string, ep unix.BoundEndpoint, rc refs.RefCounter) error { + a.mu.Lock() + defer a.mu.Unlock() + + if ep, ok := a.endpoints[name]; ok { + if rc := ep.wr.Get(); rc != nil { + rc.DecRef() + return syscall.EADDRINUSE + } + } + + ae := abstractEndpoint{ep: ep, name: name, ns: a} + ae.wr = refs.NewWeakRef(rc, &ae) + a.endpoints[name] = ae + return nil +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD new file mode 100644 index 000000000..7f0680b88 --- /dev/null +++ b/pkg/sentry/kernel/auth/BUILD @@ -0,0 +1,73 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "auth_state", + srcs = [ + "credentials.go", + "id.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + out = "auth_state.go", + package = "auth", +) + +go_template_instance( + name = "id_map_range", + out = "id_map_range.go", + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_range", + types = { + "T": "uint32", + }, +) + +go_template_instance( + name = "id_map_set", + out = "id_map_set.go", + consts = { + "minDegree": "3", + }, + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint32", + "Range": "idMapRange", + "Value": "uint32", + "Functions": "idMapFunctions", + }, +) + +go_library( + name = "auth", + srcs = [ + "auth.go", + "auth_state.go", + "capability_set.go", + "context.go", + "credentials.go", + "id.go", + "id_map.go", + "id_map_functions.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go new file mode 100644 index 000000000..c49a6b852 --- /dev/null +++ b/pkg/sentry/kernel/auth/auth.go @@ -0,0 +1,22 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package auth implements an access control model that is a subset of Linux's. +// +// The auth package supports two kinds of access controls: user/group IDs and +// capabilities. Each resource in the security model is associated with a user +// namespace; "privileged" operations check that the operator's credentials +// have the required user/group IDs or capabilities within the user namespace +// of accessed resources. +package auth diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go new file mode 100644 index 000000000..5b8164c49 --- /dev/null +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" +) + +// A CapabilitySet is a set of capabilities implemented as a bitset. The zero +// value of CapabilitySet is a set containing no capabilities. +type CapabilitySet uint64 + +// AllCapabilities is a CapabilitySet containing all valid capabilities. +var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1 + +// CapabilitySetOf returns a CapabilitySet containing only the given +// capability. +func CapabilitySetOf(cp linux.Capability) CapabilitySet { + return CapabilitySet(bits.MaskOf64(int(cp))) +} + +// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. +func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { + var cs uint64 + for _, cp := range cps { + cs |= bits.MaskOf64(int(cp)) + } + return CapabilitySet(cs) +} + +// TaskCapabilities represents all the capability sets for a task. Each of these +// sets is explained in greater detail in capabilities(7). +type TaskCapabilities struct { + // Permitted is a limiting superset for the effective capabilities that + // the thread may assume. + PermittedCaps CapabilitySet + // Inheritable is a set of capabilities preserved across an execve(2). + InheritableCaps CapabilitySet + // Effective is the set of capabilities used by the kernel to perform + // permission checks for the thread. + EffectiveCaps CapabilitySet + // Bounding is a limiting superset for the capabilities that a thread + // can add to its inheritable set using capset(2). + BoundingCaps CapabilitySet + // Ambient is a set of capabilities that are preserved across an + // execve(2) of a program that is not privileged. + AmbientCaps CapabilitySet +} diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go new file mode 100644 index 000000000..914589b28 --- /dev/null +++ b/pkg/sentry/kernel/auth/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCredentials is a Context.Value key for Credentials. 
+ CtxCredentials contextID = iota +) + +// CredentialsFromContext returns a copy of the Credentials used by ctx, or a +// set of Credentials with no capabilities if ctx does not have Credentials. +func CredentialsFromContext(ctx context.Context) *Credentials { + if v := ctx.Value(CtxCredentials); v != nil { + return v.(*Credentials) + } + return NewAnonymousCredentials() +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go new file mode 100644 index 000000000..b832b28fe --- /dev/null +++ b/pkg/sentry/kernel/auth/credentials.go @@ -0,0 +1,227 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials contains information required to authorize privileged operations +// in a user namespace. +type Credentials struct { + // Real/effective/saved user/group IDs in the root user namespace. None of + // these should ever be NoID. + RealKUID KUID + EffectiveKUID KUID + SavedKUID KUID + RealKGID KGID + EffectiveKGID KGID + SavedKGID KGID + + // Filesystem user/group IDs are not implemented. "... setfsuid() is + // nowadays unneeded and should be avoided in new applications (likewise + // for setfsgid(2))." - setfsuid(2) + + // Supplementary groups used by set/getgroups. + // + // ExtraKGIDs slices are immutable, allowing multiple Credentials with the + // same ExtraKGIDs to share the same slice. + ExtraKGIDs []KGID + + // The capability sets applicable to this set of credentials. + PermittedCaps CapabilitySet + InheritableCaps CapabilitySet + EffectiveCaps CapabilitySet + BoundingCaps CapabilitySet + // Ambient capabilities are not introduced until Linux 4.3. + + // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be + // maintained after a switch from root user to non-root user via setuid(). + KeepCaps bool + + // The user namespace associated with the owner of the credentials. + UserNamespace *UserNamespace +} + +// NewAnonymousCredentials returns a set of credentials with no capabilities in +// any user namespace. +func NewAnonymousCredentials() *Credentials { + // Create a new root user namespace. Since the new namespace's owner is + // KUID 0 and the returned credentials have non-zero KUID/KGID, the + // returned credentials do not have any capabilities in the new namespace. + // Since the new namespace is not part of any existing user namespace + // hierarchy, the returned credentials do not have any capabilities in any + // other namespace. + return &Credentials{ + RealKUID: NobodyKUID, + EffectiveKUID: NobodyKUID, + SavedKUID: NobodyKUID, + RealKGID: NobodyKGID, + EffectiveKGID: NobodyKGID, + SavedKGID: NobodyKGID, + UserNamespace: NewRootUserNamespace(), + } +} + +// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. +// global root) in user namespace ns. 
+func NewRootCredentials(ns *UserNamespace) *Credentials { + // I can't find documentation for this anywhere, but it's correct for the + // inheritable capability set to be initially empty (the capabilities test + // checks for this property). + return &Credentials{ + RealKUID: RootKUID, + EffectiveKUID: RootKUID, + SavedKUID: RootKUID, + RealKGID: RootKGID, + EffectiveKGID: RootKGID, + SavedKGID: RootKGID, + PermittedCaps: AllCapabilities, + EffectiveCaps: AllCapabilities, + BoundingCaps: AllCapabilities, + UserNamespace: ns, + } +} + +// NewUserCredentials returns a set of credentials based on the given UID, GIDs, +// and capabilities in a given namespace. If all arguments are their zero +// values, this returns the same credentials as NewRootCredentials. +func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { + creds := NewRootCredentials(ns) + + // Set the UID. + uid := kuid + creds.RealKUID = uid + creds.EffectiveKUID = uid + creds.SavedKUID = uid + + // Set GID. + gid := kgid + creds.RealKGID = gid + creds.EffectiveKGID = gid + creds.SavedKGID = gid + + // Set additional GIDs. + creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) + + // Set capabilities. If capabilities aren't specified, we default to + // all capabilities. + if capabilities != nil { + creds.PermittedCaps = capabilities.PermittedCaps + creds.EffectiveCaps = capabilities.EffectiveCaps + creds.BoundingCaps = capabilities.BoundingCaps + creds.InheritableCaps = capabilities.InheritableCaps + // // TODO: Support ambient capabilities. + } else { + // If no capabilities are specified, grant the same capabilites + // that NewRootCredentials does. + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + creds.BoundingCaps = AllCapabilities + } + + return creds +} + +// Fork generates an identical copy of a set of credentials. +func (c *Credentials) Fork() *Credentials { + nc := new(Credentials) + *nc = *c // Copy-by-value; this is legal for all fields. + return nc +} + +// InGroup returns true if c is in group kgid. Compare Linux's +// kernel/groups.c:in_group_p(). +func (c *Credentials) InGroup(kgid KGID) bool { + if c.EffectiveKGID == kgid { + return true + } + for _, extraKGID := range c.ExtraKGIDs { + if extraKGID == kgid { + return true + } + } + return false +} + +// HasCapabilityIn returns true if c has capability cp in ns. +func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { + for { + // "1. A process has a capability inside a user namespace if it is a member + // of that namespace and it has the capability in its effective capability + // set." - user_namespaces(7) + if c.UserNamespace == ns { + return CapabilitySetOf(cp)&c.EffectiveCaps != 0 + } + // "3. ... A process that resides in the parent of the user namespace and + // whose effective user ID matches the owner of the namespace has all + // capabilities in the namespace." + if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { + return true + } + // "2. If a process has a capability in a user namespace, then it has that + // capability in all child (and further removed descendant) namespaces as + // well." + if ns.parent == nil { + return false + } + ns = ns.parent + } +} + +// HasCapability returns true if c has capability cp in its user namespace. 
+func (c *Credentials) HasCapability(cp linux.Capability) bool { + return c.HasCapabilityIn(cp, c.UserNamespace) +} + +// UseUID checks that c can use uid in its user namespace, then translates it +// to the root user namespace. +// +// The checks UseUID does are common, but you should verify that it's doing +// exactly what you want. +func (c *Credentials) UseUID(uid UID) (KUID, error) { + // uid must be mapped. + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return NoID, syserror.EINVAL + } + // If c has CAP_SETUID, then it can use any UID in its user namespace. + if c.HasCapability(linux.CAP_SETUID) { + return kuid, nil + } + // Otherwise, c must already have the UID as its real, effective, or saved + // set-user-ID. + if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { + return kuid, nil + } + return NoID, syserror.EPERM +} + +// UseGID checks that c can use gid in its user namespace, then translates it +// to the root user namespace. +func (c *Credentials) UseGID(gid GID) (KGID, error) { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return NoID, syserror.EINVAL + } + if c.HasCapability(linux.CAP_SETGID) { + return kgid, nil + } + if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { + return kgid, nil + } + return NoID, syserror.EPERM +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go new file mode 100644 index 000000000..37522b018 --- /dev/null +++ b/pkg/sentry/kernel/auth/id.go @@ -0,0 +1,121 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" +) + +// UID is a user ID in an unspecified user namespace. +type UID uint32 + +// GID is a group ID in an unspecified user namespace. +type GID uint32 + +// In the root user namespace, user/group IDs have a 1-to-1 relationship with +// the users/groups they represent. In other user namespaces, this is not the +// case; for example, two different unmapped users may both "have" the overflow +// UID. This means that it is generally only valid to compare user and group +// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such +// IDs to emphasize this distinction. ("k" is for "key", as in "unique key". +// Linux also uses the prefix "k", but I think they mean "kernel".) + +// KUID is a user ID in the root user namespace. +type KUID uint32 + +// KGID is a group ID in the root user namespace. +type KGID uint32 + +const ( + // NoID is uint32(-1). -1 is consistently used as a special value, in Linux + // and by extension in the auth package, to mean "no ID": + // + // - ID mapping returns -1 if the ID is not mapped. + // + // - Most set*id() syscalls accept -1 to mean "do not change this ID". + NoID = math.MaxUint32 + + // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The + // "overflow UID" is usually [1] used when translating a user ID between + // namespaces fails because the ID is not mapped. 
(We don't implement this + // file, so the overflow UID is constant.) + // + // [1] "There is one notable case where unmapped user and group IDs are not + // converted to the corresponding overflow ID value. When viewing a uid_map + // or gid_map file in which there is no mapping for the second field, that + // field is displayed as 4294967295 (-1 as an unsigned integer);" - + // user_namespaces(7) + OverflowUID = UID(65534) + OverflowGID = GID(65534) + + // NobodyKUID is the user ID usually reserved for the least privileged user + // "nobody". + NobodyKUID = KUID(65534) + NobodyKGID = KGID(65534) + + // RootKUID is the user ID usually used for the most privileged user "root". + RootKUID = KUID(0) + RootKGID = KGID(0) + RootUID = UID(0) + RootGID = GID(0) +) + +// Ok returns true if uid is not -1. +func (uid UID) Ok() bool { + return uid != NoID +} + +// Ok returns true if gid is not -1. +func (gid GID) Ok() bool { + return gid != NoID +} + +// Ok returns true if kuid is not -1. +func (kuid KUID) Ok() bool { + return kuid != NoID +} + +// Ok returns true if kgid is not -1. +func (kgid KGID) Ok() bool { + return kgid != NoID +} + +// OrOverflow returns uid if it is valid and the overflow UID otherwise. +func (uid UID) OrOverflow() UID { + if uid.Ok() { + return uid + } + return OverflowUID +} + +// OrOverflow returns gid if it is valid and the overflow GID otherwise. +func (gid GID) OrOverflow() GID { + if gid.Ok() { + return gid + } + return OverflowGID +} + +// In translates kuid into user namespace ns. If kuid is not mapped in ns, In +// returns NoID. +func (kuid KUID) In(ns *UserNamespace) UID { + return ns.MapFromKUID(kuid) +} + +// In translates kgid into user namespace ns. If kgid is not mapped in ns, In +// returns NoID. +func (kgid KGID) In(ns *UserNamespace) GID { + return ns.MapFromKGID(kgid) +} diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go new file mode 100644 index 000000000..6adb33530 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map.go @@ -0,0 +1,283 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. +func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { + if ns.parent == nil { + return UID(kuid) + } + return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) +} + +// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. +func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { + if ns.parent == nil { + return GID(kgid) + } + return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) +} + +// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. 
+func (ns *UserNamespace) MapToKUID(uid UID) KUID { + if ns.parent == nil { + return KUID(uid) + } + return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) +} + +// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. +func (ns *UserNamespace) MapToKGID(gid GID) KGID { + if ns.parent == nil { + return KGID(gid) + } + return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) +} + +func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { + if id == NoID { + return NoID + } + ns.mu.Lock() + defer ns.mu.Unlock() + if it := m.FindSegment(id); it.Ok() { + return it.Value() + (id - it.Start()) + } + return NoID +} + +// allIDsMapped returns true if all IDs in the range [start, end) are mapped in +// m. +// +// Preconditions: end >= start. +func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { + ns.mu.Lock() + defer ns.mu.Unlock() + return m.SpanRange(idMapRange{start, end}) == end-start +} + +// An IDMapEntry represents a mapping from a range of contiguous IDs in a user +// namespace to an equally-sized range of contiguous IDs in the namespace's +// parent. +type IDMapEntry struct { + // FirstID is the first ID in the range in the namespace. + FirstID uint32 + + // FirstParentID is the first ID in the range in the parent namespace. + FirstParentID uint32 + + // Length is the number of IDs in the range. + Length uint32 +} + +// SetUIDMap instructs ns to translate UIDs as specified by entries. +// +// Note: SetUIDMap does not place an upper bound on the number of entries, but +// Linux does. This restriction is implemented in SetUIDMap's caller, the +// implementation of /proc/[pid]/uid_map. +func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + // "After the creation of a new user namespace, the uid_map file of *one* + // of the processes in the namespace may be written to *once* to define the + // mapping of user IDs in the new user namespace. An attempt to write more + // than once to a uid_map file in a user namespace fails with the error + // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) + if !ns.uidMapFromParent.IsEmpty() { + return syserror.EPERM + } + // "At least one line must be written to the file." + if len(entries) == 0 { + return syserror.EINVAL + } + // """ + // In order for a process to write to the /proc/[pid]/uid_map + // (/proc/[pid]/gid_map) file, all of the following requirements must be + // met: + // + // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability + // in the user namespace of the process pid. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { + return syserror.EPERM + } + // "2. The writing process must either be in the user namespace of the process + // pid or be in the parent user namespace of the process pid." + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + // """ + // 3. (see trySetUIDMap) + // + // 4. One of the following two cases applies: + // + // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability + // in the parent user namespace. 
+ // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { + // """ + // * Or otherwise all of the following restrictions apply: + // + // + The data written to uid_map (gid_map) must consist of a single line + // that maps the writing process' effective user ID (group ID) in the + // parent user namespace to a user ID (group ID) in the user namespace. + // """ + if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { + return syserror.EPERM + } + // """ + // + The writing process must have the same effective user ID as the + // process that created the user namespace. + // """ + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + } + // trySetUIDMap leaves data in maps if it fails. + if err := ns.trySetUIDMap(entries); err != nil { + ns.uidMapFromParent.RemoveAll() + ns.uidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { + for _, e := range entries { + // Determine upper bounds and check for overflow. This implicitly + // checks for NoID. + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + // "3. The mapped user IDs (group IDs) must in turn have a mapping in + // the parent user namespace." + // Only the root namespace has a nil parent, and root is assigned + // mappings when it's created, so SetUIDMap would have returned EPERM + // without reaching this point if ns is root. + if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + // If either of these Adds fail, we have an overlapping range. + if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// SetGIDMap instructs ns to translate GIDs as specified by entries. +func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + if !ns.gidMapFromParent.IsEmpty() { + return syserror.EPERM + } + if len(entries) == 0 { + return syserror.EINVAL + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { + return syserror.EPERM + } + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { + if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { + return syserror.EPERM + } + // It's correct for this to still be UID. + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + // "In the case of gid_map, use of the setgroups(2) system call must + // first be denied by writing "deny" to the /proc/[pid]/setgroups file + // (see below) before writing to gid_map." (This file isn't implemented + // in the version of Linux we're emulating; see comment in + // UserNamespace.) 
+ } + if err := ns.trySetGIDMap(entries); err != nil { + ns.gidMapFromParent.RemoveAll() + ns.gidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { + for _, e := range entries { + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// UIDMap returns the user ID mappings configured for ns. If no mappings +// have been configured, UIDMap returns nil. +func (ns *UserNamespace) UIDMap() []IDMapEntry { + return ns.getIDMap(&ns.uidMapToParent) +} + +// GIDMap returns the group ID mappings configured for ns. If no mappings +// have been configured, GIDMap returns nil. +func (ns *UserNamespace) GIDMap() []IDMapEntry { + return ns.getIDMap(&ns.gidMapToParent) +} + +func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { + ns.mu.Lock() + defer ns.mu.Unlock() + var entries []IDMapEntry + for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { + entries = append(entries, IDMapEntry{ + FirstID: it.Start(), + FirstParentID: it.Value(), + Length: it.Range().Length(), + }) + } + return entries +} diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go new file mode 100644 index 000000000..889291d96 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -0,0 +1,45 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +// idMapFunctions "implements" generic interface segment.Functions for +// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one +// user namespace to non-overlapping ranges of contiguous IDs in another user +// namespace. Each such ID mapping is implemented as a range-to-value mapping +// in the set such that [range.Start(), range.End()) => [value, value + +// range.Length()). +type idMapFunctions struct{} + +func (idMapFunctions) MinKey() uint32 { + return 0 +} + +func (idMapFunctions) MaxKey() uint32 { + return NoID +} + +func (idMapFunctions) ClearValue(*uint32) {} + +func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { + // Mapped ranges have to be contiguous. 
+ if val1+r1.Length() != val2 { + return 0, false + } + return val1, true +} + +func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { + return val, val + (split - r.Start) +} diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go new file mode 100644 index 000000000..0980aeadf --- /dev/null +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -0,0 +1,130 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// A UserNamespace represents a user namespace. See user_namespaces(7) for +// details. +type UserNamespace struct { + // parent is this namespace's parent. If this is the root namespace, parent + // is nil. The parent pointer is immutable. + parent *UserNamespace + + // owner is the effective UID of the namespace's creator in the root + // namespace. owner is immutable. + owner KUID + + // mu protects the following fields. + // + // If mu will be locked in multiple UserNamespaces, it must be locked in + // descendant namespaces before ancestors. + mu sync.Mutex `state:"nosave"` + + // Mappings of user/group IDs between this namespace and its parent. + // + // All ID maps, once set, cannot be changed. This means that successful + // UID/GID translations cannot be racy. + uidMapFromParent idMapSet + uidMapToParent idMapSet + gidMapFromParent idMapSet + gidMapToParent idMapSet + + // TODO: Consider supporting disabling setgroups(2), which "was + // added in Linux 3.19, but was backported to many earlier stable kernel + // series, because it addresses a security issue" - user_namespaces(7). (It + // was not backported to 3.11.10, which we are currently imitating.) +} + +// NewRootUserNamespace returns a UserNamespace that is appropriate for a +// system's root user namespace. +func NewRootUserNamespace() *UserNamespace { + var ns UserNamespace + // """ + // The initial user namespace has no parent namespace, but, for + // consistency, the kernel provides dummy user and group ID mapping files + // for this namespace. Looking at the uid_map file (gid_map is the same) + // from a shell in the initial namespace shows: + // + // $ cat /proc/$$/uid_map + // 0 0 4294967295 + // """ - user_namespaces(7) + for _, m := range []*idMapSet{ + &ns.uidMapFromParent, + &ns.uidMapToParent, + &ns.gidMapFromParent, + &ns.gidMapToParent, + } { + if !m.Add(idMapRange{0, math.MaxUint32}, 0) { + panic("Failed to insert into empty ID map") + } + } + return &ns +} + +// Root returns the root of the user namespace tree containing ns. +func (ns *UserNamespace) Root() *UserNamespace { + for ns.parent != nil { + ns = ns.parent + } + return ns +} + +// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user +// namespaces." 
- user_namespaces(7) +const maxUserNamespaceDepth = 32 + +func (ns *UserNamespace) depth() int { + var i int + for ns != nil { + i++ + ns = ns.parent + } + return i +} + +// NewChildUserNamespace returns a new user namespace created by a caller with +// credentials c. +func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { + if c.UserNamespace.depth() >= maxUserNamespaceDepth { + // "... Calls to unshare(2) or clone(2) that would cause this limit to + // be exceeded fail with the error EUSERS." - user_namespaces(7) + return nil, syserror.EUSERS + } + // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective + // user ID or the effective group ID of the caller does not have a mapping + // in the parent namespace (see user_namespaces(7))." - clone(2) + // "CLONE_NEWUSER requires that the user ID and group ID of the calling + // process are mapped to user IDs and group IDs in the user namespace of + // the calling process at the time of the call." - unshare(2) + if !c.EffectiveKUID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + if !c.EffectiveKGID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + return &UserNamespace{ + parent: c.UserNamespace, + owner: c.EffectiveKUID, + // "When a user namespace is created, it starts without a mapping of + // user IDs (group IDs) to the parent user namespace." - + // user_namespaces(7) + }, nil +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go new file mode 100644 index 000000000..261ca6f7a --- /dev/null +++ b/pkg/sentry/kernel/context.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCanTrace is a Context.Value key for a function with the same + // signature and semantics as kernel.Task.CanTrace. + CtxCanTrace contextID = iota + + // CtxKernel is a Context.Value key for a Kernel. + CtxKernel + + // CtxPIDNamespace is a Context.Value key for a PIDNamespace. + CtxPIDNamespace + + // CtxTask is a Context.Value key for a Task. + CtxTask + + // CtxUTSNamespace is a Context.Value key for a UTSNamespace. + CtxUTSNamespace + + // CtxIPCNamespace is a Context.Value key for a IPCNamespace. + CtxIPCNamespace +) + +// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense +// as kernel.Task.CanTrace. +func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { + if v := ctx.Value(CtxCanTrace); v != nil { + return v.(func(*Task, bool) bool)(t, attach) + } + return false +} + +// KernelFromContext returns the Kernel in which ctx is executing, or nil if +// there is no such Kernel. 
+func KernelFromContext(ctx context.Context) *Kernel { + if v := ctx.Value(CtxKernel); v != nil { + return v.(*Kernel) + } + return nil +} + +// PIDNamespaceFromContext returns the PID namespace in which ctx is executing, +// or nil if there is no such PID namespace. +func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { + if v := ctx.Value(CtxPIDNamespace); v != nil { + return v.(*PIDNamespace) + } + return nil +} + +// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, +// or nil if there is no such UTS namespace. +func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { + if v := ctx.Value(CtxUTSNamespace); v != nil { + return v.(*UTSNamespace) + } + return nil +} + +// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, +// or nil if there is no such IPC namespace. +func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { + if v := ctx.Value(CtxIPCNamespace); v != nil { + return v.(*IPCNamespace) + } + return nil +} + +// TaskFromContext returns the Task associated with ctx, or nil if there is no +// such Task. +func TaskFromContext(ctx context.Context) *Task { + if v := ctx.Value(CtxTask); v != nil { + return v.(*Task) + } + return nil +} + +// AsyncContext returns a context.Context that may be used by goroutines that +// do work on behalf of t and therefore share its contextual values, but are +// not t's task goroutine (e.g. asynchronous I/O). +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Value implements context.Context.Value. 
+func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD new file mode 100644 index 000000000..04651d961 --- /dev/null +++ b/pkg/sentry/kernel/epoll/BUILD @@ -0,0 +1,52 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "epoll_autogen_state", + srcs = [ + "epoll.go", + "epoll_state.go", + ], + out = "epoll_autogen_state.go", + package = "epoll", +) + +go_library( + name = "epoll", + srcs = [ + "epoll.go", + "epoll_autogen_state.go", + "epoll_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/ilist", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/waiter", + ], +) + +go_test( + name = "epoll_test", + size = "small", + srcs = [ + "epoll_test.go", + ], + embed = [":epoll"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go new file mode 100644 index 000000000..b572fcd7e --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -0,0 +1,466 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epoll provides an implementation of Linux's IO event notification +// facility. See epoll(7) for more details. +package epoll + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Event describes the event mask that was observed and the user data to be +// returned when one of the events occurs. It has this format to match the linux +// format to avoid extra copying/allocation when writing events to userspace. +type Event struct { + // Events is the event mask containing the set of events that have been + // observed on an entry. + Events uint32 + + // Data is an opaque 64-bit value provided by the caller when adding the + // entry, and returned to the caller when the entry reports an event. + Data [2]int32 +} + +// EntryFlags is a bitmask that holds an entry's flags. +type EntryFlags int + +// Valid entry flags. +const ( + OneShot EntryFlags = 1 << iota + EdgeTriggered +) + +// FileIdentifier identifies a file. 
We cannot use just the FD because it could +// potentially be reassigned. We also cannot use just the file pointer because +// it is possible to have multiple entries for the same file object as long as +// they are created with different FDs (i.e., the FDs point to the same file). +type FileIdentifier struct { + File *fs.File + Fd kdefs.FD +} + +// pollEntry holds all the state associated with an event poll entry, that is, +// a file being observed by an event poll object. +type pollEntry struct { + ilist.Entry + file *refs.WeakRef `state:"manual"` + id FileIdentifier `state:"wait"` + userData [2]int32 + waiter waiter.Entry `state:"manual"` + mask waiter.EventMask + flags EntryFlags + + epoll *EventPoll + + // We cannot save the current list pointer as it points into EventPoll + // struct, while state framework currently does not support such + // in-struct pointers. Instead, EventPoll will properly set this field + // in its loading logic. + curList *ilist.List `state:"nosave"` +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +// weakReferenceGone is called when the file in the weak reference is destroyed. +// The poll entry is removed in response to this. +func (p *pollEntry) WeakRefGone() { + p.epoll.RemoveEntry(p.id) +} + +// EventPoll holds all the state associated with an event poll object, that is, +// collection of files to observe and their current state. +type EventPoll struct { + fsutil.PipeSeek `state:"zerovalue"` + fsutil.NotDirReaddir `state:"zerovalue"` + fsutil.NoFsync `state:"zerovalue"` + fsutil.NoopFlush `state:"zerovalue"` + fsutil.NoMMap `state:"zerovalue"` + fsutil.NoIoctl `state:"zerovalue"` + + // Wait queue is used to notify interested parties when the event poll + // object itself becomes readable or writable. + waiter.Queue + + // files is the map of all the files currently being observed, it is + // protected by mu. + mu sync.Mutex `state:"nosave"` + files map[FileIdentifier]*pollEntry + + // listsMu protects manipulation of the lists below. It needs to be a + // different lock to avoid circular lock acquisition order involving + // the wait queue mutexes and mu. The full order is mu, observed file + // wait queue mutex, then listsMu; this allows listsMu to be acquired + // when readyCallback is called. + // + // An entry is always in one of the following lists: + // readyList -- when there's a chance that it's ready to have + // events delivered to epoll waiters. Given that being + // ready is a transient state, the Readiness() and + // readEvents() functions always call the entry's file + // Readiness() function to confirm it's ready. + // waitingList -- when there's no chance that the entry is ready, + // so it's waiting for the readyCallback to be called + // on it before it gets moved to the readyList. + // disabledList -- when the entry is disabled. This happens when + // a one-shot entry gets delivered via readEvents(). + listsMu sync.Mutex `state:"nosave"` + readyList ilist.List + waitingList ilist.List + disabledList ilist.List +} + +// cycleMu is used to serialize all the cycle checks. This is only used when +// an event poll file is added as an entry to another event poll. Such checks +// are serialized to avoid lock acquisition order inversion: if a thread is +// adding A to B, and another thread is adding B to A, each would acquire A's +// and B's mutexes in reverse order, and could cause deadlocks. Having this +// lock prevents this by allowing only one check at a time to happen. 
+// +// We do the cycle check to prevent callers from introducing potentially +// infinite recursions. If a caller were to add A to B and then B to A, for +// event poll A to know if it's readable, it would need to check event poll B, +// which in turn would need event poll A and so on indefinitely. +var cycleMu sync.Mutex + +// NewEventPoll allocates and initializes a new event poll object. +func NewEventPoll(ctx context.Context) *fs.File { + // name matches fs/eventpoll.c:epoll_create1. + dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ + files: make(map[FileIdentifier]*pollEntry), + }) +} + +// Release implements fs.FileOperations.Release. +func (e *EventPoll) Release() { + // We need to take the lock now because files may be attempting to + // remove entries in parallel if they get destroyed. + e.mu.Lock() + defer e.mu.Unlock() + + // Go through all entries and clean up. + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + entry.file.Drop() + } +} + +// Read implements fs.FileOperations.Read. +func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// Write implements fs.FileOperations.Write. +func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// eventsAvailable determines if 'e' has events available for delivery. +func (e *EventPoll) eventsAvailable() bool { + e.listsMu.Lock() + + for it := e.readyList.Front(); it != nil; { + entry := it.(*pollEntry) + it = it.Next() + + // If the entry is ready, we know 'e' has at least one entry + // ready for delivery. + ready := entry.id.File.Readiness(entry.mask) + if ready != 0 { + e.listsMu.Unlock() + return true + } + + // Entry is not ready, so move it to waiting list. + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } + + e.listsMu.Unlock() + + return false +} + +// Readiness determines if the event poll object is currently readable (i.e., +// if there are pending events for delivery). +func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + if (mask&waiter.EventIn) != 0 && e.eventsAvailable() { + ready |= waiter.EventIn + } + + return ready +} + +// ReadEvents returns up to max available events. +func (e *EventPoll) ReadEvents(max int) []Event { + var local ilist.List + var ret []Event + + e.listsMu.Lock() + + // Go through all entries we believe may be ready. + for it := e.readyList.Front(); it != nil && len(ret) < max; { + entry := it.(*pollEntry) + it = it.Next() + + // Check the entry's readiness. It it's not really ready, we + // just put it back in the waiting list and move on to the next + // entry. + ready := entry.id.File.Readiness(entry.mask) & entry.mask + if ready == 0 { + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + + continue + } + + // Add event to the array that will be returned to caller. + ret = append(ret, Event{ + Events: uint32(ready), + Data: entry.userData, + }) + + // The entry is consumed, so we must move it to the disabled + // list in case it's one-shot, or back to the wait list if it's + // edge-triggered. 
If it's neither, we leave it in the ready + // list so that its readiness can be checked the next time + // around; however, we must move it to the end of the list so + // that other events can be delivered as well. + e.readyList.Remove(entry) + if entry.flags&OneShot != 0 { + e.disabledList.PushBack(entry) + entry.curList = &e.disabledList + } else if entry.flags&EdgeTriggered != 0 { + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } else { + local.PushBack(entry) + } + } + + e.readyList.PushBackList(&local) + + e.listsMu.Unlock() + + return ret +} + +// readyCallback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +type readyCallback struct{} + +// Callback implements waiter.EntryCallback.Callback. +func (*readyCallback) Callback(w *waiter.Entry) { + entry := w.Context.(*pollEntry) + e := entry.epoll + + e.listsMu.Lock() + + if entry.curList == &e.waitingList { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList + + e.Notify(waiter.EventIn) + } + + e.listsMu.Unlock() +} + +// initEntryReadiness initializes the entry's state with regards to its +// readiness by placing it in the appropriate list and registering for +// notifications. +func (e *EventPoll) initEntryReadiness(entry *pollEntry) { + // A new entry starts off in the waiting list. + e.listsMu.Lock() + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + e.listsMu.Unlock() + + // Register for event notifications. + f := entry.id.File + f.EventRegister(&entry.waiter, entry.mask) + + // Check if the file happens to already be in a ready state. + ready := f.Readiness(entry.mask) & entry.mask + if ready != 0 { + (*readyCallback).Callback(nil, &entry.waiter) + } +} + +// observes checks if event poll object e is directly or indirectly observing +// event poll object ep. It uses a bounded recursive depth-first search. +func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { + // If we reached the maximum depth, we'll consider that we found it + // because we don't want to allow chains that are too long. + if depthLeft <= 0 { + return true + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Go through each observed file and check if it is or observes ep. + for id := range e.files { + f, ok := id.File.FileOperations.(*EventPoll) + if !ok { + continue + } + + if f == ep || f.observes(ep, depthLeft-1) { + return true + } + } + + return false +} + +// AddEntry adds a new file to the collection of files observed by e. +func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + // Acquire cycle check lock if another event poll is being added. + ep, ok := id.File.FileOperations.(*EventPoll) + if ok { + cycleMu.Lock() + defer cycleMu.Unlock() + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file already has an entry. + if _, ok := e.files[id]; ok { + return syscall.EEXIST + } + + // Check if a cycle would be created. We use 4 as the limit because + // that's the value used by linux and we want to emulate it. + if ep != nil { + if e == ep { + return syscall.EINVAL + } + + if ep.observes(e, 4) { + return syscall.ELOOP + } + } + + // Create new entry and add it to map. + // + // N.B. Even though we are creating a weak reference here, we know it + // won't trigger a callback because we hold a reference to the file + // throughout the execution of this function. 
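+	//
+	// If the file is later destroyed (its last strong reference dropped),
+	// the weak reference fires WeakRefGone, which removes this entry via
+	// RemoveEntry.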
+ entry := &pollEntry{ + id: id, + userData: data, + epoll: e, + flags: flags, + waiter: waiter.Entry{Callback: &readyCallback{}}, + mask: mask, + } + entry.waiter.Context = entry + e.files[id] = entry + entry.file = refs.NewWeakRef(id.File, entry) + + // Initialize the readiness state of the new entry. + e.initEntryReadiness(entry) + + return nil +} + +// UpdateEntry updates the flags, mask and user data associated with a file that +// is already part of the collection of observed files. +func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister the old mask and remove entry from the list it's in, so + // readyCallback is guaranteed to not be called on this entry anymore. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove entry from whatever list it's in. This ensure that no other + // threads have access to this entry as the only way left to find it + // is via e.files, but we hold e.mu, which prevents that. + e.listsMu.Lock() + entry.curList.Remove(entry) + e.listsMu.Unlock() + + // Initialize new readiness state. + entry.flags = flags + entry.mask = mask + entry.userData = data + e.initEntryReadiness(entry) + + return nil +} + +// RemoveEntry a files from the collection of observed files. +func (e *EventPoll) RemoveEntry(id FileIdentifier) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister from file first so that no concurrent attempts will be + // made to manipulate the file. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove from the current list. + e.listsMu.Lock() + entry.curList.Remove(entry) + entry.curList = nil + e.listsMu.Unlock() + + // Remove file from map, and drop weak reference. + delete(e.files, id) + entry.file.Drop() + + return nil +} + +// UnregisterEpollWaiters removes the epoll waiter objects from the waiting +// queues. This is different from Release() as the file is not dereferenced. +func (e *EventPoll) UnregisterEpollWaiters() { + e.mu.Lock() + defer e.mu.Unlock() + + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go new file mode 100644 index 000000000..dabb32f49 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// afterLoad is invoked by stateify. 
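+// It rebuilds the fields marked state:"manual": the waiter entry and its
+// readyCallback, the weak reference to the file, and the registration for
+// event notifications on the file.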
+func (p *pollEntry) afterLoad() { + p.waiter = waiter.Entry{Callback: &readyCallback{}} + p.waiter.Context = p + p.file = refs.NewWeakRef(p.id.File, p) + p.id.File.EventRegister(&p.waiter, p.mask) +} + +// afterLoad is invoked by stateify. +func (e *EventPoll) afterLoad() { + e.listsMu.Lock() + defer e.listsMu.Unlock() + + for _, ls := range []*ilist.List{&e.waitingList, &e.readyList, &e.disabledList} { + for it := ls.Front(); it != nil; it = it.Next() { + it.(*pollEntry).curList = ls + } + } + + for it := e.waitingList.Front(); it != nil; it = it.Next() { + p := it.(*pollEntry) + if p.id.File.Readiness(p.mask) != 0 { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList + e.Notify(waiter.EventIn) + } + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go new file mode 100644 index 000000000..bc869fc13 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestFileDestroyed(t *testing.T) { + f := filetest.NewTestFile(t) + id := FileIdentifier{f, 12} + + efile := NewEventPoll(contexttest.Context(t)) + e := efile.FileOperations.(*EventPoll) + if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { + t.Fatalf("addEntry failed: %v", err) + } + + // Check that we get an event reported twice in a row. + evt := e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + evt = e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + // Destroy the file. Check that we get no more events. 
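+	// Dropping the last reference destroys the file, which fires the entry's
+	// WeakRefGone callback and removes it from the poll object.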
+ f.DecRef() + + evt = e.ReadEvents(1) + if len(evt) != 0 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 0, len(evt)) + } + +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD new file mode 100644 index 000000000..2d5a3c693 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -0,0 +1,46 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "eventfd_state", + srcs = [ + "eventfd.go", + ], + out = "eventfd_state.go", + package = "eventfd", +) + +go_library( + name = "eventfd", + srcs = [ + "eventfd.go", + "eventfd_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + embed = [":eventfd"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go new file mode 100644 index 000000000..c9333719e --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd provides an implementation of Linux's file-based event +// notification. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// EventOperations represents an event with the semantics of Linux's file-based event +// notification (eventfd). +type EventOperations struct { + fsutil.NoopRelease `state:"nosave"` + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // Mutex that protects accesses to the fields of this event. + mu sync.Mutex `state:"nosave"` + + // Queue is used to notify interested parties when the event object + // becomes readable or writable. + waiter.Queue `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool +} + +// New creates a new event object with the supplied initial value and mode. 
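+//
+// A minimal usage sketch (illustrative only; ctx is any context.Context the
+// caller already holds):
+//
+//	ef := New(ctx, 0 /* initVal */, false /* semMode */)
+//	defer ef.DecRef()
+//	ev := ef.FileOperations.(*EventOperations)
+//	ev.Signal(1) // the eventfd is now readable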
+func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { + // name matches fs/eventfd.c:eventfd_file_create. + dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ + val: initVal, + semMode: semMode, + }) +} + +// Read implements fs.FileOperations.Read. +func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements fs.FileOperations.Write. +func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { + e.mu.Lock() + + // We can't complete the read if the value is currently zero. + if e.val == 0 { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if e.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + e.val-- + } else { + val = e.val + e.val = 0 + } + + e.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + e.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return e.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (e *EventOperations) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + e.mu.Lock() + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-e.val { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + e.val += val + e.mu.Unlock() + + // Always trigger a notification. + e.Notify(waiter.EventIn) + + return nil +} + +// Readiness returns the ready events for the event fd. +func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + e.mu.Lock() + if e.val > 0 { + ready |= waiter.EventIn + } + + if e.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + e.mu.Unlock() + + return mask & ready +} diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go new file mode 100644 index 000000000..71326b62f --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestEventfd(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, initVal, false) + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + event.EventRegister(&w, waiter.EventIn) + defer event.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := event.Writev(ctx, usermem.BytesIOSequence(data)) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventfdStat(t *testing.T) { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, 0, false) + + // Create and submit an stat request. + uattr, err := event.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("eventfd stat request failed: %v", err) + } + if uattr.Size != 0 { + t.Fatal("EventFD size should be 0") + } +} diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go new file mode 100644 index 000000000..ef73125fd --- /dev/null +++ b/pkg/sentry/kernel/fd_map.go @@ -0,0 +1,340 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// FDs is an ordering of FD's that can be made stable. +type FDs []kdefs.FD + +func (f FDs) Len() int { + return len(f) +} + +func (f FDs) Swap(i, j int) { + f[i], f[j] = f[j], f[i] +} + +func (f FDs) Less(i, j int) bool { + return f[i] < f[j] +} + +// FDFlags define flags for an individual descriptor. +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// descriptor holds the details about a file descriptor, namely a pointer the +// file itself and the descriptor flags. +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDMap is used to manage File references and flags. 
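+//
+// A minimal usage sketch (illustrative only; k, file and limitSet stand for
+// an existing *Kernel, *fs.File and *limits.LimitSet):
+//
+//	f := k.NewFDMap()
+//	fd, err := f.NewFDFrom(0, file, FDFlags{CloseOnExec: true}, limitSet)
+//	if err == nil {
+//		// use fd ...
+//	}
+//	f.DecRef() // drops the map and, with it, its descriptors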
+type FDMap struct { + refs.AtomicRefCount + k *Kernel + files map[kdefs.FD]descriptor + mu sync.RWMutex `state:"nosave"` + uid uint64 +} + +// ID returns a unique identifier for this FDMap. +func (f *FDMap) ID() uint64 { + return f.uid +} + +// NewFDMap allocates a new FDMap that may be used by tasks in k. +func (k *Kernel) NewFDMap() *FDMap { + return &FDMap{ + k: k, + files: make(map[kdefs.FD]descriptor), + uid: atomic.AddUint64(&k.fdMapUids, 1), + } +} + +// destroy removes all of the file descriptors from the map. +func (f *FDMap) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDMap) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDMap) Size() int { + f.mu.RLock() + defer f.mu.RUnlock() + + return len(f.files) +} + +// String is a stringer for FDMap. +func (f *FDMap) String() string { + f.mu.RLock() + defer f.mu.RUnlock() + + var b bytes.Buffer + for k, v := range f.files { + n, _ := v.file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) + } + return b.String() +} + +// NewFDFrom allocates a new FD guaranteed to be the lowest number available +// greater than or equal to from. This property is important as Unix programs +// tend to count on this allocation order. +func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { + if fd < 0 { + // Don't accept negative FDs. + return 0, syscall.EINVAL + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Finds the lowest fd not in the handles map. + lim := limitSet.Get(limits.NumberOfFiles) + for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { + if _, ok := f.files[i]; !ok { + file.IncRef() + f.files[i] = descriptor{file, flags} + return i, nil + } + } + + return -1, syscall.EMFILE +} + +// NewFDAt sets the file reference for the given FD. If there is an +// active reference for that FD, the ref count for that existing reference +// is decremented. +func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + // In this one case we do not do a defer of the Unlock. The + // reason is that we must have done all the work needed for + // discarding any old open file before we return to the + // caller. In other words, the DecRef(), below, must have + // completed by the time we return to the caller to ensure + // side effects are, in fact, effected. A classic example is + // dup2(fd1, fd2); if fd2 was already open, it must be closed, + // and we don't want to resume the caller until it is; we have + // to block on the DecRef(). Hence we can not just do a 'go + // oldfile.DecRef()', since there would be no guarantee that + // it would be done before we the caller resumed. Since we + // must wait for the DecRef() to finish, and that could take + // time, it's best to first call f.muUnlock beore so we are + // not blocking other uses of this FDMap on the DecRef() call. + f.mu.Lock() + oldDesc, oldExists := f.files[fd] + lim := limitSet.Get(limits.NumberOfFiles).Cur + // if we're closing one then the effective limit is one + // more than the actual limit. 
+ if oldExists && lim != limits.Infinity { + lim++ + } + if lim != limits.Infinity && fd >= kdefs.FD(lim) { + f.mu.Unlock() + return syscall.EMFILE + } + + file.IncRef() + f.files[fd] = descriptor{file, flags} + f.mu.Unlock() + + if oldExists { + oldDesc.file.DecRef() + } + return nil +} + +// SetFlags sets the flags for the given file descriptor, if it is valid. +func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { + f.mu.Lock() + defer f.mu.Unlock() + + desc, ok := f.files[fd] + if !ok { + return + } + + f.files[fd] = descriptor{desc.file, flags} +} + +// GetDescriptor returns a reference to the file and the flags for the FD. It +// bumps its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { + f.mu.RLock() + defer f.mu.RUnlock() + + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + return desc.file, desc.flags + } + return nil, FDFlags{} +} + +// GetFile returns a reference to the File for the FD and bumps +// its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { + f.mu.RLock() + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + f.mu.RUnlock() + return desc.file + } + f.mu.RUnlock() + return nil +} + +// fds returns an ordering of FDs. +func (f *FDMap) fds() FDs { + fds := make(FDs, 0, len(f.files)) + for fd := range f.files { + fds = append(fds, fd) + } + sort.Sort(fds) + return fds +} + +// GetFDs returns a list of valid fds. +func (f *FDMap) GetFDs() FDs { + f.mu.RLock() + defer f.mu.RUnlock() + return f.fds() +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDMap) GetRefs() []*fs.File { + f.mu.RLock() + defer f.mu.RUnlock() + + fds := f.fds() + fs := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + desc := f.files[fd] + desc.file.IncRef() + fs = append(fs, desc.file) + } + return fs +} + +// Fork returns an independent FDMap pointing to the same descriptors. +func (f *FDMap) Fork() *FDMap { + f.mu.RLock() + defer f.mu.RUnlock() + + clone := f.k.NewFDMap() + + // Grab a extra reference for every file. + for fd, desc := range f.files { + desc.file.IncRef() + clone.files[fd] = desc + } + + // That's it! + return clone +} + +// unlock releases all file locks held by this FDMap's uid. Must only be +// called on a non-nil *fs.File. +func (f *FDMap) unlock(file *fs.File) { + id := lock.UniqueID(f.ID()) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) +} + +// inotifyFileClose generates the appropriate inotify events for f being closed. +func inotifyFileClose(f *fs.File) { + var ev uint32 + d := f.Dirent + + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + + if f.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + + d.InotifyEvent(ev, 0) +} + +// Remove removes an FD from the FDMap, and returns (File, true) if a File +// one was found. Callers are expected to decrement the reference count on +// the File. Otherwise returns (nil, false). 
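+//
+// For example (illustrative):
+//
+//	if file, ok := f.Remove(fd); ok {
+//		file.DecRef()
+//	}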
+func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { + f.mu.Lock() + desc := f.files[fd] + delete(f.files, fd) + f.mu.Unlock() + if desc.file != nil { + f.unlock(desc.file) + inotifyFileClose(desc.file) + return desc.file, true + } + return nil, false +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { + var removed []*fs.File + f.mu.Lock() + for fd, desc := range f.files { + if desc.file != nil && cond(desc.file, desc.flags) { + delete(f.files, fd) + removed = append(removed, desc.file) + } + } + f.mu.Unlock() + + for _, file := range removed { + f.unlock(file) + inotifyFileClose(file) + file.DecRef() + } +} diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go new file mode 100644 index 000000000..e1ac900e8 --- /dev/null +++ b/pkg/sentry/kernel/fd_map_test.go @@ -0,0 +1,134 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +const ( + // maxFD is the maximum FD to try to create in the map. + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func newTestFDMap() *FDMap { + return &FDMap{ + files: make(map[kdefs.FD]descriptor), + } +} + +// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, +// until there is no room, then makes sure that NewFDAt works +// and also that if we remove one and add one that works too. +func TestFDMapMany(t *testing.T) { + file := filetest.NewTestFile(t) + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + + f := newTestFDMap() + for i := 0; i < maxFD; i++ { + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error") + } + + if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } +} + +// TestFDMap does a set of simple tests to make sure simple adds, +// removes, GetRefs, and DecRefs work. The ordering is just weird +// enough that a table-driven approach seemed clumsy. 
+func TestFDMap(t *testing.T) { + file := filetest.NewTestFile(t) + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}) + + f := newTestFDMap() + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + largeLimit := limits.Limit{maxFD, maxFD} + limitSet.Set(limits.NumberOfFiles, largeLimit) + + if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if fd != kdefs.FD(1) { + t.Fatalf("Added an FD to a resized map: got %v, want 1", fd) + } + + if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref := f.GetFile(1); ref == nil { + t.Fatalf("f.GetFile(1): got nil, wanted %v", file) + } + + if ref := f.GetFile(2); ref != nil { + t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref) + } + + ref, ok := f.Remove(1) + if !ok { + t.Fatalf("f.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref, ok := f.Remove(1); ok { + ref.DecRef() + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + +} + +func TestDescriptorFlags(t *testing.T) { + file := filetest.NewTestFile(t) + f := newTestFDMap() + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + + if err := f.NewFDAt(2, file, FDFlags{CloseOnExec: true}, limitSet); err != nil { + t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := f.GetDescriptor(2) + if newFile == nil { + t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %d don't match original %d\n", flags, 0) + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go new file mode 100644 index 000000000..9aa6fa951 --- /dev/null +++ b/pkg/sentry/kernel/fs_context.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FSContext contains filesystem context. +// +// This includes umask and working directory. +type FSContext struct { + refs.AtomicRefCount + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // root is the filesystem root. Will be nil iff the FSContext has been + // destroyed. + root *fs.Dirent + + // cwd is the current working directory. Will be nil iff the FSContext + // has been destroyed. 
+ cwd *fs.Dirent + + // umask is the current file mode creation mask. When a thread using this + // context invokes a syscall that creates a file, bits set in umask are + // removed from the permissions that the file is created with. + umask uint +} + +// newFSContext returns a new filesystem context. +func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + return &FSContext{ + root: root, + cwd: cwd, + umask: umask, + } +} + +// destroy is the destructor for an FSContext. +// +// This will call DecRef on both root and cwd Dirents. If either call to +// DecRef returns an error, then it will be propigated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propigated. +// +// Note that there may still be calls to WorkingDirectory() or RootDirectory() +// (that return nil). This is because valid references may still be held via +// proc files or other mechanisms. +func (f *FSContext) destroy() { + f.root.DecRef() + f.root = nil + + f.cwd.DecRef() + f.cwd = nil +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FSContext) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Fork forks this FSContext. +// +// This is not a valid call after destroy. +func (f *FSContext) Fork() *FSContext { + f.mu.Lock() + defer f.mu.Unlock() + f.cwd.IncRef() + f.root.IncRef() + return &FSContext{ + cwd: f.cwd, + root: f.root, + umask: f.umask, + } +} + +// WorkingDirectory returns the current working directory. +// You should call DecRef on the returned Dirent when finished. +// +// This will return nil if called after destroy(). +func (f *FSContext) WorkingDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.cwd != nil { + f.cwd.IncRef() + } + return f.cwd +} + +// SetWorkingDirectory sets the current working directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetWorkingDirectory called with nil dirent") + } + if f.cwd == nil { + panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) + } + f.mu.Lock() + defer f.mu.Unlock() + old := f.cwd + f.cwd = d + d.IncRef() + old.DecRef() +} + +// RootDirectory returns the current filesystem root. +// You should call DecRef on the returned Dirent when finished. +// +// This will return nil if called after destroy(). +func (f *FSContext) RootDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + f.root.IncRef() + return f.root +} + +// SetRootDirectory sets the root directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetRootDirectory called with nil dirent") + } + if f.root == nil { + panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) + } + f.mu.Lock() + defer f.mu.Unlock() + old := f.root + f.root = d + d.IncRef() + old.DecRef() +} + +// Umask returns the current umask. +func (f *FSContext) Umask() uint { + f.mu.Lock() + defer f.mu.Unlock() + return f.umask +} + +// SwapUmask atomically sets the current umask and returns the old umask. 
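+// This mirrors umask(2), which installs a new mask and reports the previous
+// one, e.g. (illustrative):
+//
+//	old := fsCtx.SwapUmask(0022)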
+func (f *FSContext) SwapUmask(mask uint) uint { + f.mu.Lock() + defer f.mu.Unlock() + old := f.umask + f.umask = mask + return old +} diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD new file mode 100644 index 000000000..de9897c58 --- /dev/null +++ b/pkg/sentry/kernel/futex/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "futex", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Waiter", + }, +) + +go_stateify( + name = "futex_state", + srcs = [ + "futex.go", + "waiter_list.go", + ], + out = "futex_state.go", + package = "futex", +) + +go_library( + name = "futex", + srcs = [ + "futex.go", + "futex_state.go", + "waiter_list.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "futex_test", + size = "small", + srcs = ["futex_test.go"], + embed = [":futex"], +) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go new file mode 100644 index 000000000..b3ba57a2c --- /dev/null +++ b/pkg/sentry/kernel/futex/futex.go @@ -0,0 +1,405 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package futex provides an implementation of the futex interface as found in +// the Linux kernel. It allows one to easily transform Wait() calls into waits +// on a channel, which is useful in a Go-based kernel, for example. +package futex + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Checker abstracts memory accesses. This is useful because the "addresses" +// used in this package may not be real addresses (they could be indices of an +// array, for example), or they could be mapped via some special mechanism. +// +// TODO: Replace this with usermem.IO. +type Checker interface { + // Check should validate that given address contains the given value. + // If it does not contain the value, syserror.EAGAIN must be returned. + // Any other error may be returned, which will be propagated. + Check(addr uintptr, val uint32) error + + // Op should atomically perform the operation encoded in op on the data + // pointed to by addr, then apply the comparison encoded in op to the + // original value at addr, returning the result. + // Note that op is an opaque operation whose behaviour is defined + // outside of the futex manager. + Op(addr uintptr, op uint32) (bool, error) +} + +// Waiter is the struct which gets enqueued into buckets for wake up routines +// and requeue routines to scan and notify. Once a Waiter has been enqueued by +// WaitPrepare(), callers may listen on C for wake up events. 
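+//
+// A typical wait, sketched (illustrative only; m is a *Manager, c a Checker,
+// and addr/val the caller's futex word and expected value):
+//
+//	w := NewWaiter()
+//	if err := m.WaitPrepare(w, c, addr, val, ^uint32(0)); err == nil {
+//		<-w.C             // block until woken
+//		m.WaitComplete(w) // always dequeue, even after a wakeup
+//	}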
+type Waiter struct { + // Synchronization: + // + // - A Waiter that is not enqueued in a bucket is exclusively owned (no + // synchronization applies). + // + // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, + // waiterEntry, complete, and addr are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. complete and + // addr are additionally mutated using atomic memory operations, ensuring + // that they can be read using atomic memory operations without holding the + // bucket lock. + // + // - A Waiter is only guaranteed to be no longer queued after calling + // WaitComplete(). + + // waiterEntry links Waiter into bucket.waiters. + waiterEntry + + // complete is 1 if the Waiter was removed from its bucket by a wakeup and + // 0 otherwise. + complete int32 + + // C is sent to when the Waiter is woken. + C chan struct{} + + // addr is the address being waited on. + addr uintptr + + // The bitmask we're waiting on. + // This is used the case of a FUTEX_WAKE_BITSET. + bitmask uint32 +} + +// NewWaiter returns a new unqueued Waiter. +func NewWaiter() *Waiter { + return &Waiter{ + C: make(chan struct{}, 1), + } +} + +// bucket holds a list of waiters for a given address hash. +type bucket struct { + // mu protects waiters and contained Waiter state. See comment in Waiter. + mu sync.Mutex `state:"nosave"` + + waiters waiterList `state:"zerovalue"` +} + +// wakeLocked wakes up to n waiters matching the bitmask at the addr for this +// bucket and returns the number of waiters woken. +// +// Preconditions: b.mu must be locked. +func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if w.addr != addr || w.bitmask&bitmask == 0 { + // Not matching. + w = w.Next() + continue + } + + // Remove from the bucket and wake the waiter. + woke := w + w = w.Next() // Next iteration. + b.waiters.Remove(woke) + woke.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier + // according to the memory model, so nothing may be ordered + // around it. Since we've dequeued w and will never touch it + // again, we can safely store 1 to w.complete here and allow + // the WaitComplete() to short-circuit grabbing the bucket + // lock. If they somehow miss the w.complete, we are still + // holding the lock, so we can know that they won't dequeue w, + // assume it's free and have the below operation afterwards. + atomic.StoreInt32(&woke.complete, 1) + done++ + } + return done +} + +// requeueLocked takes n waiters from the bucket and moves them to naddr on the +// bucket "to". +// +// Preconditions: b and to must be locked. +func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if w.addr != addr { + // Not matching. + w = w.Next() + continue + } + + requeued := w + w = w.Next() // Next iteration. + b.waiters.Remove(requeued) + atomic.StoreUintptr(&requeued.addr, naddr) + to.waiters.PushBack(requeued) + done++ + } + return done +} + +const ( + // bucketCount is the number of buckets per Manager. By having many of + // these we reduce contention when concurrent yet unrelated calls are made. + bucketCount = 1 << bucketCountBits + bucketCountBits = 10 +) + +func checkAddr(addr uintptr) error { + // Ensure the address is aligned. + // It must be a DWORD boundary. 
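+	// (That is, 4-byte aligned: futex words are 32 bits wide.)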
+ if addr&0x3 != 0 { + return syserror.EINVAL + } + + return nil +} + +// bucketIndexForAddr returns the index into Manager.buckets for addr. +func bucketIndexForAddr(addr uintptr) uintptr { + // - The bottom 2 bits of addr must be 0, per checkAddr. + // + // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 + // for a canonical address, and (on all existing platforms) bit 47 must be + // 0 for an application address. + // + // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" + // bits. We choose one of the simplest possible hash functions that at + // least uses all 45 useful bits in the output, given that bucketCountBits + // == 10. This hash function also has the property that it will usually map + // adjacent addresses to adjacent buckets, slightly improving memory + // locality when an application synchronization structure uses multiple + // nearby futexes. + // + // Note that despite the large number of arithmetic operations in the + // function, many components can be computed in parallel, such that the + // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This + // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + + // (addr >> 42)" without any additional grouping, the compiler puts all 4 + // additions in the critical path. + h1 := (addr >> 2) + (addr >> 12) + (addr >> 22) + h2 := (addr >> 32) + (addr >> 42) + return (h1 + h2) % bucketCount +} + +// Manager holds futex state for a single virtual address space. +type Manager struct { + buckets [bucketCount]bucket +} + +// NewManager returns an initialized futex manager. +// N.B. we use virtual address to tag futexes, so it only works for private +// (within a single process) futex. +func NewManager() *Manager { + return &Manager{} +} + +// lockBucket returns a locked bucket for the given addr. +// +// Preconditions: checkAddr(addr) == nil. +func (m *Manager) lockBucket(addr uintptr) *bucket { + b := &m.buckets[bucketIndexForAddr(addr)] + b.mu.Lock() + return b +} + +// lockBuckets returns locked buckets for the given addrs. +// +// Preconditions: checkAddr(addr1) == checkAddr(addr2) == nil. +func (m *Manager) lockBuckets(addr1 uintptr, addr2 uintptr) (*bucket, *bucket) { + i1 := bucketIndexForAddr(addr1) + i2 := bucketIndexForAddr(addr2) + b1 := &m.buckets[i1] + b2 := &m.buckets[i2] + + // Ensure that buckets are locked in a consistent order (lowest index + // first) to avoid circular locking. + switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + + return b1, b2 +} + +// Wake wakes up to n waiters matching the bitmask on the given addr. +// The number of waiters woken is returned. +func (m *Manager) Wake(addr uintptr, bitmask uint32, n int) (int, error) { + if err := checkAddr(addr); err != nil { + return 0, err + } + + b := m.lockBucket(addr) + // This function is very hot; avoid defer. + r := b.wakeLocked(addr, bitmask, n) + b.mu.Unlock() + return r, nil +} + +func (m *Manager) doRequeue(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { + if err := checkAddr(addr); err != nil { + return 0, err + } + if err := checkAddr(naddr); err != nil { + return 0, err + } + + b1, b2 := m.lockBuckets(addr, naddr) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + // Check our value. + // This only applied for RequeueCmp(). 
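+	// Requeue() passes a nil Checker, so the check is skipped in that case.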
+ if c != nil { + if err := c.Check(addr, val); err != nil { + return 0, err + } + } + + // Wake the number required. + done := b1.wakeLocked(addr, ^uint32(0), nwake) + + // Requeue the number required. + b1.requeueLocked(b2, addr, naddr, nreq) + + return done, nil +} + +// Requeue wakes up to nwake waiters on the given addr, and unconditionally +// requeues up to nreq waiters on naddr. +func (m *Manager) Requeue(addr uintptr, naddr uintptr, nwake int, nreq int) (int, error) { + return m.doRequeue(nil, addr, 0, naddr, nwake, nreq) +} + +// RequeueCmp atomically checks that the addr contains val (via the Checker), +// wakes up to nwake waiters on addr and then unconditionally requeues nreq +// waiters on naddr. +func (m *Manager) RequeueCmp(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { + return m.doRequeue(c, addr, val, naddr, nwake, nreq) +} + +// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 +// waiters unconditionally from addr1, and, based on the original value at addr2 +// and a comparison encoded in op, wakes up to nwake2 waiters from addr2. +// It returns the total number of waiters woken. +func (m *Manager) WakeOp(c Checker, addr1 uintptr, addr2 uintptr, nwake1 int, nwake2 int, op uint32) (int, error) { + if err := checkAddr(addr1); err != nil { + return 0, err + } + if err := checkAddr(addr2); err != nil { + return 0, err + } + + b1, b2 := m.lockBuckets(addr1, addr2) + + done := 0 + cond, err := c.Op(addr2, op) + if err == nil { + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(addr1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. + if cond { + done += b2.wakeLocked(addr2, ^uint32(0), nwake2) + } + } + + b1.mu.Unlock() + if b2 != b1 { + b2.mu.Unlock() + } + return done, err +} + +// WaitPrepare atomically checks that addr contains val (via the Checker), then +// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the +// Waiter must be subsequently removed by calling WaitComplete, whether or not +// a wakeup is received on w.C. +func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error { + if err := checkAddr(addr); err != nil { + return err + } + + // Prepare the Waiter before taking the bucket lock. + w.complete = 0 + select { + case <-w.C: + default: + } + w.addr = addr + w.bitmask = bitmask + + b := m.lockBucket(addr) + // This function is very hot; avoid defer. + + // Perform our atomic check. + if err := c.Check(addr, val); err != nil { + b.mu.Unlock() + return err + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + + b.mu.Unlock() + return nil +} + +// WaitComplete must be called when a Waiter previously added by WaitPrepare is +// no longer eligible to be woken. +func (m *Manager) WaitComplete(w *Waiter) { + // Can we short-circuit acquiring the lock? + // This is the happy path where a notification + // was received and we don't need to dequeue this + // waiter from any list (or take any locks). + if atomic.LoadInt32(&w.complete) != 0 { + return + } + + // Take the bucket lock. 
Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take the + // bucket lock, we must ensure that the bucket hasn't changed: if it + // happens to have changed, we release the old bucket lock and try again + // with the new bucket; if it hasn't changed, we know it won't change now + // because we hold the lock. + var b *bucket + for { + addr := atomic.LoadUintptr(&w.addr) + b = m.lockBucket(addr) + // We still have to use an atomic load here, because if w was racily + // requeued then w.addr is not protected by b.mu. + if addr == atomic.LoadUintptr(&w.addr) { + break + } + b.mu.Unlock() + } + + // Remove waiter from the bucket. w.complete can only be stored with b.mu + // locked, so this load doesn't need to use sync/atomic. + if w.complete == 0 { + b.waiters.Remove(w) + } + b.mu.Unlock() +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go new file mode 100644 index 000000000..7b81358ec --- /dev/null +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -0,0 +1,500 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package futex + +import ( + "math" + "runtime" + "sync" + "sync/atomic" + "syscall" + "testing" + "unsafe" +) + +const ( + testMutexSize = 4 + testMutexLocked uint32 = 1 + testMutexUnlocked uint32 = 0 +) + +// testData implements the Checker interface, and allows us to +// treat the address passed for futex operations as an index in +// a byte slice for testing simplicity. +type testData []byte + +func newTestData(size uint) testData { + return make([]byte, size) +} + +func (t testData) Check(addr uintptr, val uint32) error { + if val != atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))) { + return syscall.EAGAIN + } + return nil +} + +func (t testData) Op(addr uintptr, val uint32) (bool, error) { + return val == 0, nil +} + +// testMutex ties together a testData slice, an address, and a +// futex manager in order to implement the sync.Locker interface. +// Beyond being used as a Locker, this is a simple mechanism for +// changing the underlying values for simpler tests. +type testMutex struct { + a uintptr + d testData + m *Manager +} + +func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex { + return &testMutex{a: addr, d: d, m: m} +} + +// Lock acquires the testMutex. +// This may wait for it to be available via the futex manager. +func (t *testMutex) Lock() { + for { + // Attempt to grab the lock. + if atomic.CompareAndSwapUint32( + ((*uint32)(unsafe.Pointer(&t.d[t.a]))), + testMutexUnlocked, + testMutexLocked) { + // Lock held. + return + } + + // Wait for it to be "not locked". + w := NewWaiter() + err := t.m.WaitPrepare(w, t.d, t.a, testMutexLocked, ^uint32(0)) + if err == syscall.EAGAIN { + continue + } + if err != nil { + // Should never happen. + panic("WaitPrepare returned unexpected error: " + err.Error()) + } + <-w.C + t.m.WaitComplete(w) + } +} + +// Unlock releases the testMutex. 
+// This will notify any waiters via the futex manager. +func (t *testMutex) Unlock() { + // Unlock. + atomic.StoreUint32(((*uint32)(unsafe.Pointer(&t.d[t.a]))), testMutexUnlocked) + + // Notify all waiters. + t.m.Wake(t.a, ^uint32(0), math.MaxInt32) +} + +func TestFutexWake(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w := NewWaiter() + m.WaitPrepare(w, d, 0, testMutexUnlocked, ^uint32(0)) + + // Wake the single thread. + if _, err := m.Wake(0, ^uint32(0), 1); err != nil { + t.Error("wake error:", err) + } + + <-w.C + m.WaitComplete(w) +} + +func TestFutexWakeBitmask(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w := NewWaiter() + m.WaitPrepare(w, d, 0, testMutexUnlocked, 0x0000ffff) + + // Wake the single thread, not using the bitmask. + if _, err := m.Wake(0, 0xffff0000, 1); err != nil { + t.Error("wake non-matching bitmask error:", err) + } + + select { + case <-w.C: + t.Error("w is alive?") + default: + } + + // Now use a matching bitmask. + if _, err := m.Wake(0, 0x00000001, 1); err != nil { + t.Error("wake matching bitmask error:", err) + } + + <-w.C + m.WaitComplete(w) +} + +func TestFutexWakeTwo(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w1 := NewWaiter() + w2 := NewWaiter() + w3 := NewWaiter() + m.WaitPrepare(w1, d, 0, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w2, d, 0, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w3, d, 0, testMutexUnlocked, ^uint32(0)) + + // Wake exactly two threads. + if _, err := m.Wake(0, ^uint32(0), 2); err != nil { + t.Error("wake error:", err) + } + + // Ensure exactly two are alive. + // We don't get guarantees about exactly which two, + // (although we expect them to be w1 and w2). + awake := 0 + for { + select { + case <-w1.C: + awake++ + case <-w2.C: + awake++ + case <-w3.C: + awake++ + default: + if awake != 2 { + t.Error("awake != 2?") + } + + // Success. + return + } + } +} + +func TestFutexWakeUnrelated(t *testing.T) { + m := NewManager() + d := newTestData(2 * testMutexSize) + + // Wait for it to be locked. + w1 := NewWaiter() + w2 := NewWaiter() + m.WaitPrepare(w1, d, 0*testMutexSize, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w2, d, 1*testMutexSize, testMutexUnlocked, ^uint32(0)) + + // Wake only the second one. + if _, err := m.Wake(1*testMutexSize, ^uint32(0), 2); err != nil { + t.Error("wake error:", err) + } + + // Ensure only r2 is alive. + select { + case <-w1.C: + t.Error("w1 is alive?") + default: + } + <-w2.C +} + +// This function was shamelessly stolen from mutex_test.go. 
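As a quick orientation before the stress test, here is a minimal sketch of the WaitPrepare/Wake/WaitComplete round trip that the tests above exercise. It is illustrative only: the futexexample package, the wordChecker type, and the waitThenWake function are hypothetical, and the import path is assumed from this package's location; only Manager, Waiter, NewManager, NewWaiter, WaitPrepare, Wake, and WaitComplete come from the code above. The HammerMutex stress helper that the preceding comment introduces follows the sketch.

package futexexample // hypothetical package, for illustration only

import (
	"sync/atomic"
	"syscall"
	"unsafe"

	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
)

// wordChecker is a hypothetical Checker that treats addr as an index into a
// byte slice, mirroring the testData helper above.
type wordChecker []byte

// Check returns EAGAIN if the word at addr no longer contains val.
func (c wordChecker) Check(addr uintptr, val uint32) error {
	if atomic.LoadUint32((*uint32)(unsafe.Pointer(&c[addr]))) != val {
		return syscall.EAGAIN
	}
	return nil
}

// Op trivially reports whether val is zero; a real Checker would decode a
// FUTEX_WAKE_OP operation here.
func (c wordChecker) Op(addr uintptr, val uint32) (bool, error) {
	return val == 0, nil
}

// waitThenWake queues one waiter on address 0 and then wakes it.
func waitThenWake() error {
	m := futex.NewManager()
	c := make(wordChecker, 4)

	w := futex.NewWaiter()
	// The wait is only enqueued if the word still contains 0.
	if err := m.WaitPrepare(w, c, 0, 0, ^uint32(0)); err != nil {
		return err // e.g. EAGAIN if the value changed; callers typically retry.
	}

	// Wake at most one waiter on address 0 with a full bitmask.
	if _, err := m.Wake(0, ^uint32(0), 1); err != nil {
		return err
	}
	<-w.C             // the wakeup is delivered on the waiter's channel.
	m.WaitComplete(w) // always pair WaitPrepare with WaitComplete.
	return nil
}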
+func HammerMutex(l sync.Locker, loops int, cdone chan bool) { + for i := 0; i < loops; i++ { + l.Lock() + runtime.Gosched() + l.Unlock() + } + cdone <- true +} + +func TestFutexStress(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + tm := newTestMutex(0*testMutexSize, d, m) + c := make(chan bool) + + for i := 0; i < 10; i++ { + go HammerMutex(tm, 1000, c) + } + + for i := 0; i < 10; i++ { + <-c + } +} + +func TestWakeOpEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 0 { + t.Fatalf("Invalid number of wakes: want 0, got %d", n) + } +} + +func TestWakeOpFirstNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 0. + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSecondNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 4. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 4. + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 4. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 4. + n, err := m.WakeOp(d, 0, 4, 10, 10, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 0 { + t.Fatalf("Invalid number of wakes: want 0, got %d", n) + } +} + +func TestWakeOpAllNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Add two waiters on address 4. + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Wake up all waiters on both addresses. 
+ n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 4 { + t.Fatalf("Invalid number of wakes: want 4, got %d", n) + } +} + +func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Add two waiters on address 4. + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Wake up all waiters on both addresses. + n, err := m.WakeOp(d, 0, 4, 10, 10, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSameAddress(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Use the same address, with one at most one waiter from each. + n, err := m.WakeOp(d, 0, 0, 1, 1, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSameAddressFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Use the same address, with one at most one waiter from each. 
+ n, err := m.WakeOp(d, 0, 0, 1, 1, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 1 { + t.Fatalf("Invalid number of wakes: want 1, got %d", n) + } +} diff --git a/pkg/sentry/kernel/g3doc/run_states.dot b/pkg/sentry/kernel/g3doc/run_states.dot new file mode 100644 index 000000000..7861fe1f5 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.dot @@ -0,0 +1,99 @@ +digraph { + subgraph { + App; + } + subgraph { + Interrupt; + InterruptAfterSignalDeliveryStop; + } + subgraph { + Syscall; + SyscallAfterPtraceEventSeccomp; + SyscallEnter; + SyscallAfterSyscallEnterStop; + SyscallAfterSysemuStop; + SyscallInvoke; + SyscallAfterPtraceEventClone; + SyscallAfterExecStop; + SyscallAfterVforkStop; + SyscallReinvoke; + SyscallExit; + } + subgraph { + Vsyscall; + VsyscallAfterPtraceEventSeccomp; + VsyscallInvoke; + } + subgraph { + Exit; + ExitMain; // leave thread group, release resources, reparent children, kill PID namespace and wait if TGID 1 + ExitNotify; // signal parent/tracer, become waitable + ExitDone; // represented by t.runState == nil + } + + // Task exit + Exit -> ExitMain; + ExitMain -> ExitNotify; + ExitNotify -> ExitDone; + + // Execution of untrusted application code + App -> App; + + // Interrupts (usually signal delivery) + App -> Interrupt; + Interrupt -> Interrupt; // if other interrupt conditions may still apply + Interrupt -> Exit; // if killed + + // Syscalls + App -> Syscall; + Syscall -> SyscallEnter; + SyscallEnter -> SyscallInvoke; + SyscallInvoke -> SyscallExit; + SyscallExit -> App; + + // exit, exit_group + SyscallInvoke -> Exit; + + // execve + SyscallInvoke -> SyscallAfterExecStop; + SyscallAfterExecStop -> SyscallExit; + SyscallAfterExecStop -> App; // fatal signal pending + + // vfork + SyscallInvoke -> SyscallAfterVforkStop; + SyscallAfterVforkStop -> SyscallExit; + + // Vsyscalls + App -> Vsyscall; + Vsyscall -> VsyscallInvoke; + Vsyscall -> App; // fault while reading return address from stack + VsyscallInvoke -> App; + + // ptrace-specific branches + Interrupt -> InterruptAfterSignalDeliveryStop; + InterruptAfterSignalDeliveryStop -> Interrupt; + SyscallEnter -> SyscallAfterSyscallEnterStop; + SyscallAfterSyscallEnterStop -> SyscallInvoke; + SyscallAfterSyscallEnterStop -> SyscallExit; // skipped by tracer + SyscallAfterSyscallEnterStop -> App; // fatal signal pending + SyscallEnter -> SyscallAfterSysemuStop; + SyscallAfterSysemuStop -> SyscallExit; + SyscallAfterSysemuStop -> App; // fatal signal pending + SyscallInvoke -> SyscallAfterPtraceEventClone; + SyscallAfterPtraceEventClone -> SyscallExit; + SyscallAfterPtraceEventClone -> SyscallAfterVforkStop; + + // seccomp + Syscall -> App; // SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_KILL, SECCOMP_RET_TRACE without tracer + Syscall -> SyscallAfterPtraceEventSeccomp; // SECCOMP_RET_TRACE + SyscallAfterPtraceEventSeccomp -> SyscallEnter; + SyscallAfterPtraceEventSeccomp -> SyscallExit; // skipped by tracer + SyscallAfterPtraceEventSeccomp -> App; // fatal signal pending + Vsyscall -> VsyscallAfterPtraceEventSeccomp; + VsyscallAfterPtraceEventSeccomp -> VsyscallInvoke; + VsyscallAfterPtraceEventSeccomp -> App; + + // Autosave + SyscallInvoke -> SyscallReinvoke; + SyscallReinvoke -> SyscallInvoke; +} diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go new file mode 100644 index 000000000..78737f58f --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore" +) + +// IPCNamespace represents an IPC namespace. +type IPCNamespace struct { + semaphores *semaphore.Registry +} + +// NewIPCNamespace creates a new IPC namespace. +func NewIPCNamespace() *IPCNamespace { + return &IPCNamespace{ + semaphores: semaphore.NewRegistry(), + } +} + +// SemaphoreRegistry returns the semanphore set registry for this namespace. +func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { + return i.semaphores +} + +// IPCNamespace returns the task's IPC namespace. +func (t *Task) IPCNamespace() *IPCNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.ipcns +} diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD new file mode 100644 index 000000000..b6c00042a --- /dev/null +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -0,0 +1,10 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "kdefs", + srcs = ["kdefs.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + visibility = ["//:sandbox"], +) diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go new file mode 100644 index 000000000..bbb476544 --- /dev/null +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kdefs defines common kernel definitions. +// +package kdefs + +// FD is a File Descriptor. +type FD int32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..0932965e0 --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,957 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. 
+// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. +package kernel + +import ( + "fmt" + "io" + "path/filepath" + "sync" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/state" +) + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the + // created Kernel. It is embedded so that Kernel can directly serve as + // Platform in mm logic and also serve as platform.MemoryProvider in + // filemem S/R logic. + platform.Platform `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + networkStack inet.Stack `state:"nosave"` + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + + // mounts holds the state of the virtual filesystem. mounts is initially + // nil, and must be set by calling Kernel.SetRootMountNamespace before + // Kernel.CreateProcess can succeed. + mounts *fs.MountNamespace + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. 
globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // fdMapUids is an ever-increasing counter for generating FDMap uids. + // + // fdMapUids is mutable, and is accessed using atomic memory operations. + fdMapUids uint64 + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. + uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accesed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // exitErr is the error causing the sandbox to exit, if any. It is + // protected by extMu. + exitErr error +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. + NetworkStack inet.Stack + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namepsace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namepsace. + RootIPCNamespace *IPCNamespace +} + +// Init initialize the Kernel with no tasks. 
+// +// Callers must manually set Kernel.Platform before caling Init. +func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet() + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.networkStack = args.NetworkStack + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.netlinkPorts = port.New() + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w io.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Flush write operations on open files so data reaches backing storage. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + k.mounts.FlushMountSourceRefs() + + // Ensure that all pending asynchronous work is complete: + // - inode and mount release + // - asynchronuous IO + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Save(w, k, &stats); err != nil { + return err + } + log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory state. 
+ // + // FIXME: In the future, this should not be dispatched via + // an abstract memory type. This should be dispatched to a single + // memory implementation that belongs to the kernel. (There is + // currently a single implementation anyways, it just needs to be + // "unabstracted" and reparented appropriately.) + memoryStart := time.Now() + if err := k.Platform.Memory().SaveTo(w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + if fdmap := t.FDMap(); fdmap != nil { + for _, desc := range fdmap.files { + if flags := desc.file.Flags(); !flags.Write { + continue + } + if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + continue + } + // Here we need all metadata synced. + syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := desc.file.Dirent.FullName(nil /* root */) + return fmt.Errorf("%q was not sufficiently synced: %v", name, err) + } + } + } + } + return nil +} + +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + if fdmap := t.FDMap(); fdmap != nil { + for _, desc := range fdmap.files { + if desc.file != nil { + if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + } + } + } + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error { + loadStart := time.Now() + if p == nil { + return fmt.Errorf("Platform is nil") + } + + k.Platform = p + k.networkStack = net + + initAppCores := k.applicationCores + + // Load the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Load(r, k, &stats); err != nil { + return err + } + log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // Load the memory state. + // + // See the note in SaveTo. + memoryStart := time.Now() + if err := k.Platform.Memory().LoadFrom(r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + fs.AsyncBarrier() + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. 
However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// Destroy releases resources owned by k. +// +// Preconditions: There must be no task goroutines running in k. +func (k *Kernel) Destroy() { + if k.mounts != nil { + k.mounts.DecRef() + k.mounts = nil + } +} + +// UniqueID returns a unique identifier. +func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load. + // + // If this is provided as "", then the file will be guessed via Argv[0]. + Filename string + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. + Credentials *auth.Credentials + + // FDMap is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDMap. + FDMap *FDMap + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. + MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + // "The new task ... is in the root PID namespace." - + // Kernel.CreateProcess + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.k.mounts == nil { + return nil + } + return ctx.k.mounts.Root() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. 
+// +// If k.Start() has already been called, the created task will begin running +// immediately. Otherwise, it will be started when k.Start() is called. +// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + if k.mounts == nil { + return nil, fmt.Errorf("no kernel MountNamespace") + } + + tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + ctx := args.NewContext(k) + + // Grab the root directory. + root := fs.RootFromContext(ctx) + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) + if err != nil { + return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + + if args.Filename == "" { + // Was anything provided? + if len(args.Argv) == 0 { + return nil, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. + tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + if err != nil { + return nil, err + } + tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask)) + // NewTask unconditionally takes ownership of tr, so we never have to call + // tr.release. + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: tr, + Credentials: args.Credentials, + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + } + t, err := k.tasks.NewTask(config) + if err != nil { + return nil, err + } + + // Success. + if k.started { + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) + } else if k.globalInit == nil { + k.globalInit = tg + } + return tg, nil +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k}) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). 
+ if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDMap.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.tm.pause() + } + // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. + if fdm := t.tr.FDMap; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.tm.resume() + } + if fdm := t.tr.FDMap; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + } + } +} + +// WaitExited blocks until all tasks in k have exited. +func (k *Kernel) WaitExited() { + k.tasks.liveGoroutines.Wait() +} + +// Kill requests that all tasks in k immediately exit as if group exiting with +// status es. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(es ExitStatus) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.Kill(es) +} + +// Pause requests that all tasks in k temporarily stop executing, and blocks +// until all tasks in k have stopped. Multiple calls to Pause nest and require +// an equal number of calls to Unpause to resume execution. +func (k *Kernel) Pause() { + k.extMu.Lock() + k.tasks.BeginExternalStop() + k.extMu.Unlock() + k.tasks.runningGoroutines.Wait() +} + +// Unpause ends the effect of a previous call to Pause. If Unpause is called +// without a matching preceding call to Pause, Unpause may panic. +func (k *Kernel) Unpause() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.EndExternalStop() +} + +// SendExternalSignal injects a signal into the kernel. +// +// context is used only for debugging to describe how the signal was received. +// +// Returns false if signal could not be sent because the Kernel is not fully +// initialized yet. +func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.sendExternalSignal(info, context) +} + +// FeatureSet returns the FeatureSet. +func (k *Kernel) FeatureSet() *cpuid.FeatureSet { + return k.featureSet +} + +// Timekeeper returns the Timekeeper. +func (k *Kernel) Timekeeper() *Timekeeper { + return k.timekeeper +} + +// TaskSet returns the TaskSet. +func (k *Kernel) TaskSet() *TaskSet { + return k.tasks +} + +// RootUserNamespace returns the root UserNamespace. +func (k *Kernel) RootUserNamespace() *auth.UserNamespace { + return k.rootUserNamespace +} + +// RootUTSNamespace returns the root UTSNamespace. +func (k *Kernel) RootUTSNamespace() *UTSNamespace { + return k.rootUTSNamespace +} + +// RootIPCNamespace returns the root IPCNamespace. 
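Taken together, Init, SetRootMountNamespace, CreateProcess, Start, WaitExited, and ExitError define the Kernel's external lifecycle. The sketch below wires them together in the order their preconditions require; it is illustrative only — the boot package and bootAndRun function are hypothetical, and building InitKernelArgs, the MountNamespace, and CreateProcessArgs is assumed to happen elsewhere. The RootIPCNamespace accessor documented immediately above follows the sketch.

package boot // hypothetical package, for illustration only

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
)

// bootAndRun brings up k and runs a single initial process to completion.
func bootAndRun(k *kernel.Kernel, args kernel.InitKernelArgs, mounts *fs.MountNamespace, procArgs kernel.CreateProcessArgs) error {
	// Init requires FeatureSet, Timekeeper, RootUserNamespace and a nonzero
	// ApplicationCores; k.Platform must have been set beforehand.
	if err := k.Init(args); err != nil {
		return err
	}

	// CreateProcess fails until a root mount namespace is installed.
	k.SetRootMountNamespace(mounts)

	// The first successful CreateProcess becomes globalInit (ID 1 in the
	// root PID namespace).
	if _, err := k.CreateProcess(procArgs); err != nil {
		return err
	}

	// Start launches task goroutines for everything created so far; tasks
	// created afterwards start immediately.
	if err := k.Start(); err != nil {
		return err
	}

	// Block until every task has exited, then report any sandbox error.
	k.WaitExited()
	return k.ExitError()
}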
+func (k *Kernel) RootIPCNamespace() *IPCNamespace { + return k.rootIPCNamespace +} + +// RootMountNamespace returns the MountNamespace. +func (k *Kernel) RootMountNamespace() *fs.MountNamespace { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.mounts +} + +// SetRootMountNamespace sets the MountNamespace. +func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.mounts = mounts +} + +// NetworkStack returns the network stack. NetworkStack may return nil if no +// network stack is available. +func (k *Kernel) NetworkStack() inet.Stack { + return k.networkStack +} + +// GlobalInit returns the thread group with ID 1 in the root PID namespace, or +// nil if no such thread group exists. GlobalInit may return a thread group +// containing no tasks if the thread group has already exited. +func (k *Kernel) GlobalInit() *ThreadGroup { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.globalInit +} + +// ApplicationCores returns the number of CPUs visible to sandboxed +// applications. +func (k *Kernel) ApplicationCores() uint { + return k.applicationCores +} + +// RealtimeClock returns the application CLOCK_REALTIME clock. +func (k *Kernel) RealtimeClock() ktime.Clock { + return k.realtimeClock +} + +// MonotonicClock returns the application CLOCK_MONOTONIC clock. +func (k *Kernel) MonotonicClock() ktime.Clock { + return k.monotonicClock +} + +// CPUClockNow returns the current value of k.cpuClock. +func (k *Kernel) CPUClockNow() uint64 { + return atomic.LoadUint64(&k.cpuClock) +} + +// Syslog returns the syslog. +func (k *Kernel) Syslog() *syslog { + return &k.syslog +} + +// GenerateInotifyCookie generates a unique inotify event cookie. +// +// Returned values may overlap with previously returned values if the value +// space is exhausted. 0 is not a valid cookie value, all other values +// representable in a uint32 are allowed. +func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// ExitError returns the sandbox error that caused the kernel to exit. +func (k *Kernel) ExitError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.exitErr +} + +// SetExitError sets the sandbox error that caused the kernel to exit, if one is +// not already set. +func (k *Kernel) SetExitError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.exitErr == nil { + k.exitErr = err + } +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) 
+ return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. + return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + return ctx.k.mounts.Root() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + default: + return nil + } +} + +type kernelCPUClockListener struct { + k *Kernel +} + +// Notify implements ktime.TimerListener.Notify. +func (l kernelCPUClockListener) Notify(exp uint64) { + atomic.AddUint64(&l.k.cpuClock, exp) +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l kernelCPUClockListener) Destroy() { +} diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD new file mode 100644 index 000000000..c7779e1d5 --- /dev/null +++ b/pkg/sentry/kernel/memevent/BUILD @@ -0,0 +1,31 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "memevent", + srcs = ["memory_events.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent", + visibility = ["//:sandbox"], + deps = [ + ":memory_events_go_proto", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/sentry/kernel", + "//pkg/sentry/usage", + ], +) + +proto_library( + name = "memory_events_proto", + srcs = ["memory_events.proto"], + visibility = ["//visibility:public"], +) + +go_proto_library( + name = "memory_events_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", + proto = ":memory_events_proto", + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go new file mode 100644 index 000000000..ecc9151de --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memevent implements the memory usage events controller, which +// periodically emits events via the eventchannel. +package memevent + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// MemoryEvents describes the configuration for the global memory event emitter. 
+type MemoryEvents struct { + k *kernel.Kernel + + // The period is how often to emit an event. The memory events goroutine + // will ensure a minimum of one event is emitted per this period, regardless + // how of much memory usage has changed. + period time.Duration + + // Writing to this channel indicates the memory goroutine should stop. + stop chan struct{} + + // done is used to signal when the memory event goroutine has exited. + done sync.WaitGroup +} + +// New creates a new MemoryEvents. +func New(k *kernel.Kernel, period time.Duration) *MemoryEvents { + return &MemoryEvents{ + k: k, + period: period, + stop: make(chan struct{}), + } +} + +// Stop stops the memory usage events emitter goroutine. Stop must not be called +// concurrently with Start and may only be called once. +func (m *MemoryEvents) Stop() { + close(m.stop) + m.done.Wait() +} + +// Start starts the memory usage events emitter goroutine. Start must not be +// called concurrently with Stop and may only be called once. +func (m *MemoryEvents) Start() { + if m.period == 0 { + return + } + go m.run() // S/R-SAFE: doesn't interact with saved state. +} + +func (m *MemoryEvents) run() { + m.done.Add(1) + + ticker := time.NewTicker(m.period) + defer ticker.Stop() + + for { + select { + case <-m.stop: + m.done.Done() + return + case <-ticker.C: + m.emit() + } + } +} + +func (m *MemoryEvents) emit() { + totalPlatform, err := m.k.Platform.Memory().TotalUsage() + if err != nil { + log.Warningf("Failed to fetch memory usage for memory events: %v", err) + return + } + snapshot, _ := usage.MemoryAccounting.Copy() + total := totalPlatform + snapshot.Mapped + + eventchannel.Emit(&pb.MemoryUsageEvent{Total: total}) +} diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto new file mode 100644 index 000000000..e6e0bd628 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// MemoryUsageEvent describes the memory usage of the sandbox at a single +// instant in time. These messages are emitted periodically on the eventchannel. +message MemoryUsageEvent { + // The total memory usage of the sandboxed application in bytes, calculated + // using the 'fast' method. + uint64 total = 1; +} diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go new file mode 100644 index 000000000..d8701f47a --- /dev/null +++ b/pkg/sentry/kernel/pending_signals.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + // stdSignalCap is the maximum number of instances of a given standard + // signal that may be pending. ("[If] multiple instances of a standard + // signal are delivered while that signal is currently blocked, then only + // one instance is queued.") - signal(7) + stdSignalCap = 1 + + // rtSignalCap is the maximum number of instances of a given realtime + // signal that may be pending. + // + // TODO: In Linux, the minimum signal queue size is + // RLIMIT_SIGPENDING, which is by default max_threads/2. + rtSignalCap = 32 +) + +// pendingSignals holds a collection of pending signals. The zero value of +// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; +// users must provide synchronization. +type pendingSignals struct { + // signals contains all pending signals. + // + // Note that signals is zero-indexed, but signal 1 is the first valid + // signal, so signals[0] contains signals with signo 1 etc. This offset is + // usually handled by using Signal.index(). + signals [linux.SignalMaximum]pendingSignalQueue + + // Bit i of pendingSet is set iff there is at least one signal with signo + // i+1 pending. + pendingSet linux.SignalSet +} + +// pendingSignalQueue holds a pendingSignalList for a single signal number. +type pendingSignalQueue struct { + pendingSignalList + length int +} + +type pendingSignal struct { + // pendingSignalEntry links into a pendingSignalList. + pendingSignalEntry + *arch.SignalInfo +} + +// enqueue enqueues the given signal. enqueue returns true on success and false +// on failure (if the given signal's queue is full). +// +// Preconditions: info represents a valid signal. +func (p *pendingSignals) enqueue(info *arch.SignalInfo) bool { + sig := linux.Signal(info.Signo) + q := &p.signals[sig.Index()] + if sig.IsStandard() { + if q.length >= stdSignalCap { + return false + } + } else if q.length >= rtSignalCap { + return false + } + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info}) + q.length++ + p.pendingSet |= linux.SignalSetOf(sig) + return true +} + +// dequeue dequeues and returns any pending signal not masked by mask. If no +// unmasked signals are pending, dequeue returns nil. +func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { + // "Real-time signals are delivered in a guaranteed order. Multiple + // real-time signals of the same type are delivered in the order they were + // sent. If different real-time signals are sent to a process, they are + // delivered starting with the lowest-numbered signal. (I.e., low-numbered + // signals have highest priority.) By contrast, if multiple standard + // signals are pending for a process, the order in which they are delivered + // is unspecified. If both standard and real-time signals are pending for a + // process, POSIX leaves it unspecified which is delivered first. 
Linux, + // like many other implementations, gives priority to standard signals in + // this case." - signal(7) + lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) + if lowestPendingUnblockedBit >= linux.SignalMaximum { + return nil + } + return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) +} + +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { + q := &p.signals[sig.Index()] + ps := q.pendingSignalList.Front() + if ps == nil { + return nil + } + q.pendingSignalList.Remove(ps) + q.length-- + if q.length == 0 { + p.pendingSet &^= linux.SignalSetOf(sig) + } + return ps.SignalInfo +} + +// discardSpecific causes all pending signals with number sig to be discarded. +func (p *pendingSignals) discardSpecific(sig linux.Signal) { + q := &p.signals[sig.Index()] + q.pendingSignalList.Reset() + q.length = 0 + p.pendingSet &^= linux.SignalSetOf(sig) +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD new file mode 100644 index 000000000..ca9825f9d --- /dev/null +++ b/pkg/sentry/kernel/pipe/BUILD @@ -0,0 +1,68 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "pipe_state", + srcs = [ + "buffers.go", + "node.go", + "pipe.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + out = "pipe_state.go", + package = "pipe", +) + +go_library( + name = "pipe", + srcs = [ + "buffers.go", + "device.go", + "node.go", + "pipe.go", + "pipe_state.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/ilist", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +go_test( + name = "pipe_test", + size = "small", + srcs = [ + "node_test.go", + "pipe_test.go", + ], + embed = [":pipe"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go new file mode 100644 index 000000000..f300537c5 --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -0,0 +1,50 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/ilist" +) + +// Buffer encapsulates a queueable byte buffer that can +// easily be truncated. It is designed only for use with pipes. +type Buffer struct { + ilist.Entry + data []byte +} + +// newBuffer initializes a Buffer. 
+func newBuffer(buf []byte) *Buffer { + return &Buffer{data: buf} +} + +// bytes returns the bytes contained in the buffer. +func (b *Buffer) bytes() []byte { + return b.data +} + +// size returns the number of bytes contained in the buffer. +func (b *Buffer) size() int { + return len(b.data) +} + +// truncate removes the first n bytes from the buffer. +func (b *Buffer) truncate(n int) int { + if n > len(b.data) { + panic("Trying to truncate past end of array.") + } + b.data = b.data[n:] + return len(b.data) +} diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go new file mode 100644 index 000000000..8d383577a --- /dev/null +++ b/pkg/sentry/kernel/pipe/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// pipeDevice is used for all pipe files. +var pipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go new file mode 100644 index 000000000..5b47427ef --- /dev/null +++ b/pkg/sentry/kernel/pipe/node.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. +type inodeOperations struct { + fs.InodeOperations + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // p is the underlying Pipe object representing this fifo. + p *Pipe + + // Channels for synchronizing the creation of new readers and writers of + // this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on save, + // and either automatically restart (via ERESTARTSYS) or return EINTR on + // resume. On restarts via ERESTARTSYS, the appropriate channel will be + // recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewInodeOperations creates a new pipe fs.InodeOperations. +func NewInodeOperations(base fs.InodeOperations, p *Pipe) fs.InodeOperations { + return &inodeOperations{ + InodeOperations: base, + p: p, + } +} + +// GetFile implements fs.InodeOperations.GetFile. 
Named pipes have special blocking
+// semantics during open:
+//
+// "Normally, opening the FIFO blocks until the other end is opened also. A
+// process can open a FIFO in nonblocking mode. In this case, opening for
+// read-only will succeed even if no-one has opened on the write side yet,
+// opening for write-only will fail with ENXIO (no such device or address)
+// unless the other end has already been opened. Under Linux, opening a FIFO
+// for read and write will succeed both in blocking and nonblocking mode. POSIX
+// leaves this behavior undefined. This can be used to open a FIFO for writing
+// while there are no readers available." - fifo(7)
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	switch {
+	case flags.Read && !flags.Write: // O_RDONLY.
+		r := i.p.ROpen(ctx)
+		i.newHandleLocked(&i.rWakeup)
+
+		if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
+			if !i.waitFor(&i.wWakeup, ctx) {
+				r.DecRef()
+				return nil, syserror.ErrInterrupted
+			}
+		}
+
+		// By now, either we're doing a nonblocking open or we have a writer. On
+		// a nonblocking read-only open, the open succeeds even if no-one has
+		// opened the write side yet.
+		return r, nil
+
+	case flags.Write && !flags.Read: // O_WRONLY.
+		w := i.p.WOpen(ctx)
+		i.newHandleLocked(&i.wWakeup)
+
+		if i.p.isNamed && !i.p.HasReaders() {
+			// On a nonblocking, write-only open, the open fails with ENXIO if the
+			// read side isn't open yet.
+			if flags.NonBlocking {
+				w.DecRef()
+				return nil, syserror.ENXIO
+			}
+
+			if !i.waitFor(&i.rWakeup, ctx) {
+				w.DecRef()
+				return nil, syserror.ErrInterrupted
+			}
+		}
+		return w, nil
+
+	case flags.Read && flags.Write: // O_RDWR.
+		// Pipes opened for read-write always succeed without blocking.
+		rw := i.p.RWOpen(ctx)
+		i.newHandleLocked(&i.rWakeup)
+		i.newHandleLocked(&i.wWakeup)
+		return rw, nil
+
+	default:
+		return nil, syserror.EINVAL
+	}
+}
+
+// waitFor blocks until a new reader or writer of the underlying pipe is
+// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this
+// function will block for either readers or writers, depending on where
+// 'wakeupChan' points.
+//
+// i.mu must be held by the caller. waitFor returns with i.mu held, but it will
+// drop i.mu before blocking for any readers or writers.
+func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
+	// Ideally this function would simply use a condition variable. However, the
+	// wait needs to be interruptible via 'sleeper', so we must synchronize via a
+	// channel. The synchronization below relies on the fact that closing a
+	// channel unblocks all receives on the channel.
+
+	// Does an appropriate wakeup channel already exist? If not, create a new
+	// one. This is all done under i.mu to avoid races.
+	if *wakeupChan == nil {
+		*wakeupChan = make(chan struct{})
+	}
+
+	// Grab a local reference to the wakeup channel since it may disappear as
+	// soon as we drop i.mu.
+	wakeup := *wakeupChan
+
+	// Drop the lock and prepare to sleep.
+	i.mu.Unlock()
+	cancel := sleeper.SleepStart()
+
+	// Wait for either a new reader/writer to be signalled via 'wakeup', or
+	// for the sleep to be cancelled.
+	select {
+	case <-wakeup:
+		sleeper.SleepFinish(true)
+	case <-cancel:
+		sleeper.SleepFinish(false)
+	}
+
+	// Take the lock and check if we were woken. If we were woken and
+	// interrupted, the former takes priority.
+ i.mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the channel to be opened, see Fifo.waitFor. +// +// i.mu must be held. +func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go new file mode 100644 index 000000000..cc1ebf4f6 --- /dev/null +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -0,0 +1,308 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type sleeper struct { + context.Context + ch chan struct{} +} + +func newSleeperContext(t *testing.T) context.Context { + return &sleeper{ + Context: contexttest.Context(t), + ch: make(chan struct{}), + } +} + +func (s *sleeper) SleepStart() <-chan struct{} { + return s.ch +} + +func (s *sleeper) SleepFinish(bool) { +} + +func (s *sleeper) Cancel() { + s.ch <- struct{}{} +} + +type openResult struct { + *fs.File + error +} + +func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { + file, err := n.GetFile(ctx, nil, flags) + if err != nil { + t.Fatalf("open with flags %+v failed: %v", flags, err) + } + if doneChan != nil { + doneChan <- struct{}{} + } + return file, err +} + +func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) { + file, err := n.GetFile(ctx, nil, flags) + if resChan != nil { + resChan <- openResult{file, err} + } + return file, err +} + +func newNamedPipe(t *testing.T) *Pipe { + return NewPipe(contexttest.Context(t), true, DefaultPipeSize, usermem.PageSize) +} + +func newAnonPipe(t *testing.T) *Pipe { + return NewPipe(contexttest.Context(t), false, DefaultPipeSize, usermem.PageSize) +} + +// assertRecvBlocks ensures that a recv attempt on c blocks for at least +// blockDuration. This is useful for checking that a goroutine that is supposed +// to be executing a blocking operation is actually blocking. +func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Duration, failMsg string) { + select { + case <-c: + t.Fatalf(failMsg) + case <-time.After(blockDuration): + // Ok, blocked for the required duration. 
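+		// Note that this check is necessarily heuristic: it only shows that
+		// the receive has not completed within blockDuration, not that it
+		// would block indefinitely.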
+ } +} + +func TestReadOpenBlocksForWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Verify that the open for read is blocking. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "open for read not blocking with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone +} + +func TestWriteOpenBlocksForReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Verify that the open for write is blocking + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write not blocking with no readers") + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone1 := make(chan struct{}) + rDone2 := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone1) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone2) + + assertRecvBlocks(t, rDone1, time.Millisecond*100, + "open for read didn't block with no writers") + assertRecvBlocks(t, rDone2, time.Millisecond*100, + "open for read didn't block with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone2 + <-rDone1 +} + +func TestClosedReaderBlocksWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) + rFile.DecRef() + + wDone := make(chan struct{}) + // This open for write should block because the reader is now gone. + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write didn't block with no concurrent readers") + + // Open for read again. This should unblock the open for write. + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestReadWriteOpenNeverBlocks(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rwDone := make(chan struct{}) + // Open for read-write never wait for a reader or writer, even if the + // nonblocking flag is not set. 
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true, NonBlocking: false}, rwDone) + <-rwDone +} + +func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-rDone +} + +func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-wDone +} + +func TestBlockedOpenIsCancellable(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + done := make(chan openResult) + go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) + select { + case <-done: + t.Fatalf("open for read didn't block with no writers") + case <-time.After(time.Millisecond * 100): + // Ok. + } + + ctx.(*sleeper).Cancel() + // If the cancel on the sleeper didn't work, the open for read would never + // return. + res := <-done + if res.error != syserror.ErrInterrupted { + t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", + res.error) + } +} + +func TestNonblockingReadOpenNoWriters(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } +} + +func TestNonblockingWriteOpenNoReaders(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { + t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) + } +} + +func TestNonBlockingReadOpenWithWriter(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Open for write blocks since there are no readers yet. + assertRecvBlocks(t, wDone, time.Millisecond*100, + "Open for write didn't block with no reader.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } + + // Open for write should now be unblocked. + <-wDone +} + +func TestNonBlockingWriteOpenWithReader(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Open for write blocked, since no reader yet. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "Open for reader didn't block with no writer.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for write failed with error %v.", err) + } + + // Open for write should now be unblocked. 
+ <-rDone +} + +func TestAnonReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newAnonPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { + t.Fatalf("open anon pipe for read failed: %v", err) + } +} + +func TestAnonWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newAnonPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil { + t.Fatalf("open anon pipe for write failed: %v", err) + } +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go new file mode 100644 index 000000000..1656c6ff3 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -0,0 +1,335 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides an in-memory implementation of a unidirectional +// pipe. +// +// The goal of this pipe is to emulate the pipe syscall in all of its +// edge cases and guarantees of atomic IO. +package pipe + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// DefaultPipeSize is the system-wide default size of a pipe in bytes. +const DefaultPipeSize = 65536 + +// Pipe is an encapsulation of a platform-independent pipe. +// It manages a buffered byte queue shared between a reader/writer +// pair. +type Pipe struct { + waiter.Queue `state:"nosave"` + + // Whether this is a named or anonymous pipe. + isNamed bool + + // The dirent backing this pipe. Shared by all readers and writers. + dirent *fs.Dirent + + // The buffered byte queue. + data ilist.List + + // Max size of the pipe in bytes. When this max has been reached, + // writers will get EWOULDBLOCK. + max int + + // Current size of the pipe in bytes. + size int + + // Max number of bytes the pipe can guarantee to read or write + // atomically. + atomicIOBytes int + + // The number of active readers for this pipe. Load/store atomically. + readers int32 + + // The number of active writes for this pipe. Load/store atomically. + writers int32 + + // This flag indicates if this pipe ever had a writer. Note that this does + // not necessarily indicate there is *currently* a writer, just that there + // has been a writer at some point since the pipe was created. + // + // Protected by mu. + hadWriter bool + + // Lock protecting all pipe internal state. + mu sync.Mutex `state:"nosave"` +} + +// NewPipe initializes and returns a pipe. A pipe created by this function is +// persistent, and will remain valid even without any open fds to it. Named +// pipes for mknod(2) are created via this function. 
Note that the +// implementation of blocking semantics for opening the read and write ends of a +// named pipe are left to filesystems. +func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *Pipe { + p := &Pipe{ + isNamed: isNamed, + max: sizeBytes, + atomicIOBytes: atomicIOBytes, + } + + // Build the fs.Dirent of this pipe, shared by all fs.Files associated + // with this pipe. + ino := pipeDevice.NextIno() + base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.PIPEFS_MAGIC, + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }) + sattr := fs.StableAttr{ + Type: fs.Pipe, + DeviceID: pipeDevice.DeviceID(), + InodeID: ino, + BlockSize: int64(atomicIOBytes), + } + // There is no real filesystem backing this pipe, so we pass in a nil + // Filesystem. + sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino)) + + return p +} + +// NewConnectedPipe initializes a pipe and returns a pair of objects (which +// implement kio.File) representing the read and write ends of the pipe. A pipe +// created by this function becomes invalid as soon as either the read or write +// end is closed, and errors on subsequent operations on either end. Pipes +// for pipe(2) and pipe2(2) are generally created this way. +func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*fs.File, *fs.File) { + p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + return p.ROpen(ctx), p.WOpen(ctx) +} + +// ROpen opens the pipe for reading. +func (p *Pipe) ROpen(ctx context.Context) *fs.File { + p.rOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) +} + +// WOpen opens the pipe for writing. +func (p *Pipe) WOpen(ctx context.Context) *fs.File { + p.wOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) +} + +// RWOpen opens the pipe for both reading and writing. +func (p *Pipe) RWOpen(ctx context.Context) *fs.File { + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ + Pipe: p, + }) +} + +// read reads data from the pipe into dst and returns the number of bytes +// read, or returns ErrWouldBlock if the pipe is empty. +func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + if !p.HasReaders() { + return 0, syscall.EBADF + } + + // Don't block for a zero-length read even if the pipe is empty. + if dst.NumBytes() == 0 { + return 0, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + // If there is nothing to read at the moment but there is a writer, tell the + // caller to block. + if p.size == 0 { + if !p.HasWriters() { + // There are no writers, return EOF. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + var n int64 + for b := p.data.Front(); b != nil; b = p.data.Front() { + buffer := b.(*Buffer) + n0, err := dst.CopyOut(ctx, buffer.bytes()) + n += int64(n0) + p.size -= n0 + if buffer.truncate(n0) == 0 { + p.data.Remove(b) + } + dst = dst.DropFirst(n0) + if dst.NumBytes() == 0 || err != nil { + return n, err + } + } + return n, nil +} + +// write writes data from sv into the pipe and returns the number of bytes +// written. 
If no bytes are written because the pipe is full (or has less than +// atomicIOBytes free capacity), write returns ErrWouldBlock. +func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + if !p.HasWriters() { + return 0, syscall.EBADF + } + if !p.HasReaders() { + return 0, syscall.EPIPE + } + + // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be + // atomic, but requires no atomicity for writes larger than this. However, + // Linux appears to provide stronger semantics than this in practice: + // unmerged writes are done one PAGE_SIZE buffer at a time, so for larger + // writes, the writing of each PIPE_BUF-sized chunk is atomic. We implement + // this by writing at most atomicIOBytes at a time if we can't service the + // write in its entirety. + canWrite := src.NumBytes() + if canWrite > int64(p.max-p.size) { + if p.max-p.size >= p.atomicIOBytes { + canWrite = int64(p.atomicIOBytes) + } else { + return 0, syserror.ErrWouldBlock + } + } + + // Copy data from user memory into a pipe-owned buffer. + buf := make([]byte, canWrite) + n, err := src.CopyIn(ctx, buf) + if n > 0 { + p.data.PushBack(newBuffer(buf[:n])) + p.size += n + } + if int64(n) < src.NumBytes() && err == nil { + // Partial write due to full pipe. + err = syserror.ErrWouldBlock + } + return int64(n), err +} + +// rOpen signals a new reader of the pipe. +func (p *Pipe) rOpen() { + atomic.AddInt32(&p.readers, 1) +} + +// wOpen signals a new writer of the pipe. +func (p *Pipe) wOpen() { + p.mu.Lock() + defer p.mu.Unlock() + p.hadWriter = true + atomic.AddInt32(&p.writers, 1) +} + +// rClose signals that a reader has closed their end of the pipe. +func (p *Pipe) rClose() { + newReaders := atomic.AddInt32(&p.readers, -1) + if newReaders < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) + } +} + +// wClose signals that a writer has closed their end of the pipe. +func (p *Pipe) wClose() { + newWriters := atomic.AddInt32(&p.writers, -1) + if newWriters < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) + } +} + +// HasReaders returns whether the pipe has any active readers. +func (p *Pipe) HasReaders() bool { + return atomic.LoadInt32(&p.readers) > 0 +} + +// HasWriters returns whether the pipe has any active writers. +func (p *Pipe) HasWriters() bool { + return atomic.LoadInt32(&p.writers) > 0 +} + +func (p *Pipe) rReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasReaders() && p.data.Front() != nil { + ready |= waiter.EventIn + } + if !p.HasWriters() && p.hadWriter { + // POLLHUP must be supressed until the pipe has had at least one writer + // at some point. Otherwise a reader thread may poll and immediately get + // a POLLHUP before the writer ever opens the pipe, which the reader may + // interpret as the writer opening then closing the pipe. + ready |= waiter.EventHUp + } + return ready +} + +// rReadiness returns a mask that states whether the read end of the pipe is +// ready for reading. +func (p *Pipe) rReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() +} + +func (p *Pipe) wReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasWriters() && p.size < p.max { + ready |= waiter.EventOut + } + if !p.HasReaders() { + ready |= waiter.EventErr + } + return ready +} + +// wReadiness returns a mask that states whether the write end of the pipe +// is ready for writing. 
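+//
+// As a sketch of a typical caller (hypothetical, but mirroring
+// Writer.Readiness in writer.go), a poller interested only in writability
+// masks the result:
+//
+//	if p.wReadiness()&waiter.EventOut != 0 {
+//		// The pipe currently reports itself writable.
+//	}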
+func (p *Pipe) wReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.wReadinessLocked() +} + +// rwReadiness returns a mask that states whether a read-write handle to the +// pipe is ready for IO. +func (p *Pipe) rwReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() | p.wReadinessLocked() +} + +func (p *Pipe) queuedSize() int { + p.mu.Lock() + defer p.mu.Unlock() + return p.size +} diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go new file mode 100644 index 000000000..49ef8c8ac --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "bytes" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestPipeRW(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + wantN := int64(len(msg)) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if n != wantN || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (%d, nil)", n, err, wantN) + } + + buf := make([]byte, len(msg)) + n, err = r.Readv(ctx, usermem.BytesIOSequence(buf)) + if n != wantN || err != nil || !bytes.Equal(buf, msg) { + t.Fatalf("Readv: got (%d, %v) %q, wanted (%d, nil) %q", n, err, buf, wantN, msg) + } +} + +func TestPipeReadBlock(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) + if n != 0 || err != syserror.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + } +} + +func TestPipeWriteBlock(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 10, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if wantN, wantErr := int64(atomicIOBytes), syserror.ErrWouldBlock; n != wantN || err != wantErr { + t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) + } +} + +func TestPipeWriteUntilEnd(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + + wDone := make(chan struct{}, 0) + rDone := make(chan struct{}, 0) + defer func() { + // Signal the reader to stop and wait until it does so. + close(wDone) + <-rDone + }() + + go func() { + defer close(rDone) + // Read from r until done is closed. 
+ ctx := contexttest.Context(t) + buf := make([]byte, len(msg)+1) + dst := usermem.BytesIOSequence(buf) + e, ch := waiter.NewChannelEntry(nil) + r.EventRegister(&e, waiter.EventIn) + defer r.EventUnregister(&e) + for { + n, err := r.Readv(ctx, dst) + dst = dst.DropFirst64(n) + if err == syserror.ErrWouldBlock { + select { + case <-ch: + continue + case <-wDone: + // We expect to have 1 byte left in dst since len(buf) == + // len(msg)+1. + if dst.NumBytes() != 1 || !bytes.Equal(buf[:len(msg)], msg) { + t.Errorf("Reader: got %q (%d bytes remaining), wanted %q", buf, dst.NumBytes(), msg) + } + return + } + } + if err != nil { + t.Fatalf("Readv: got unexpected error %v", err) + } + } + }() + + src := usermem.BytesIOSequence(msg) + e, ch := waiter.NewChannelEntry(nil) + w.EventRegister(&e, waiter.EventOut) + defer w.EventUnregister(&e) + for src.NumBytes() != 0 { + n, err := w.Writev(ctx, src) + src = src.DropFirst64(n) + if err == syserror.ErrWouldBlock { + <-ch + continue + } + if err != nil { + t.Fatalf("Writev: got (%d, %v)", n, err) + } + } +} diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go new file mode 100644 index 000000000..40d5e4943 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Reader satisfies the fs.FileOperations interface for read-only pipes. +// Reader should be used with !fs.FileFlags.Write to reject writes. +type Reader struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +func (r *Reader) Release() { + r.Pipe.rClose() + // Wake up writers. + r.Pipe.Notify(waiter.EventOut) +} + +// Readiness returns the ready events in the underlying pipe. +func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask { + return r.Pipe.rReadiness() & mask +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go new file mode 100644 index 000000000..dc642a3a6 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -0,0 +1,91 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
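+
+// The split between Reader, Writer, and ReaderWriter mirrors the three ways a
+// pipe end can be opened. As a rough end-to-end sketch (hypothetical usage
+// modeled on TestPipeRW in pipe_test.go; ctx is any valid context.Context),
+// the connected pair returned by NewConnectedPipe is driven through the
+// fs.File layer like so:
+//
+//	r, w := NewConnectedPipe(ctx, DefaultPipeSize, usermem.PageSize)
+//	defer r.DecRef()
+//	defer w.DecRef()
+//	if _, err := w.Writev(ctx, usermem.BytesIOSequence([]byte("hi"))); err != nil {
+//		// handle error
+//	}
+//	buf := make([]byte, 2)
+//	if _, err := r.Readv(ctx, usermem.BytesIOSequence(buf)); err != nil {
+//		// handle error
+//	}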
+ +package pipe + +import ( + "fmt" + "math" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// ReaderWriter satisfies the FileOperations interface and services both +// read and write requests. This should only be used directly for named pipes. +// pipe(2) and pipe2(2) only support unidirectional pipes and should use +// either pipe.Reader or pipe.Writer. +type ReaderWriter struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + *Pipe +} + +// Release implements fs.FileOperations.Release. +func (rw *ReaderWriter) Release() { + rw.Pipe.rClose() + rw.Pipe.wClose() + // Wake up readers and writers. + rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read implements fs.FileOperations.Read. +func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.read(ctx, dst) + if n > 0 { + rw.Pipe.Notify(waiter.EventOut) + } + return n, err +} + +// Write implements fs.FileOperations.Write. +func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.write(ctx, src) + if n > 0 { + rw.Pipe.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { + return rw.Pipe.rwReadiness() & mask +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case syscall.TIOCINQ: + v := rw.queuedSize() + if v > math.MaxInt32 { + panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v)) + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go new file mode 100644 index 000000000..fd13008ac --- /dev/null +++ b/pkg/sentry/kernel/pipe/writer.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Writer satisfies the fs.FileOperations interface for write-only pipes. +// Writer should be used with !fs.FileFlags.Read to reject reads. +type Writer struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +func (w *Writer) Release() { + w.Pipe.wClose() + // Wake up readers. 
+ w.Pipe.Notify(waiter.EventHUp) +} + +// Readiness returns the ready events in the underlying pipe. +func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask { + return w.Pipe.wReadiness() & mask +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..20b1c4cd4 --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1054 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptrace constants from Linux's include/uapi/linux/ptrace.h. +const ( + _PTRACE_EVENT_SECCOMP = 7 + PTRACE_SEIZE = 0x4206 + PTRACE_INTERRUPT = 0x4207 + PTRACE_LISTEN = 0x4208 + PTRACE_PEEKSIGINFO = 0x4209 + PTRACE_GETSIGMASK = 0x420a + PTRACE_SETSIGMASK = 0x420b + _PTRACE_O_EXITKILL = 1 << 20 + _PTRACE_O_TRACESECCOMP = 1 << _PTRACE_EVENT_SECCOMP +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. + TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. 
The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. + // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) 
+ // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + // TODO: dumpability check + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + // TODO: Yama LSM + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killable() will no longer return true. This + // is consistent with Linux: "Bugs: ... A SIGKILL signal may still cause a + // PTRACE_EVENT_EXIT stop before actual signal death. This may be changed + // in the future; SIGKILL is meant to always immediately kill tasks even + // under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. 
Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. +func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target). t is the caller. +func (t *Task) ptraceAttach(target *Task) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. 
+ if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. + if target.stop == (*groupStop)(nil) { + target.groupStopRequired = true + target.endInternalStopLocked() + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) forgetTracerLocked() { + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + // If t is ptrace-stopped, but its thread group is in a group stop and t is + // eligible to participate, make it do so. This is essentially the reverse + // of the special case in ptraceAttach, which converts a group stop to a + // ptrace stop. ("Handling of restart from group-stop is currently buggy, + // but the "as planned" behavior is to leave tracee stopped and waiting for + // SIGCONT." - ptrace(2)) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return + } + if _, ok := t.stop.(*ptraceStop); ok { + if t.exitState < TaskExitInitiated && t.tg.groupStopPhase >= groupStopInitiated { + t.groupStopRequired = true + } + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. 
+func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + if t.tg.groupStopPhase == groupStopNone { + t.tg.groupStopPhase = groupStopDequeued + } + // Can't lock the TaskSet mutex while holding a signal mutex. + t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(_PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. +func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. + ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. 
+ ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." + child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // If the child is PT_SEIZED (currently not possible in the sentry + // because PTRACE_SEIZE is unimplemented, but for future + // reference), Linux just sets JOBCTL_TRAP_STOP instead, so the + // child skips signal-delivery-stop and goes directly to + // group-stop. + // + // The child will self-t.interrupt() when its task goroutine starts + // running, so we don't have to. + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }) + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. +func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. 
(If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. + if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... + // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. 
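+ //
+ // Illustrative note (not part of the original change): a tracee typically
+ // arranges for PTRACE_TRACEME before exec; in Go userspace this is normally
+ // done through exec.Cmd rather than by calling ptrace directly, e.g.:
+ //
+ //	cmd.SysProcAttr = &syscall.SysProcAttr{Ptrace: true}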
+ if req == syscall.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require + // that target is not already a tracee. + if req == syscall.PTRACE_ATTACH { + return t.ptraceAttach(target) + } + // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that + // the target is a tracee, but does not require that it is ptrace-stopped. + if req == syscall.PTRACE_KILL { + return t.ptraceKill(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. + t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. + switch req { + case syscall.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + } + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case syscall.PTRACE_PEEKTEXT, syscall.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." 
- ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case syscall.PTRACE_POKETEXT, syscall.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case syscall.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case syscall.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case syscall.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... + // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." - ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case syscall.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case syscall.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case syscall.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + 
if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + target.mu.Lock() + defer target.mu.Unlock() + _, err := t.CopyOut(data, target.tr.SignalMask) + return err + + case PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case syscall.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + validOpts := uintptr(_PTRACE_O_EXITKILL | syscall.PTRACE_O_TRACESYSGOOD | syscall.PTRACE_O_TRACECLONE | + syscall.PTRACE_O_TRACEEXEC | syscall.PTRACE_O_TRACEEXIT | syscall.PTRACE_O_TRACEFORK | + _PTRACE_O_TRACESECCOMP | syscall.PTRACE_O_TRACEVFORK | syscall.PTRACE_O_TRACEVFORKDONE) + if uintptr(data)&^validOpts != 0 { + return syserror.EINVAL + } + target.ptraceOpts = ptraceOptions{ + ExitKill: data&_PTRACE_O_EXITKILL != 0, + SysGood: data&syscall.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: data&syscall.PTRACE_O_TRACECLONE != 0, + TraceExec: data&syscall.PTRACE_O_TRACEEXEC != 0, + TraceExit: data&syscall.PTRACE_O_TRACEEXIT != 0, + TraceFork: data&syscall.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: data&_PTRACE_O_TRACESECCOMP != 0, + TraceVfork: data&syscall.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: data&syscall.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil + + case syscall.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + default: + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go new file mode 100644 index 000000000..635372993 --- /dev/null +++ b/pkg/sentry/kernel/rseq.go @@ -0,0 +1,118 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Restartable sequences, as described in https://lwn.net/Articles/650333/. + +// RSEQCriticalRegion describes a restartable sequence critical region. +type RSEQCriticalRegion struct { + // When a task in this thread group has its CPU preempted (as defined by + // platform.ErrContextCPUPreempted) or has a signal delivered to an + // application handler while its instruction pointer is in CriticalSection, + // set the instruction pointer to Restart and application register r10 (on + // amd64) to the former instruction pointer. + CriticalSection usermem.AddrRange + Restart usermem.Addr +} + +// RSEQAvailable returns true if t supports restartable sequences. 
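+//
+// Illustrative note (not part of the original change): the setters below
+// (SetRSEQCriticalRegion, SetRSEQCPUAddr) require this to be true, so a
+// hypothetical caller would gate on it first, e.g.:
+//
+//	if !t.RSEQAvailable() {
+//		return syserror.ENOSYS
+//	}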
+func (t *Task) RSEQAvailable() bool { + return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() +} + +// RSEQCriticalRegion returns a copy of t's thread group's current restartable +// sequence. +func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion { + return *t.tg.rscr.Load().(*RSEQCriticalRegion) +} + +// SetRSEQCriticalRegion replaces t's thread group's restartable sequence. +// +// Preconditions: t.RSEQAvailable() == true. +func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { + // These checks are somewhat more lenient than in Linux, which (bizarrely) + // requires rscr.CriticalSection to be non-empty and rscr.Restart to be + // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0 + // (which disables the critical region). + if rscr.CriticalSection.Start == 0 { + rscr.CriticalSection.End = 0 + rscr.Restart = 0 + t.tg.rscr.Store(&rscr) + return nil + } + if rscr.CriticalSection.Start >= rscr.CriticalSection.End { + return syserror.EINVAL + } + if rscr.CriticalSection.Contains(rscr.Restart) { + return syserror.EINVAL + } + // TODO: check that rscr.CriticalSection and rscr.Restart are in + // the application address range, for consistency with Linux + t.tg.rscr.Store(&rscr) + return nil +} + +// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU +// number. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) RSEQCPUAddr() usermem.Addr { + return t.rseqCPUAddr +} + +// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU +// number. +// +// Preconditions: t.RSEQAvailable() == true. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { + t.rseqCPUAddr = addr + if addr != 0 { + if err := t.rseqCopyOutCPU(); err != nil { + t.rseqCPUAddr = 0 + t.rseqCPU = -1 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT + } + } else { + t.rseqCPU = -1 + } + return nil +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqCopyOutCPU() error { + t.rseqCPU = int32(hostcpu.GetCPU()) + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + _, err := t.CopyOutBytes(t.rseqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) rseqInterrupt() { + rscr := t.tg.rscr.Load().(*RSEQCriticalRegion) + if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart) + t.Arch().SetIP(uintptr(rscr.Restart)) + t.Arch().SetRSEQInterruptedIP(ip) + } +} diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD new file mode 100644 index 000000000..b533c51c4 --- /dev/null +++ b/pkg/sentry/kernel/sched/BUILD @@ -0,0 +1,20 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "sched", + srcs = [ + "cpuset.go", + "sched.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched", + visibility = ["//pkg/sentry:internal"], +) + +go_test( + name = "sched_test", + size = "small", + srcs = ["cpuset_test.go"], + embed = [":sched"], +) diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go new file mode 100644 index 000000000..0a97603f0 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -0,0 +1,105 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import "math/bits" + +const ( + bitsPerByte = 8 + bytesPerLong = 8 // only for 64-bit architectures +) + +// CPUSet contains a bitmap to record CPU information. +// +// Note that this definition is only correct for little-endian architectures, +// since Linux's cpumask_t uses unsigned long. +type CPUSet []byte + +// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. +func CPUSetSize(num uint) uint { + // NOTE: Applications may expect that the size of a CPUSet in + // bytes is always a multiple of sizeof(unsigned long), since this is true + // in Linux. Thus we always round up. + bytes := (num + bitsPerByte - 1) / bitsPerByte + longs := (bytes + bytesPerLong - 1) / bytesPerLong + return longs * bytesPerLong +} + +// NewCPUSet returns a CPUSet for the given number of CPUs which initially +// contains no CPUs. +func NewCPUSet(num uint) CPUSet { + return CPUSet(make([]byte, CPUSetSize(num))) +} + +// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which +// are present in the set. +func NewFullCPUSet(num uint) CPUSet { + c := NewCPUSet(num) + var i uint + for ; i < num/bitsPerByte; i++ { + c[i] = 0xff + } + if rem := num % bitsPerByte; rem != 0 { + c[i] = (1 << rem) - 1 + } + return c +} + +// Size returns the size of 'c' in bytes. +func (c CPUSet) Size() uint { + return uint(len(c)) +} + +// NumCPUs returns how many cpus are set in the CPUSet. +func (c CPUSet) NumCPUs() uint { + var n int + for _, b := range c { + n += bits.OnesCount8(b) + } + return uint(n) +} + +// Copy returns a copy of the CPUSet. +func (c CPUSet) Copy() CPUSet { + return append(CPUSet(nil), c...) +} + +// Set sets the bit corresponding to cpu. 
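+//
+// For example (illustrative, not part of the original change):
+//
+//	c := NewCPUSet(8) // sized for CPUs 0-7, rounded up to one unsigned long
+//	c.Set(0)
+//	c.Set(3)
+//	// c.NumCPUs() == 2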
+func (c *CPUSet) Set(cpu uint) { + (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) +} + +// ClearAbove clears bits corresponding to cpu and all higher cpus. +func (c *CPUSet) ClearAbove(cpu uint) { + i := cpu / bitsPerByte + if i >= c.Size() { + return + } + (*c)[i] &^= 0xff << (cpu % bitsPerByte) + for i++; i < c.Size(); i++ { + (*c)[i] = 0 + } +} + +// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if +// it's set. +func (c CPUSet) ForEachCPU(fn func(uint)) { + for i := uint(0); i < c.Size()*bitsPerByte; i++ { + bit := uint(1) << (i & (bitsPerByte - 1)) + if uint(c[i/bitsPerByte])&bit == bit { + fn(i) + } + } +} diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go new file mode 100644 index 000000000..8a6e12958 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import ( + "testing" +) + +func TestNumCPUs(t *testing.T) { + for i := uint(0); i < 1024; i++ { + c := NewCPUSet(i) + for j := uint(0); j < i; j++ { + c.Set(j) + } + n := c.NumCPUs() + if n != i { + t.Errorf("got wrong number of cpus %d, want %d", n, i) + } + } +} + +func TestClearAbove(t *testing.T) { + const n = 1024 + c := NewFullCPUSet(n) + for i := uint(0); i < n; i++ { + cpu := n - i + c.ClearAbove(cpu) + if got := c.NumCPUs(); got != cpu { + t.Errorf("iteration %d: got %d cpus, wanted %d", i, got, cpu) + } + } +} diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go new file mode 100644 index 000000000..f1de1da60 --- /dev/null +++ b/pkg/sentry/kernel/sched/sched.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sched implements scheduler related features. +package sched diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go new file mode 100644 index 000000000..b7c4a507f --- /dev/null +++ b/pkg/sentry/kernel/seccomp.go @@ -0,0 +1,205 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const maxSyscallFilterInstructions = 1 << 15 + +type seccompResult int + +const ( + // seccompResultDeny indicates that a syscall should not be executed. + seccompResultDeny seccompResult = iota + + // seccompResultAllow indicates that a syscall should be executed. + seccompResultAllow + + // seccompResultKill indicates that the task should be killed immediately, + // with the exit status indicating that the task was killed by SIGSYS. + seccompResultKill + + // seccompResultTrace indicates that a ptracer was successfully notified as + // a result of a SECCOMP_RET_TRACE. + seccompResultTrace +) + +// seccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +type seccompData struct { + // nr is the system call number. + nr int32 + + // arch is an AUDIT_ARCH_* value indicating the system call convention. + arch uint32 + + // instructionPointer is the value of the instruction pointer at the time + // of the system call. + instructionPointer uint64 + + // args contains the first 6 system call arguments. + args [6]uint64 +} + +func (d *seccompData) asBPFInput() bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +} + +func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { + si := &arch.SignalInfo{ + Signo: int32(linux.SIGSYS), + Errno: errno, + Code: arch.SYS_SECCOMP, + } + si.SetCallAddr(uint64(ip)) + si.SetSyscall(sysno) + si.SetArch(t.SyscallTable().AuditNumber) + return si +} + +// checkSeccompSyscall applies the task's seccomp filters before the execution +// of syscall sysno at instruction pointer ip. (These parameters must be passed +// in because vsyscalls do not use the values in t.Arch().) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult { + result := t.evaluateSyscallFilters(sysno, args, ip) + switch result & linux.SECCOMP_RET_ACTION { + case linux.SECCOMP_RET_TRAP: + // "Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. ... The SECCOMP_RET_DATA + // portion of the return value will be passed as si_errno." - + // Documentation/prctl/seccomp_filter.txt + t.SendSignal(seccompSiginfo(t, int32(result&linux.SECCOMP_RET_DATA), sysno, ip)) + return seccompResultDeny + + case linux.SECCOMP_RET_ERRNO: + // "Results in the lower 16-bits of the return value being passed to + // userland as the errno without executing the system call." + t.Arch().SetReturn(-uintptr(result & linux.SECCOMP_RET_DATA)) + return seccompResultDeny + + case linux.SECCOMP_RET_TRACE: + // "When returned, this value will cause the kernel to attempt to + // notify a ptrace()-based tracer prior to executing the system call. + // If there is no tracer present, -ENOSYS is returned to userland and + // the system call is not executed." 
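+ //
+ // Illustrative note (not part of the original change): a filter requests
+ // this by returning, e.g.:
+ //
+ //	SECCOMP_RET_TRACE | 0x1234
+ //
+ // after which the tracer can retrieve 0x1234 with PTRACE_GETEVENTMSG.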
+ if t.ptraceSeccomp(uint16(result & linux.SECCOMP_RET_DATA)) { + return seccompResultTrace + } + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return seccompResultDeny + + case linux.SECCOMP_RET_ALLOW: + // "Results in the system call being executed." + return seccompResultAllow + + case linux.SECCOMP_RET_KILL: + // "Results in the task exiting immediately without executing the + // system call. The exit status of the task will be SIGSYS, not + // SIGKILL." + fallthrough + default: // consistent with Linux + return seccompResultKill + } +} + +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { + data := seccompData{ + nr: sysno, + arch: t.tc.st.AuditNumber, + instructionPointer: uint64(ip), + } + // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so + // we can't do any slicing tricks or even use copy/append here. + for i, arg := range args { + if i >= len(data.args) { + break + } + data.args[i] = arg.Uint64() + } + input := data.asBPFInput() + + ret := uint32(linux.SECCOMP_RET_ALLOW) + // "Every filter successfully installed will be evaluated (in reverse + // order) for each system call the task makes." - kernel/seccomp.c + for i := len(t.syscallFilters) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(t.syscallFilters[i], input) + if err != nil { + t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) + thisRet = linux.SECCOMP_RET_KILL + } + // "If multiple filters exist, the return value for the evaluation of a + // given system call will always use the highest precedent value." - + // Documentation/prctl/seccomp_filter.txt + // + // (Note that this contradicts prctl(2): "If the filters permit prctl() + // calls, then additional filters can be added; they are run in order + // until the first non-allow result is seen." prctl(2) is incorrect.) + // + // "The ordering ensures that a min_t() over composed return values + // always selects the least permissive choice." - + // include/uapi/linux/seccomp.h + if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { + ret = thisRet + } + } + + return ret +} + +// AppendSyscallFilter adds BPF program p as a system call filter. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) AppendSyscallFilter(p bpf.Program) error { + // Cap the combined length of all syscall filters (plus a penalty of 4 + // instructions per filter beyond the first) to + // maxSyscallFilterInstructions. (This restriction is inherited from + // Linux.) + totalLength := p.Length() + for _, f := range t.syscallFilters { + totalLength += f.Length() + 4 + } + if totalLength > maxSyscallFilterInstructions { + return syserror.ENOMEM + } + t.mu.Lock() + defer t.mu.Unlock() + t.syscallFilters = append(t.syscallFilters, p) + return nil +} + +// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current +// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) +// and /proc/[pid]/status. 
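+//
+// Illustrative note (not part of the original change): with at least one
+// filter installed, userspace observes SECCOMP_MODE_FILTER, e.g.:
+//
+//	prctl(PR_GET_SECCOMP) // returns 2 (SECCOMP_MODE_FILTER)
+//	// /proc/[pid]/status reports "Seccomp: 2"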
+func (t *Task) SeccompMode() int { + t.mu.Lock() + defer t.mu.Unlock() + if len(t.syscallFilters) > 0 { + return linux.SECCOMP_MODE_FILTER + } + return linux.SECCOMP_MODE_NONE +} diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD new file mode 100644 index 000000000..1656ad126 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -0,0 +1,62 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "semaphore", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*waiter", + }, +) + +go_stateify( + name = "semaphore_state", + srcs = [ + "semaphore.go", + "waiter_list.go", + ], + out = "semaphore_autogen_state.go", + package = "semaphore", +) + +go_library( + name = "semaphore", + srcs = [ + "semaphore.go", + "semaphore_autogen_state.go", + "waiter_list.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/syserror", + ], +) + +go_test( + name = "semaphore_test", + size = "small", + srcs = ["semaphore_test.go"], + embed = [":semaphore"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go new file mode 100644 index 000000000..19ad5d537 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -0,0 +1,473 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package semaphore implements System V semaphores. +package semaphore + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + valueMax = 32767 // SEMVMX + + // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). + semaphoresMax = 32000 + + // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). + setsMax = 32000 + + // semaphoresTotalMax is "system-wide limit on the number of semaphores" + // (SEMMNS = SEMMNI*SEMMSL). + semaphoresTotalMax = 1024000000 +) + +// Registry maintains a set of semaphores that can be found by key or ID. +type Registry struct { + // mu protects all fields below. 
+ mu sync.Mutex `state:"nosave"` + semaphores map[int32]*Set + lastIDUsed int32 +} + +// Set represents a set of semaphores that can be operated atomically. +type Set struct { + // Id is a handle that identifies the set. + ID int32 + + // key is an user provided key that can be shared between processes. + key int32 + + // creator is the user that created the set. Immutable. + creator fs.FileOwner + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + owner fs.FileOwner + perms fs.FilePermissions + opTime ktime.Time + changeTime ktime.Time + sems []sem + + // dead is set to true when the set is removed and can't be reached anymore. + // All waiters must wake up and fail when set is dead. + dead bool +} + +// sem represents a single semanphore from a set. +type sem struct { + value int16 + waiters waiterList `state:"zerovalue"` +} + +// waiter represents a caller that is waiting for the semaphore value to +// become positive or zero. +type waiter struct { + waiterEntry + + // value represents how much resource the waiter needs to wake up. + value int16 + ch chan struct{} +} + +// NewRegistry creates a new semaphore set registry. +func NewRegistry() *Registry { + return &Registry{semaphores: make(map[int32]*Set)} +} + +// FindOrCreate searches for a semaphore set that matches 'key'. If not found, +// it may create a new one if requested. If private is true, key is ignored and +// a new set is always created. If create is false, it fails if a set cannot +// be found. If exclusive is true, it fails if a set with the same key already +// exists. +func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { + if nsems < 0 || nsems > semaphoresMax { + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + // Look up an existing semaphore. + if set := r.findByKey(key); set != nil { + // Check that caller can access semaphore set. + creds := auth.CredentialsFromContext(ctx) + if !set.checkPerms(creds, fs.PermsFromMode(mode)) { + return nil, syserror.EACCES + } + + // Validate parameters. + if nsems > int32(set.size()) { + return nil, syserror.EINVAL + } + if create && exclusive { + return nil, syserror.EEXIST + } + return set, nil + } + + if !create { + // Semaphore not found and should not be created. + return nil, syserror.ENOENT + } + } + + // Zero is only valid if an existing set is found. + if nsems == 0 { + return nil, syserror.EINVAL + } + + // Apply system limits. + if len(r.semaphores) >= setsMax { + return nil, syserror.EINVAL + } + if r.totalSems() > int(semaphoresTotalMax-nsems) { + return nil, syserror.EINVAL + } + + // Finally create a new set. + owner := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newSet(ctx, key, owner, owner, perms, nsems) +} + +// RemoveID removes set with give 'id' from the registry and marks the set as +// dead. All waiters will be awakened and fail. +func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + set := r.semaphores[id] + if set == nil { + return syserror.EINVAL + } + + // "The effective user ID of the calling process must match the creator or + // owner of the semaphore set, or the caller must be privileged." 
+ if !set.checkCredentials(creds) && !set.checkCapability(creds) { + return syserror.EACCES + } + + delete(r.semaphores, set.ID) + set.destroy() + return nil +} + +func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { + set := &Set{ + key: key, + owner: owner, + creator: owner, + perms: perms, + changeTime: ktime.NowFromContext(ctx), + sems: make([]sem, nsems), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.semaphores[id] == nil { + r.lastIDUsed = id + r.semaphores[id] = set + set.ID = id + return set, nil + } + } + + log.Warningf("Semaphore map is full, they must be leaking") + return nil, syserror.ENOMEM +} + +// FindByID looks up a set given an ID. +func (r *Registry) FindByID(id int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + return r.semaphores[id] +} + +func (r *Registry) findByKey(key int32) *Set { + for _, v := range r.semaphores { + if v.key == key { + return v + } + } + return nil +} + +func (r *Registry) totalSems() int { + totalSems := 0 + for _, v := range r.semaphores { + totalSems += v.size() + } + return totalSems +} + +func (s *Set) findSem(num int32) *sem { + if num < 0 || int(num) >= s.size() { + return nil + } + return &s.sems[num] +} + +func (s *Set) size() int { + return len(s.sems) +} + +// Change changes some fields from the set atomically. +func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { + s.mu.Lock() + defer s.mu.Unlock() + + // "The effective UID of the calling process must match the owner or creator + // of the semaphore set, or the caller must be privileged." + if !s.checkCredentials(creds) && !s.checkCapability(creds) { + return syserror.EACCES + } + + s.owner = owner + s.perms = perms + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// SetVal overrides a semaphore value, waking up waiters as needed. +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return syserror.ERANGE + } + + // TODO: Clear undo entries in all processes + sem.value = val + s.changeTime = ktime.NowFromContext(ctx) + sem.wakeWaiters() + return nil +} + +// GetVal returns a semaphore value. +func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.value, nil +} + +// ExecuteOps attempts to execute a list of operations to the set. It only +// suceeds when all operations can be applied. No changes are made if it fails. +// +// On failure, it may return an error (retries are hopeless) or it may return +// a channel that can be waited on before attempting again. 
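+//
+// Illustrative caller sketch (not part of the original change; cancel is a
+// hypothetical interruption channel):
+//
+//	for {
+//		ch, num, err := set.ExecuteOps(ctx, ops, creds)
+//		if err != nil || ch == nil {
+//			return err // all ops applied, or failed for good
+//		}
+//		select {
+//		case <-ch:
+//			// Woken by a peer; retry the operations.
+//		case <-cancel:
+//			set.AbortWait(num, ch)
+//			return syserror.EINTR
+//		}
+//	}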
+func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // Did it race with a removal operation? + if s.dead { + return nil, 0, syserror.EIDRM + } + + // Validate the operations. + readOnly := true + for _, op := range ops { + if s.findSem(int32(op.SemNum)) == nil { + return nil, 0, syserror.EFBIG + } + if op.SemOp != 0 { + readOnly = false + } + } + + if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, syserror.EACCES + } + + ch, num, err := s.executeOps(ctx, ops) + if err != nil { + return nil, 0, err + } + return ch, num, nil +} + +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{}, int32, error) { + // Changes to semaphores go to this slice temporarily until they all succeed. + tmpVals := make([]int16, len(s.sems)) + for i := range s.sems { + tmpVals[i] = s.sems[i].value + } + + for _, op := range ops { + sem := &s.sems[op.SemNum] + if op.SemOp == 0 { + // Handle 'wait for zero' operation. + if tmpVals[op.SemNum] != 0 { + // Semaphore isn't 0, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + if op.SemOp < 0 { + // Handle 'wait' operation. + if -op.SemOp > valueMax { + return nil, 0, syserror.ERANGE + } + if -op.SemOp > tmpVals[op.SemNum] { + // Not enough resources, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + // op.SemOp > 0: Handle 'signal' operation. + if tmpVals[op.SemNum] > valueMax-op.SemOp { + return nil, 0, syserror.ERANGE + } + } + + tmpVals[op.SemNum] += op.SemOp + } + } + + // All operations succeeded, apply them. + // TODO: handle undo operations. + for i, v := range tmpVals { + s.sems[i].value = v + s.sems[i].wakeWaiters() + } + s.opTime = ktime.NowFromContext(ctx) + return nil, 0, nil +} + +// AbortWait notifies that a waiter is giving up and will not wait on the +// channel anymore. +func (s *Set) AbortWait(num int32, ch chan struct{}) { + s.mu.Lock() + defer s.mu.Unlock() + + sem := &s.sems[num] + for w := sem.waiters.Front(); w != nil; w = w.Next() { + if w.ch == ch { + sem.waiters.Remove(w) + return + } + } + // Waiter may not be found in case it raced with wakeWaiters(). +} + +func (s *Set) checkCredentials(creds *auth.Credentials) bool { + return s.owner.UID == creds.EffectiveKUID || + s.owner.GID == creds.EffectiveKGID || + s.creator.UID == creds.EffectiveKUID || + s.creator.GID == creds.EffectiveKGID +} + +func (s *Set) checkCapability(creds *auth.Credentials) bool { + return creds.HasCapability(linux.CAP_IPC_OWNER) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() +} + +func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { + // Are we owner, or in group, or other? + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + return s.checkCapability(creds) +} + +func (s *Set) destroy() { + s.mu.Lock() + defer s.mu.Unlock() + + // Notify all waiters. Tney will fail on the next attempt to execute + // operations and return error. 
+ s.dead = true + for _, s := range s.sems { + for w := s.waiters.Front(); w != nil; w = w.Next() { + w.ch <- struct{}{} + } + s.waiters.Reset() + } +} + +// wakeWaiters goes over all waiters and checks which of them can be notified. +func (s *sem) wakeWaiters() { + // Note that this will release all waiters waiting for 0 too. + for w := s.waiters.Front(); w != nil; { + if s.value < w.value { + // Still blocked, skip it. + continue + } + w.ch <- struct{}{} + old := w + w = w.Next() + s.waiters.Remove(old) + } +} + +func newWaiter(val int16) *waiter { + return &waiter{ + value: val, + ch: make(chan struct{}, 1), + } +} diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go new file mode 100644 index 000000000..0386586ab --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package semaphore + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { + ch, _, err := set.executeOps(ctx, ops) + if err != nil { + t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) + } + if block { + if ch == nil { + t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops) + } + if signalled(ch) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } + } else { + if ch != nil { + t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops) + } + } + return ch +} + +func signalled(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +func TestBasic(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } +} + +func TestWaitForZero(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 0}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 0 + chZero1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + chZero2 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, 
set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set) + } + + ops[0].SemOp = -2 + executeOps(ctx, t, set, ops, false) + if !signalled(chZero1) { + t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set) + } + if !signalled(chZero2) { + t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set) + } +} + +func TestNoWait(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } + + ops[0].SemOp = 0 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } +} + +func TestUnregister(t *testing.T) { + ctx := contexttest.Context(t) + r := NewRegistry() + set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { + t.Fatalf("FindOrCreate() failed, err: %v", err) + } + if got := r.FindByID(set.ID); got.ID != set.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + } + + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: -1}, + } + chs := make([]chan struct{}, 0, 5) + for i := 0; i < 5; i++ { + ch := executeOps(ctx, t, set, ops, true) + chs = append(chs, ch) + } + + creds := auth.CredentialsFromContext(ctx) + if err := r.RemoveID(set.ID, creds); err != nil { + t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + } + if !set.dead { + t.Fatalf("set is not dead: %+v", set) + } + if got := r.FindByID(set.ID); got != nil { + t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got) + } + for i, ch := range chs { + if !signalled(ch) { + t.Fatalf("channel %d should have been signalled", i) + } + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go new file mode 100644 index 000000000..53d8fb844 --- /dev/null +++ b/pkg/sentry/kernel/sessions.go @@ -0,0 +1,462 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SessionID is the public identifier. +type SessionID ThreadID + +// ProcessGroupID is the public identifier. +type ProcessGroupID ThreadID + +// Session contains a leader threadgroup and a list of ProcessGroups. +type Session struct { + refs refs.AtomicRefCount + + // leader is the originator of the Session. 
+ // + // Note that this may no longer be running (and may be reaped), so the + // ID is cached upon initial creation. The leader is still required + // however, since its PIDNamespace defines the scope of the Session. + // + // The leader is immutable. + leader *ThreadGroup + + // id is the cached identifier in the leader's namespace. + // + // The id is immutable. + id SessionID + + // ProcessGroups is a list of process groups in this Session. This is + // protected by TaskSet.mu. + processGroups processGroupList + + // sessionEntry is the embed for TaskSet.sessions. This is protected by + // TaskSet.mu. + sessionEntry +} + +// incRef grabs a reference. +func (s *Session) incRef() { + s.refs.IncRef() +} + +// decRef drops a reference. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (s *Session) decRef() { + s.refs.DecRefWithDestructor(func() { + // Remove translations from the leader. + for ns := s.leader.pidns; ns != nil; ns = ns.parent { + id := ns.sids[s] + delete(ns.sids, s) + delete(ns.sessions, id) + } + + // Remove from the list of global Sessions. + s.leader.pidns.owner.sessions.Remove(s) + }) +} + +// ProcessGroup contains an originator threadgroup and a parent Session. +type ProcessGroup struct { + refs refs.AtomicRefCount // not exported. + + // originator is the originator of the group. + // + // See note re: leader in Session. The same applies here. + // + // The originator is immutable. + originator *ThreadGroup + + // id is the cached identifier in the originator's namespace. + // + // The id is immutable. + id ProcessGroupID + + // Session is the parent Session. + // + // The session is immutable. + session *Session + + // ancestors is the number of thread groups in this process group whose + // parent is in a different process group in the same session. + // + // The name is derived from the fact that process groups where + // ancestors is zero are considered "orphans". + // + // ancestors is protected by TaskSet.mu. + ancestors uint32 + + // processGroupEntry is the embedded entry for Sessions.groups. This is + // protected by TaskSet.mu. + processGroupEntry +} + +// incRefWithParent grabs a reference. +// +// This function is called when this ProcessGroup is being associated with some +// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent +// ThreadGroup. If tg is init, then parentPG may be nil. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) { + // We acquire an "ancestor" reference in the case of a nil parent. + // This is because the process being associated is init, and init can + // never be orphaned (we count it as always having an ancestor). + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors++ + } + + pg.refs.IncRef() +} + +// decRefWithParent drops a reference. +// +// parentPG is per incRefWithParent. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { + // See incRefWithParent regarding parent == nil. + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors-- + } + + alive := true + pg.refs.DecRefWithDestructor(func() { + alive = false // don't bother with handleOrphan. + + // Remove translations from the originator. 
+ for ns := pg.originator.pidns; ns != nil; ns = ns.parent { + id := ns.pgids[pg] + delete(ns.pgids, pg) + delete(ns.processGroups, id) + } + + // Remove the list of process groups. + pg.session.processGroups.Remove(pg) + pg.session.decRef() + }) + if alive { + pg.handleOrphan() + } +} + +// parentPG returns the parent process group. +// +// Precondition: callers must hold TaskSet.mu. +func (tg *ThreadGroup) parentPG() *ProcessGroup { + if tg.leader.parent != nil { + return tg.leader.parent.tg.processGroup + } + return nil +} + +// handleOrphan checks whether the process group is an orphan and has any +// stopped jobs. If yes, then appropriate signals are delivered to each thread +// group within the process group. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) handleOrphan() { + // Check if this process is an orphan. + if pg.ancestors != 0 { + return + } + + // See if there are any stopped jobs. + hasStopped := false + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + if tg.groupStopPhase == groupStopComplete { + hasStopped = true + } + tg.signalHandlers.mu.Unlock() + }) + if !hasStopped { + return + } + + // Deliver appropriate signals to all thread groups. + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + tg.leader.sendSignalLocked(sigPriv(linux.SIGHUP), true /* group */) + tg.leader.sendSignalLocked(sigPriv(linux.SIGCONT), true /* group */) + tg.signalHandlers.mu.Unlock() + }) + + return +} + +// CreateSession creates a new Session, with the ThreadGroup as the leader. +// +// EPERM may be returned if either the given ThreadGroup is already a Session +// leader, or a ProcessGroup already exists for the ThreadGroup's ID. +func (tg *ThreadGroup) CreateSession() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + return tg.createSession() +} + +// createSession creates a new session for a threadgroup. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (tg *ThreadGroup) createSession() error { + // Get the ID for this thread in the current namespace. + id := tg.pidns.tids[tg.leader] + + // Check if this ThreadGroup already leads a Session, or + // if the proposed group is already taken. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + if s.id == SessionID(id) { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new Session, with a single reference. + s := &Session{ + id: SessionID(id), + leader: tg, + } + + // Create a new ProcessGroup, belonging to that Session. + // This also has a single reference (assigned below). + // + // Note that since this is a new session and a new process group, there + // will be zero ancestors for this process group. (It is an orphan at + // this point.) + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: s, + ancestors: 0, + } + + // Tie them and return the result. + s.processGroups.PushBack(pg) + tg.pidns.owner.sessions.PushBack(s) + + // Leave the current group, and assign the new one. 
+ if tg.processGroup != nil { + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + } else { + // The current process group may be nil only in the case of an + // unparented thread group (i.e. the init process). This would + // not normally occur, but we allow it for the convenience of + // CreateSession working from that point. There will be no + // child processes. We always say that the very first group + // created has ancestors (avoids checks elsewhere). + // + // Note that this mirrors the parent == nil logic in + // incRef/decRef/reparent, which counts nil as an ancestor. + tg.processGroup = pg + tg.processGroup.ancestors++ + } + + // Ensure a translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tids[tg.leader] + ns.sids[s] = SessionID(local) + ns.sessions[SessionID(local)] = s + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// CreateProcessGroup creates a new process group. +// +// An EPERM error will be returned if the ThreadGroup belongs to a different +// Session, is a Session leader or the group already exists. +func (tg *ThreadGroup) CreateProcessGroup() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + + // Get the ID for this thread in the current namespace. + id := tg.pidns.tids[tg.leader] + + // Per above, check for a Session leader or existing group. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new ProcessGroup, belonging to the current Session. + // + // We manually adjust the ancestors if the parent is in the same + // session. + tg.processGroup.session.incRef() + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: tg.processGroup.session, + } + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { + pg.ancestors++ + } + + // Assign the new process group; adjust children. + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + + // Ensure this translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tids[tg.leader] + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// JoinProcessGroup joins an existing process group. +// +// This function will return EACCES if an exec has been performed since fork +// by the given ThreadGroup, and EPERM if the Sessions are not the same or the +// group does not exist. +// +// If checkExec is set, then the join is not permitted after the process has +// executed exec at least once. +func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error { + pidns.owner.mu.Lock() + defer pidns.owner.mu.Unlock() + + // Lookup the ProcessGroup. 
+	pg := pidns.processGroups[pgid]
+	if pg == nil {
+		return syserror.EPERM
+	}
+
+	// Disallow the join if an execve has been performed, per POSIX.
+	if checkExec && tg.execed {
+		return syserror.EACCES
+	}
+
+	// See if it's in the same session as ours.
+	if pg.session != tg.processGroup.session {
+		return syserror.EPERM
+	}
+
+	// Join the group; adjust children.
+	parentPG := tg.parentPG()
+	pg.incRefWithParent(parentPG)
+	tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+		childTG.processGroup.incRefWithParent(pg)
+		childTG.processGroup.decRefWithParent(tg.processGroup)
+	})
+	tg.processGroup.decRefWithParent(parentPG)
+	tg.processGroup = pg
+
+	return nil
+}
+
+// Session returns the ThreadGroup's Session.
+//
+// A reference is not taken on the session.
+func (tg *ThreadGroup) Session() *Session {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	return tg.processGroup.session
+}
+
+// IDOfSession returns the SessionID assigned to s in PID namespace ns.
+//
+// If the session isn't visible in this namespace, zero will be returned. It is
+// the caller's responsibility to check that before using this function.
+func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.sids[s]
+}
+
+// SessionWithID returns the Session with the given ID in the PID namespace ns,
+// or nil if that given ID is not defined in this namespace.
+//
+// A reference is not taken on the session.
+func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.sessions[id]
+}
+
+// ProcessGroup returns the ThreadGroup's ProcessGroup.
+//
+// A reference is not taken on the process group.
+func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	return tg.processGroup
+}
+
+// IDOfProcessGroup returns the ProcessGroupID assigned to pg in PID namespace ns.
+//
+// The same constraints apply as IDOfSession.
+func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.pgids[pg]
+}
+
+// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
+// namespace ns, or nil if that given ID is not defined in this namespace.
+//
+// A reference is not taken on the process group.
+func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.processGroups[id]
+}
diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
new file mode 100644
index 000000000..8edd05cdf
--- /dev/null
+++ b/pkg/sentry/kernel/signal.go
@@ -0,0 +1,69 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
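The ThreadGroup methods above (CreateSession, CreateProcessGroup, JoinProcessGroup) are the building blocks for the setsid(2)/setpgid(2) family. As a rough, hypothetical sketch of how a syscall layer might drive them (the helper names are illustrative and not part of this package; error handling is trimmed, and tg and pidns stand for the calling task's thread group and PID namespace):

// setsid is a hypothetical wrapper sketching the intended call pattern.
func setsid(tg *ThreadGroup, pidns *PIDNamespace) (SessionID, error) {
	// Becoming a session leader also creates a new process group led by tg.
	if err := tg.CreateSession(); err != nil {
		return 0, err // EPERM: already a leader, or the IDs are taken.
	}
	return pidns.IDOfSession(tg.Session()), nil
}

// setpgid is a hypothetical wrapper; the real syscall layer handles more
// cases (e.g. operating on a child's thread group rather than the caller's).
func setpgid(tg *ThreadGroup, pidns *PIDNamespace, pgid ProcessGroupID) error {
	if pgid == 0 {
		// A zero pgid conventionally means "a group named after the caller".
		return tg.CreateProcessGroup()
	}
	// Otherwise join an existing group in the same session; checkExec
	// enforces the POSIX post-exec restriction described above.
	return tg.JoinProcessGroup(pidns, pgid, true /* checkExec */)
}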
+ +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +// SignalPanic is used to panic the running threads. It is a signal which +// cannot be used by the application: it must be caught and ignored by the +// runtime (in order to catch possible races). +const SignalPanic = linux.SIGUSR2 + +// sendExternalSignal is called when an asynchronous signal is sent to the +// sentry ("in sentry context"). On some platforms, it may also be called when +// an asynchronous signal is sent to sandboxed application threads ("in +// application context"). +// +// context is used only for debugging to differentiate these cases. +// +// Returns false if signal could not be sent because the Kernel is not fully +// initialized yet. +func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) bool { + switch linux.Signal(info.Signo) { + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + return true + + case SignalPanic: + // SignalPanic is also specially handled in sentry setup to ensure that + // it causes a panic even after tasks exit, but SignalPanic may also + // be sent here if it is received while in app context. + panic("Signal-induced panic") + + default: + log.Infof("Received external signal %d in %s context", info.Signo, context) + if k.globalInit == nil { + log.Warningf("Received external signal %d before init created", info.Signo) + return false + } + k.globalInit.SendSignal(info) + } + + return true +} + +// sigPriv returns a SignalInfo representing a signal sent by the sentry. (The +// name reflects its equivalence to Linux's SEND_SIG_PRIV.) +func sigPriv(sig linux.Signal) *arch.SignalInfo { + return &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoKernel, + } +} diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go new file mode 100644 index 000000000..21ba4ee70 --- /dev/null +++ b/pkg/sentry/kernel/signal_handlers.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// SignalHandlers holds information about signal actions. +type SignalHandlers struct { + // mu protects actions, as well as the signal state of all tasks and thread + // groups using this SignalHandlers object. (See comment on + // ThreadGroup.signalHandlers.) + mu sync.Mutex `state:"nosave"` + + // actions is the action to be taken upon receiving each signal. + actions map[linux.Signal]arch.SignalAct +} + +// NewSignalHandlers returns a new SignalHandlers specifying all default +// actions. 
+func NewSignalHandlers() *SignalHandlers {
+	return &SignalHandlers{
+		actions: make(map[linux.Signal]arch.SignalAct),
+	}
+}
+
+// Fork returns a copy of sh for a new thread group.
+func (sh *SignalHandlers) Fork() *SignalHandlers {
+	sh2 := NewSignalHandlers()
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	for sig, act := range sh.actions {
+		sh2.actions[sig] = act
+	}
+	return sh2
+}
+
+// CopyForExec returns a copy of sh for a thread group that is undergoing an
+// execve. (See comments in Task.finishExec.)
+func (sh *SignalHandlers) CopyForExec() *SignalHandlers {
+	sh2 := NewSignalHandlers()
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	for sig, act := range sh.actions {
+		if act.Handler == arch.SignalActIgnore {
+			sh2.actions[sig] = arch.SignalAct{
+				Handler: arch.SignalActIgnore,
+			}
+		}
+	}
+	return sh2
+}
+
+// dequeueAction returns the SignalAct that should be used to handle sig.
+//
+// Preconditions: sh.mu must be locked.
+func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct {
+	act := sh.actions[sig]
+	if act.IsResetHandler() {
+		delete(sh.actions, sig)
+	}
+	return act
+}
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
new file mode 100644
index 000000000..e20fa3eb6
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls.go
@@ -0,0 +1,305 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi"
+	"gvisor.googlesource.com/gvisor/pkg/bits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// maxSyscallNum is the highest supported syscall number.
+//
+// The types below create fast lookup slices for all syscalls. This maximum
+// serves as a sanity check that we don't allocate huge slices for a very large
+// syscall number.
+const maxSyscallNum = 2000
+
+// SyscallFn is a syscall implementation.
+type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
+
+// MissingFn is a function to be called when a syscall implementation is missing.
+type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
+
+// Possible flags for SyscallFlagsTable.enable.
+const (
+	// syscallPresent indicates that this is not a missing syscall.
+	//
+	// This flag is used internally in SyscallFlagsTable.
+	syscallPresent = 1 << iota
+
+	// StraceEnableLog enables syscall log tracing.
+	StraceEnableLog
+
+	// StraceEnableEvent enables syscall event tracing.
+	StraceEnableEvent
+
+	// ExternalBeforeEnable enables the external hook before syscall execution.
+	ExternalBeforeEnable
+
+	// ExternalAfterEnable enables the external hook after syscall execution.
+	ExternalAfterEnable
+)
+
+// StraceEnableBits combines both strace log and event flags.
+const StraceEnableBits = StraceEnableLog | StraceEnableEvent
+
+// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
+// basis.
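Fork and CopyForExec above encode the usual POSIX inheritance rules: fork(2) copies every disposition verbatim, while execve(2) preserves only ignored dispositions and lets handled signals fall back to their defaults. A small in-package sketch of the resulting behaviour (the function is hypothetical, the signal choices are arbitrary, 0x400000 stands in for some user handler address, and locking is elided because no other goroutine can see sh yet):

// signalDispositionsAcrossForkExec is an illustrative sketch only.
func signalDispositionsAcrossForkExec() {
	sh := NewSignalHandlers()
	sh.actions[linux.SIGUSR1] = arch.SignalAct{Handler: arch.SignalActIgnore}
	sh.actions[linux.SIGTERM] = arch.SignalAct{Handler: 0x400000} // user handler

	forked := sh.Fork()        // fork(2): both dispositions are copied
	execed := sh.CopyForExec() // execve(2): only the ignored SIGUSR1 entry survives
	_, _ = forked, execed
}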
+type SyscallFlagsTable struct { + // mu protects writes to the fields below. + // + // Atomic loads are always allowed. Atomic stores are allowed only + // while mu is held. + mu sync.Mutex + + // enable contains the enable bits for each syscall. + // + // missing syscalls have the same value in enable as missingEnable to + // avoid an extra branch in Word. + enable []uint32 + + // missingEnable contains the enable bits for missing syscalls. + missingEnable uint32 +} + +// Init initializes the struct, with all syscalls in table set to enable. +// +// max is the largest syscall number in table. +func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { + e.enable = make([]uint32, max+1) + for num := range table { + e.enable[num] = syscallPresent + } +} + +// Word returns the enable bitfield for sysno. +func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. +func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + // + // TODO: remove kernel imports from the strace package so + // that the type can be used directly. + SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. Critically, a SyscallTable +// is *immutable*. In order to make supporting suspend and resume sane, they +// must be uniquely registered and may not change during operation. +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS `state:"wait"` + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch `state:"wait"` + + // The OS version that this syscall table implements. 
+	Version Version `state:"manual"`
+
+	// AuditNumber is a numeric constant that represents the syscall table. If
+	// non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
+	// linux/audit.h.
+	AuditNumber uint32 `state:"manual"`
+
+	// Table is the collection of functions.
+	Table map[uintptr]SyscallFn `state:"manual"`
+
+	// lookup is a fixed-size array that holds the syscalls (indexed by
+	// their numbers). It is used for fast lookups.
+	lookup []SyscallFn `state:"manual"`
+
+	// Emulate is a collection of instruction addresses to emulate. The
+	// keys are addresses, and the values are system call numbers.
+	Emulate map[usermem.Addr]uintptr `state:"manual"`
+
+	// The function to call in case of a missing system call.
+	Missing MissingFn `state:"manual"`
+
+	// Stracer traces this syscall table.
+	Stracer Stracer `state:"manual"`
+
+	// External is used to handle an external callback.
+	External func(*Kernel) `state:"manual"`
+
+	// ExternalFilterBefore is called, before the syscall is executed, to
+	// decide whether External should be called. External is not called if
+	// it returns false.
+	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// ExternalFilterAfter is called, after the syscall is executed, to
+	// decide whether External should be called. External is not called if
+	// it returns false.
+	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// FeatureEnable stores the strace and one-shot enable bits.
+	FeatureEnable SyscallFlagsTable `state:"manual"`
+}
+
+// allSyscallTables contains all known tables.
+var allSyscallTables []*SyscallTable
+
+// SyscallTables returns a read-only slice of registered SyscallTables.
+func SyscallTables() []*SyscallTable {
+	return allSyscallTables
+}
+
+// LookupSyscallTable returns the SyscallTable for the OS/Arch combination.
+func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
+	for _, s := range allSyscallTables {
+		if s.OS == os && s.Arch == a {
+			return s, true
+		}
+	}
+	return nil, false
+}
+
+// RegisterSyscallTable registers a new syscall table for use by a Kernel.
+func RegisterSyscallTable(s *SyscallTable) {
+	if s.Table == nil {
+		// Ensure non-nil lookup table.
+		s.Table = make(map[uintptr]SyscallFn)
+	}
+	if s.Emulate == nil {
+		// Ensure non-nil emulate table.
+		s.Emulate = make(map[usermem.Addr]uintptr)
+	}
+
+	var max uintptr
+	for num := range s.Table {
+		if num > max {
+			max = num
+		}
+	}
+
+	if max > maxSyscallNum {
+		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
+	}
+
+	s.lookup = make([]SyscallFn, max+1)
+
+	// Initialize the fast-lookup table.
+	for num, fn := range s.Table {
+		s.lookup[num] = fn
+	}
+
+	s.FeatureEnable.init(s.Table, max)
+
+	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
+		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
+	}
+
+	// Save a reference to this table.
+	//
+	// This is required for a Kernel to find the table and for save/restore
+	// operations below.
+	allSyscallTables = append(allSyscallTables, s)
+}
+
+// Lookup returns the syscall implementation, if one exists.
+func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
+	if sysno < uintptr(len(s.lookup)) {
+		return s.lookup[sysno]
+	}
+
+	return nil
+}
+
+// LookupEmulate looks up an emulation syscall number.
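Because the enable words are read and written with atomic operations, strace-style tracing can be toggled on a registered table at runtime without pausing tasks; callers of Word simply observe either the old or the new bits. A hypothetical sketch of flipping those bits (the helper name is illustrative; syscall numbers 0 and 1 are read(2) and write(2) on amd64):

// enableStraceLogging is a hypothetical helper, sketched here to show the
// intended use of FeatureEnable; it is not part of this file.
func enableStraceLogging(all bool) {
	table, ok := LookupSyscallTable(abi.Linux, arch.AMD64)
	if !ok {
		return
	}
	if all {
		// Log every syscall, including ones missing from the table.
		table.FeatureEnable.EnableAll(StraceEnableLog)
		return
	}
	// Otherwise log only read(2) and write(2); every other syscall,
	// including missing ones, has the log bit cleared.
	table.FeatureEnable.Enable(StraceEnableLog, map[uintptr]bool{0: true, 1: true}, false)
}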
+func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { + sysno, ok := s.Emulate[addr] + return sysno, ok +} + +// mapLookup is similar to Lookup, except that it only uses the syscall table, +// that is, it skips the fast look array. This is available for benchmarking. +func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { + return s.Table[sysno] +} diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go new file mode 100644 index 000000000..826809a70 --- /dev/null +++ b/pkg/sentry/kernel/syscalls_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "fmt" + +// afterLoad is invoked by stateify. +func (s *SyscallTable) afterLoad() { + otherTable, ok := LookupSyscallTable(s.OS, s.Arch) + if !ok { + // Couldn't find a reference? + panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch)) + } + + // Copy the table. + *s = *otherTable +} diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go new file mode 100644 index 000000000..31541749e --- /dev/null +++ b/pkg/sentry/kernel/syslog.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math/rand" + "sync" +) + +// syslog represents a sentry-global kernel log. +// +// Currently, it contains only fun messages for a dmesg easter egg. +type syslog struct { + // mu protects the below. + mu sync.Mutex `state:"nosave"` + + // msg is the syslog message buffer. It is lazily initialized. + msg []byte +} + +// Log returns a copy of the syslog. +func (s *syslog) Log() []byte { + s.mu.Lock() + defer s.mu.Unlock() + + if s.msg != nil { + // Already initialized, just return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o + } + + // Not initialized, create message. 
+ allMessages := []string{ + "Synthesizing system calls...", + "Mounting deweydecimalfs...", + "Moving files to filing cabinet...", + "Digging up root...", + "Constructing home...", + "Segmenting fault lines...", + "Creating bureaucratic processes...", + "Searching for needles in stacks...", + "Preparing for the zombie uprising...", + "Feeding the init monster...", + "Creating cloned children...", + "Daemonizing children...", + "Waiting for children...", + "Gathering forks...", + "Committing treasure map to memory...", + "Reading process obituaries...", + "Searching for socket adapter...", + "Creating process schedule...", + "Generating random numbers by fair dice roll...", + "Rewriting operating system in Javascript...", + "Consulting tar man page...", + "Forking spaghetti code...", + "Checking naughty and nice process list...", + "Checking naughty and nice process list...", // Check it up to twice. + "Granting licence to kill(2)...", // British spelling for British movie. + "Letting the watchdogs out...", + } + + selectMessage := func() string { + i := rand.Intn(len(allMessages)) + m := allMessages[i] + + // Delete the selected message. + allMessages[i] = allMessages[len(allMessages)-1] + allMessages = allMessages[:len(allMessages)-1] + + return m + } + + time := 0.0 + for i := 0; i < 10; i++ { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] %s\n", time, selectMessage()))...) + } + + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] Ready!\n", time))...) + + // Return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o +} diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go new file mode 100644 index 000000000..71ca75555 --- /dev/null +++ b/pkg/sentry/kernel/table_test.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + maxTestSyscall = 1000 +) + +func createSyscallTable() *SyscallTable { + m := make(map[uintptr]SyscallFn) + for i := uintptr(0); i <= maxTestSyscall; i++ { + j := i + m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + } + } + + s := &SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Table: m, + } + + RegisterSyscallTable(s) + return s +} + +func TestTable(t *testing.T) { + table := createSyscallTable() + defer func() { + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} + }() + + // Go through all functions and check that they return the right value. 
+ for i := uintptr(0); i < maxTestSyscall; i++ { + fn := table.Lookup(i) + if fn == nil { + t.Errorf("Syscall %v is set to nil", i) + continue + } + + v, _, _ := fn(nil, arch.SyscallArguments{}) + if v != i { + t.Errorf("Wrong return value for syscall %v: expected %v, got %v", i, i, v) + } + } + + // Check that values outside the range return nil. + for i := uintptr(maxTestSyscall + 1); i < maxTestSyscall+100; i++ { + fn := table.Lookup(i) + if fn != nil { + t.Errorf("Syscall %v is not nil: %v", i, fn) + continue + } + } +} + +func BenchmarkTableLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.Lookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} + +func BenchmarkTableMapLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.mapLookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go new file mode 100644 index 000000000..3d2e035e9 --- /dev/null +++ b/pkg/sentry/kernel/task.go @@ -0,0 +1,606 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + ssync "gvisor.googlesource.com/gvisor/pkg/sync" +) + +// Task represents a thread of execution in the untrusted app. It +// includes registers and any thread-specific state that you would +// normally expect. +// +// Each task is associated with a goroutine, called the task goroutine, that +// executes code (application code, system calls, etc.) on behalf of that task. +// See Task.run (task_run.go). +// +// All fields that are "owned by the task goroutine" can only be mutated by the +// task goroutine while it is running. The task goroutine does not require +// synchronization to read these fields, although it still requires +// synchronization as described for those fields to mutate them. +// +// All fields that are "exclusive to the task goroutine" can only be accessed +// by the task goroutine while it is running. 
The task goroutine does not +// require synchronization to read or write these fields. +type Task struct { + taskNode + + // runState is what the task goroutine is executing if it is not stopped. + // If runState is nil, the task goroutine should exit or has exited. + // runState is exclusive to the task goroutine. + runState taskRunState + + // haveSyscallReturn is true if tc.Arch().Return() represents a value + // returned by a syscall (or set by ptrace after a syscall). + // + // haveSyscallReturn is exclusive to the task goroutine. + haveSyscallReturn bool + + // interruptChan is notified whenever the task goroutine is interrupted + // (usually by a pending signal). interruptChan is effectively a condition + // variable that can be used in select statements. + // + // interruptChan is not saved; because saving interrupts all tasks, + // interruptChan is always notified after restore (see Task.run). + interruptChan chan struct{} `state:"nosave"` + + // gosched contains the current scheduling state of the task goroutine. + // + // gosched is protected by goschedSeq. gosched is owned by the task + // goroutine. + goschedSeq ssync.SeqCount `state:"nosave"` + gosched TaskGoroutineSchedInfo + + // yieldCount is the number of times the task goroutine has called + // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or + // Task.Yield(), voluntarily ceasing execution. + // + // yieldCount is accessed using atomic memory operations. yieldCount is + // owned by the task goroutine. + yieldCount uint64 + + // pendingSignals is the set of pending signals that may be handled only by + // this task. + // + // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu + // (hereafter "the signal mutex"); see comment on + // ThreadGroup.signalHandlers. + pendingSignals pendingSignals + + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that + // should be applied after the task has either delivered one signal to a + // user handler or is about to resume execution in the untrusted + // application. + // + // Both haveSavedSignalMask and savedSignalMask are exclusive to the task + // goroutine. + haveSavedSignalMask bool + savedSignalMask linux.SignalSet + + // signalStack is the alternate signal stack used by signal handlers for + // which the SA_ONSTACK flag is set. + // + // signalStack is exclusive to the task goroutine. + signalStack arch.SignalStack + + // If groupStopRequired is true, the task should enter a group stop in the + // interrupt path. groupStopRequired is not redundant with + // tg.groupStopPhase != groupStopNone, because ptrace allows tracers to + // resume individual tasks from a group stop without ending the group stop + // as a whole. + // + // groupStopRequired is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux only uses that flag for ptraced tasks. + // + // groupStopRequired is protected by the signal mutex. + groupStopRequired bool + + // If groupStopAcknowledged is true, the task has already acknowledged that + // it is entering the most recent group stop that has been initiated on its + // thread group. groupStopAcknowledged is only meaningful if + // tg.groupStopPhase == groupStopInitiated. + // + // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. + // + // groupStopAcknowledged is protected by the signal mutex. + groupStopAcknowledged bool + + // If stop is not nil, it is the internally-initiated condition that + // currently prevents the task goroutine from running. 
+ // + // stop is protected by the signal mutex. + stop TaskStop + + // stopCount is the number of active external stops (calls to + // Task.BeginExternalStop that have not been paired with a call to + // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is + // non-zero if the task goroutine should stop. + // + // Mutating stopCount requires both locking the signal mutex and using + // atomic memory operations. Reading stopCount requires either locking the + // signal mutex or using atomic memory operations. This allows Task.doStop + // to require only a single atomic read in the common case where stopCount + // is 0. + // + // stopCount is not saved, because external stops cannot be retained across + // a save/restore cycle. (Suppose a sentryctl command issues an external + // stop; after a save/restore cycle, the restored sentry has no knowledge + // of the pre-save sentryctl command, and the stopped task would remain + // stopped forever.) + stopCount int32 `state:"nosave"` + + // endStopCond is signaled when stopCount transitions to 0. The combination + // of stopCount and endStopCond effectively form a sync.WaitGroup, but + // WaitGroup provides no way to read its counter value. + // + // Invariant: endStopCond.L is the signal mutex. (This is not racy because + // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine + // calls sync.Cond.Wait; and only the task goroutine can change the + // identity of the signal mutex, in Task.finishExec.) + endStopCond sync.Cond `state:"nosave"` + + // exitStatus is the task's exit status. + // + // exitStatus is protected by the signal mutex. + exitStatus ExitStatus + + // syscallRestartBlock represents a custom restart function to run in + // restart_syscall(2) to resume an interrupted syscall. + // + // syscallRestartBlock is exclusive to the task goroutine. + syscallRestartBlock SyscallRestartBlock + + // mu protects some of the following fields. + mu sync.Mutex `state:"nosave"` + + // tc and tr form the majority of the task's data. + // + // tc and tr are protected by mu. tc and tr are owned by the task + // goroutine. tr.signalMask is protected by the signal mutex and must be + // written using atomic memory operations (such that reading tr.signalMask + // is safe if the signal mutex is locked or if atomic memory operations are + // used), but is also owned by the task goroutine. + tc TaskContext + tr TaskResources + + // p provides the mechanism by which the task runs code in userspace. The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + + // If vforkParent is not nil, it is the task that created this task with + // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when + // this TaskContext is released. + // + // vforkParent is protected by the TaskSet mutex. + vforkParent *Task + + // exitState is the task's progress through the exit path. + // + // exitState is protected by the TaskSet mutex. exitState is owned by the + // task goroutine. + exitState TaskExitState + + // exitTracerNotified is true if the exit path has either signaled the + // task's tracer to indicate the exit, or determined that no such signal is + // needed. exitTracerNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitTracerNotified is protected by the TaskSet mutex. 
+ exitTracerNotified bool + + // exitTracerAcked is true if exitTracerNotified is true and either the + // task's tracer has acknowledged the exit notification, or the exit path + // has determined that no such notification is needed. + // + // exitTracerAcked is protected by the TaskSet mutex. + exitTracerAcked bool + + // exitParentNotified is true if the exit path has either signaled the + // task's parent to indicate the exit, or determined that no such signal is + // needed. exitParentNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitParentNotified is protected by the TaskSet mutex. + exitParentNotified bool + + // exitParentAcked is true if exitParentNotified is true and either the + // task's parent has acknowledged the exit notification, or the exit path + // has determined that no such acknowledgment is needed. + // + // exitParentAcked is protected by the TaskSet mutex. + exitParentAcked bool + + // goroutineStopped is a WaitGroup whose counter value is 1 when the task + // goroutine is running and 0 when the task goroutine is stopped or has + // exited. + goroutineStopped sync.WaitGroup `state:"nosave"` + + // ptraceTracer is the task that is ptrace-attached to this one. If + // ptraceTracer is nil, this task is not being traced. Note that due to + // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil + // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). + // + // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic + // operations. This allows paths that wouldn't otherwise lock the TaskSet + // mutex, notably the syscall path, to check if ptraceTracer is nil without + // additional synchronization. + ptraceTracer atomic.Value `state:".(*Task)"` + + // ptraceTracees is the set of tasks that this task is ptrace-attached to. + // + // ptraceTracees is protected by the TaskSet mutex. + ptraceTracees map[*Task]struct{} + + // ptraceOpts contains ptrace options explicitly set by the tracer. If + // ptraceTracer is nil, ptraceOpts is expected to be the zero value. + // + // ptraceOpts is protected by the TaskSet mutex. + ptraceOpts ptraceOptions + + // ptraceSyscallMode controls ptrace behavior around syscall entry and + // exit. + // + // ptraceSyscallMode is protected by the TaskSet mutex. + ptraceSyscallMode ptraceSyscallMode + + // If ptraceSinglestep is true, the next time the task executes application + // code, single-stepping should be enabled. ptraceSinglestep is stored + // independently of the architecture-specific trap flag because tracer + // detaching (which can happen concurrently with the tracee's execution if + // the tracer exits) must disable single-stepping, and the task's + // architectural state is implicitly exclusive to the task goroutine (no + // synchronization occurs before passing registers to SwitchToApp). + // + // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. + // + // ptraceSinglestep is protected by the TaskSet mutex. + ptraceSinglestep bool + + // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the + // time that t entered the ptrace stop, reset to 0 when the tracer + // acknowledges the stop with a wait*() syscall. Otherwise, it is the + // signal number passed to the ptrace operation that ended the last ptrace + // stop on this task. 
In the latter case, the effect of ptraceCode depends + // on the nature of the ptrace stop; signal-delivery-stop uses it to + // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the + // signal to the task after leaving the stop, and PTRACE_EVENT stops and + // traced group stops ignore it entirely. + // + // Linux contextually stores the equivalent of ptraceCode in + // task_struct::exit_code. + // + // ptraceCode is protected by the TaskSet mutex. + ptraceCode int32 + + // ptraceSiginfo is the value returned to the tracer by + // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). + // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) + // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is + // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which + // is in turn required to distinguish group stops from other ptrace stops, + // per subsection "Group-stop" in ptrace(2)). + // + // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. + // + // ptraceSiginfo is protected by the TaskSet mutex. + ptraceSiginfo *arch.SignalInfo + + // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to + // the tracer by ptrace(PTRACE_GETEVENTMSG). + // + // ptraceEventMsg is protected by the TaskSet mutex. + ptraceEventMsg uint64 + + // The struct that holds the IO-related usage. The ioUsage pointer is + // immutable. + ioUsage *usage.IO + + // logPrefix is a string containing the task's thread ID in the root PID + // namespace, and is prepended to log messages emitted by Task.Infof etc. + logPrefix atomic.Value `state:".(string)"` + + // creds is the task's credentials. + // + // creds is protected by mu. + creds *auth.Credentials + + // utsns is the task's UTS namespace. + // + // utsns is protected by mu. + utsns *UTSNamespace + + // ipcns is the task's IPC namespace. + // + // ipcns is protected by mu. + ipcns *IPCNamespace + + // parentDeathSignal is sent to this task's thread group when its parent exits. + // + // parentDeathSignal is protected by mu. + parentDeathSignal linux.Signal + + // syscallFilters is all seccomp-bpf syscall filters applicable to the + // task, in the order in which they were installed. + // + // syscallFilters is protected by mu. syscallFilters is owned by the task + // goroutine. + syscallFilters []bpf.Program + + // If cleartid is non-zero, treat it as a pointer to a ThreadID in the + // task's virtual address space; when the task exits, set the pointed-to + // ThreadID to 0, and wake any futex waiters. + // + // cleartid is exclusive to the task goroutine. + cleartid usermem.Addr + + // This is mostly a fake cpumask just for sched_set/getaffinity as we + // don't really control the affinity. + // + // Invariant: allowedCPUMask.Size() == + // sched.CPUMaskSize(Kernel.applicationCores). + // + // allowedCPUMask is protected by mu. + allowedCPUMask sched.CPUSet + + // cpu is the fake cpu number returned by getcpu(2). cpu is ignored + // entirely if Kernel.useHostCores is true. + // + // cpu is accessed using atomic memory operations. + cpu int32 + + // This is used to keep track of changes made to a process' priority/niceness. + // It is mostly used to provide some reasonable return value from + // getpriority(2) after a call to setpriority(2) has been made. + // We currently do not actually modify a process' scheduling priority. + // NOTE: This represents the userspace view of priority (nice). + // This means that the value should be in the range [-20, 19]. 
+ // + // niceness is protected by mu. + niceness int + + // This is used to track the numa policy for the current thread. This can be + // modified through a set_mempolicy(2) syscall. Since we always report a + // single numa node, all policies are no-ops. We only track this information + // so that we can return reasonable values if the application calls + // get_mempolicy(2) after setting a non-default policy. Note that in the + // real syscall, nodemask can be longer than 4 bytes, but we always report a + // single node so never need to save more than a single bit. + // + // numaPolicy and numaNodeMask are protected by mu. + numaPolicy int32 + numaNodeMask uint32 + + // If netns is true, the task is in a non-root network namespace. Network + // namespaces aren't currently implemented in full; being in a network + // namespace simply prevents the task from observing any network devices + // (including loopback) or using abstract socket addresses (see unix(7)). + // + // netns is protected by mu. netns is owned by the task goroutine. + netns bool + + // If rseqPreempted is true, before the next call to p.Switch(), interrupt + // RSEQ critical regions as defined by tg.rseq and write the task + // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number + // written to rseqCPUAddr. + // + // If rseqCPUAddr is 0, rseqCPU is -1. + // + // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task + // goroutine. + rseqPreempted bool `state:"nosave"` + rseqCPUAddr usermem.Addr + rseqCPU int32 + + // copyScratchBuffer is a buffer available to CopyIn/CopyOut + // implementations that require an intermediate buffer to copy data + // into/out of. It prevents these buffers from being allocated/zeroed in + // each syscall and eventually garbage collected. + // + // copyScratchBuffer is exclusive to the task goroutine. + copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` + + // blockingTimer is used for blocking timeouts. blockingTimerChan is the + // channel that is sent to when blockingTimer fires. + // + // blockingTimer is exclusive to the task goroutine. + blockingTimer *ktime.Timer `state:"nosave"` + blockingTimerChan <-chan struct{} `state:"nosave"` + + // futexWaiter is used for futex(FUTEX_WAIT) syscalls. + // + // futexWaiter is exclusive to the task goroutine. + futexWaiter *futex.Waiter `state:"nosave"` + + // startTime is the real time at which the task started. It is set when + // a Task is created or invokes execve(2). + // + // startTime is protected by mu. + startTime ktime.Time +} + +func (t *Task) savePtraceTracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +func (t *Task) loadPtraceTracer(tracer *Task) { + t.ptraceTracer.Store(tracer) +} + +func (t *Task) saveLogPrefix() string { + return t.logPrefix.Load().(string) +} + +func (t *Task) loadLogPrefix(prefix string) { + t.logPrefix.Store(prefix) +} + +// afterLoad is invoked by stateify. +func (t *Task) afterLoad() { + t.interruptChan = make(chan struct{}, 1) + t.gosched.State = TaskGoroutineNonexistent + if t.stop != nil { + t.stopCount = 1 + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.p = t.k.Platform.NewContext() + t.rseqPreempted = true + t.futexWaiter = futex.NewWaiter() +} + +// copyScratchBufferLen is the length of the copyScratchBuffer field of the Task +// struct. +const copyScratchBufferLen = 52 + +// TaskMaybe is the interface for extracting Tasks out of things which may be +// or contain Task objects. +type TaskMaybe interface { + // ExtractTask returns the Task. 
+ ExtractTask() *Task +} + +// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut +// functions. It must only be used within those functions and can only be used +// by the task goroutine; it exists to improve performance and thus +// intentionally lacks any synchronization. +// +// Callers should pass a constant value as an argument, which will allow the +// compiler to inline and optimize out the if statement below. +func (t *Task) CopyScratchBuffer(size int) []byte { + if size > copyScratchBufferLen { + return make([]byte, size) + } + return t.copyScratchBuffer[:size] +} + +// FutexWaiter returns the Task's futex.Waiter. +func (t *Task) FutexWaiter() *futex.Waiter { + return t.futexWaiter +} + +// ExtractTask implements TaskMaybe.ExtractTask. +func (t *Task) ExtractTask() *Task { + return t +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// TaskResources returns t's TaskResources. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskResources() *TaskResources { + return &t.tr +} + +// WithMuLocked executes f with t.mu locked. +func (t *Task) WithMuLocked(f func(*Task)) { + t.mu.Lock() + defer t.mu.Unlock() + f(t) +} + +// Kernel returns the Kernel containing t. +func (t *Task) Kernel() *Kernel { + return t.k +} + +// Value implements context.Context.Value. +func (t *Task) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + return t.utsns + case CtxIPCNamespace: + return t.ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds + case fs.CtxRoot: + return t.FSContext().RootDirectory() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + default: + return nil + } +} + +// SetClearTID sets t's cleartid. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetClearTID(addr usermem.Addr) { + t.cleartid = addr +} + +// SetSyscallRestartBlock sets the restart block for use in +// restart_syscall(2). After registering a restart block, a syscall should +// return ERESTART_RESTARTBLOCK to request a restart using the block. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { + t.syscallRestartBlock = r +} + +// SyscallRestartBlock returns the currently registered restart block for use in +// restart_syscall(2). This function is *not* idempotent and may be called once +// per syscall. This function must not be called if a restart block has not been +// registered for the current syscall. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SyscallRestartBlock() SyscallRestartBlock { + r := t.syscallRestartBlock + // Explicitly set the restart block to nil so that a future syscall can't + // accidentally reuse it. 
+	t.syscallRestartBlock = nil
+	return r
+}
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
new file mode 100644
index 000000000..ce12cdb64
--- /dev/null
+++ b/pkg/sentry/kernel/task_acct.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Accounting, limits, timers.
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// IOUsage returns the I/O usage of the thread.
+func (t *Task) IOUsage() *usage.IO {
+	return t.ioUsage
+}
+
+// IOUsage returns the total I/O usage of all dead and live threads in the group.
+func (tg *ThreadGroup) IOUsage() *usage.IO {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+
+	io := *tg.ioUsage
+	// Account for active tasks.
+	for t := tg.tasks.Front(); t != nil; t = t.Next() {
+		io.Accumulate(t.IOUsage())
+	}
+	return &io
+}
+
+// Name returns t's name.
+func (t *Task) Name() string {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.tc.Name
+}
+
+// SetName changes t's name.
+func (t *Task) SetName(name string) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.tc.Name = name
+	t.Debugf("Set thread name to %q", name)
+}
+
+// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft
+// limits on CPU time used by this process.
+func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) {
+	tg.Timer().applyCPULimits(*l)
+}
+
+// Limits implements context.Context.Limits.
+func (t *Task) Limits() *limits.LimitSet {
+	return t.ThreadGroup().Limits()
+}
+
+// StartTime returns t's start time.
+func (t *Task) StartTime() ktime.Time {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.startTime
+}
+
+// MaxRSS returns the maximum resident set size of the task in bytes, for the
+// scope selected by which, which should be one of RUSAGE_SELF,
+// RUSAGE_CHILDREN, RUSAGE_THREAD, or RUSAGE_BOTH. See getrusage(2) for
+// documentation on the behavior of these flags.
+func (t *Task) MaxRSS(which int32) uint64 {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+
+	switch which {
+	case linux.RUSAGE_SELF, linux.RUSAGE_THREAD:
+		// If there's an active mm we can use its value.
+		if mm := t.MemoryManager(); mm != nil {
+			if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS {
+				return mmMaxRSS
+			}
+		}
+		return t.tg.maxRSS
+	case linux.RUSAGE_CHILDREN:
+		return t.tg.childMaxRSS
+	case linux.RUSAGE_BOTH:
+		maxRSS := t.tg.maxRSS
+		if maxRSS < t.tg.childMaxRSS {
+			maxRSS = t.tg.childMaxRSS
+		}
+		if mm := t.MemoryManager(); mm != nil {
+			if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS {
+				return mmMaxRSS
+			}
+		}
+		return maxRSS
+	default:
+		// We'll only get here if which is invalid.
+ return 0 + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go new file mode 100644 index 000000000..9fd24f134 --- /dev/null +++ b/pkg/sentry/kernel/task_block.go @@ -0,0 +1,207 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// BlockWithTimeout blocks t until an event is received from C, the application +// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), +// or t is interrupted. It returns: +// +// - The remaining timeout, which is guaranteed to be 0 if the timeout expired, +// and is unspecified if haveTimeout is false. +// +// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout +// expired, and syserror.ErrInterrupted if t is interrupted. +func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { + if !haveTimeout { + return timeout, t.block(C, nil) + } + + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(timeout) + err := t.BlockWithDeadline(C, true, deadline) + + // Timeout, explicitly return a remaining duration of 0. + if err == syserror.ETIMEDOUT { + return 0, err + } + + // Compute the remaining timeout. Note that even if block() above didn't + // return due to a timeout, we may have used up any of the remaining time + // since then. We cap the remaining timeout to 0 to make it easier to + // directly use the returned duration. + end := t.Kernel().MonotonicClock().Now() + remainingTimeout := timeout - end.Sub(start) + if remainingTimeout < 0 { + remainingTimeout = 0 + } + + return remainingTimeout, err +} + +// BlockWithDeadline blocks t until an event is received from C, the +// application monotonic clock indicates a time of deadline (only if +// haveDeadline is true), or t is interrupted. It returns nil if an event is +// received from C, ETIMEDOUT if the deadline expired, and +// syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { + if !haveDeadline { + return t.block(C, nil) + } + + // Start the timeout timer. + t.blockingTimer.Swap(ktime.Setting{ + Enabled: true, + Next: deadline, + }) + + err := t.block(C, t.blockingTimerChan) + + // Stop the timeout timer and drain the channel. + t.blockingTimer.Swap(ktime.Setting{}) + select { + case <-t.blockingTimerChan: + default: + } + + return err +} + +// BlockWithTimer blocks t until an event is received from C or tchan, or t is +// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an +// event is received from tchan, and syserror.ErrInterrupted if t is +// interrupted. +// +// Most clients should use BlockWithDeadline or BlockWithTimeout instead. 
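// Editor's illustrative sketch (not part of the original change): a typical
// caller of these blocking helpers might look roughly like
//
//	ch := make(chan struct{}, 1) // signalled when the awaited event occurs
//	if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
//		return err // syserror.ETIMEDOUT or syserror.ErrInterrupted
//	}
//
// where ch, haveDeadline, and deadline are the caller's own state.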
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error {
+	return t.block(C, tchan)
+}
+
+// Block blocks t until an event is received from C or t is interrupted. It
+// returns nil if an event is received from C and syserror.ErrInterrupted if t
+// is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Block(C chan struct{}) error {
+	return t.block(C, nil)
+}
+
+// block blocks a task on one of many events.
+// N.B. defer is too expensive to be used here.
+func (t *Task) block(C chan struct{}, timerChan <-chan struct{}) error {
+	// Fast path if the request is already done.
+	select {
+	case <-C:
+		return nil
+	default:
+	}
+
+	// Deactivate our address space; we don't need it.
+	interrupt := t.SleepStart()
+
+	select {
+	case <-C:
+		t.SleepFinish(true)
+		return nil
+
+	case <-interrupt:
+		t.SleepFinish(false)
+		// Return the indicated error on interrupt.
+		return syserror.ErrInterrupted
+
+	case <-timerChan:
+		// We've timed out.
+		t.SleepFinish(true)
+		return syserror.ETIMEDOUT
+	}
+}
+
+// SleepStart implements amutex.Sleeper.SleepStart.
+func (t *Task) SleepStart() <-chan struct{} {
+	t.Deactivate()
+	t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
+	return t.interruptChan
+}
+
+// SleepFinish implements amutex.Sleeper.SleepFinish.
+func (t *Task) SleepFinish(success bool) {
+	if !success {
+		// The interrupted notification is consumed only at the top-level
+		// (Run). Therefore we attempt to reset the pending notification.
+		// This will also elide our next entry back into the task, so we
+		// will process signals, state changes, etc.
+		t.interruptSelf()
+	}
+	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
+	t.Activate()
+}
+
+// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
+func (t *Task) UninterruptibleSleepStart(deactivate bool) {
+	if deactivate {
+		t.Deactivate()
+	}
+	t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible)
+}
+
+// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish.
+func (t *Task) UninterruptibleSleepFinish(activate bool) {
+	t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible)
+	if activate {
+		t.Activate()
+	}
+}
+
+// interrupted returns true if interrupt or interruptSelf has been called at
+// least once since the last call to interrupted.
+func (t *Task) interrupted() bool {
+	select {
+	case <-t.interruptChan:
+		return true
+	default:
+		return false
+	}
+}
+
+// interrupt unblocks the task and interrupts it if it's currently running in
+// userspace.
+func (t *Task) interrupt() {
+	t.interruptSelf()
+	t.p.Interrupt()
+}
+
+// interruptSelf is like interrupt, but can only be called by the task
+// goroutine.
+func (t *Task) interruptSelf() {
+	select {
+	case t.interruptChan <- struct{}{}:
+		t.Debugf("Interrupt queued")
+	default:
+		t.Debugf("Dropping duplicate interrupt")
+	}
+	// platform.Context.Interrupt() is unnecessary since a task goroutine
+	// calling interruptSelf() cannot also be blocked in
+	// platform.Context.Switch().
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
new file mode 100644
index 000000000..3a74abdfb
--- /dev/null
+++ b/pkg/sentry/kernel/task_clone.go
@@ -0,0 +1,475 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. + NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. (Note that network namespaces are not really + // implemented; see comment on Task.netns for details.) + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. + NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) 
+ ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. + Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. + if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. + if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + var userns *auth.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. 
+ if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapability(linux.CAP_SYS_ADMIN) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace() + } + + tc, err := t.tc.Fork(t, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. + tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS) + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + tg := t.tg + parent := t.parent + if opts.NewThreadGroup { + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + parent = t + } + cfg := &TaskConfig{ + Kernel: t.k, + Parent: parent, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext), + Niceness: t.Niceness(), + Credentials: creds.Fork(), + NetworkNamespaced: t.netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + } + if opts.NewNetworkNamespace { + cfg.NetworkNamespaced = true + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != nil { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. + tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. 
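		// Editor's note (not part of the original change): the write below
		// targets the child's memory, whose address space is not the one
		// active on this goroutine, so it goes through usermem.CopyObjectOut
		// against nt.MemoryManager() with default IOOpts rather than the
		// Task copy helpers.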
+ usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. +func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. 
This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + // This temporary is needed because Go. + creds := t.Credentials() + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + defer t.mu.Unlock() + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.netns = true + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. + t.utsns = t.utsns.Clone(t.creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace() + } + if opts.NewFiles { + oldFDMap := t.tr.FDMap + t.tr.FDMap = oldFDMap.Fork() + oldFDMap.DecRef() + } + if opts.NewFSContext { + oldFS := t.tr.FSContext + t.tr.FSContext = oldFS.Fork() + oldFS.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go new file mode 100644 index 000000000..5c563ba08 --- /dev/null +++ b/pkg/sentry/kernel/task_context.go @@ -0,0 +1,179 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
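// Editor's illustrative sketch for the CloneOptions and SharingOptions types
// defined in task_clone.go above (not part of the original change): a clone(2)
// front end might translate raw flags roughly as follows, where flags, stack,
// and tls stand in for the caller's syscall arguments:
//
//	opts := kernel.CloneOptions{
//		SharingOptions: kernel.SharingOptions{
//			NewAddressSpace:   flags&linux.CLONE_VM == 0,
//			NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0,
//			NewThreadGroup:    flags&linux.CLONE_THREAD == 0,
//			NewFiles:          flags&linux.CLONE_FILES == 0,
//		},
//		Stack:  usermem.Addr(stack),
//		SetTLS: flags&linux.CLONE_SETTLS != 0,
//		TLS:    usermem.Addr(tls),
//		Vfork:  flags&linux.CLONE_VFORK != 0,
//	}
//	ntid, ctrl, err := t.Clone(&opts)
//
// The full flag-to-option mapping belongs to the syscall layer and is not part
// of this file.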
+ +package kernel + +import ( + "errors" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ErrNoSyscalls is returned if there is no syscall table. +var ErrNoSyscalls = errors.New("no syscall table found") + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskContext is the subset of a task's data that is provided by the loader. +type TaskContext struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable +} + +// release releases all resources held by the TaskContext. release is called by +// the task when it execs into a new TaskContext or exits. +func (tc *TaskContext) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if tc.MemoryManager != nil { + // TODO + tc.MemoryManager.DecUsers(context.Background()) + tc.MemoryManager = nil + } + tc.fu = nil +} + +// Fork returns a duplicate of tc. The copied TaskContext always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskContext shares an address space with the original; otherwise, the copied +// TaskContext has an independent address space that is initially a duplicate +// of the original's. +func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskContext, error) { + newTC := &TaskContext{ + Arch: tc.Arch.Fork(), + st: tc.st, + } + if shareAddressSpace { + newTC.MemoryManager = tc.MemoryManager + if newTC.MemoryManager != nil { + if !newTC.MemoryManager.IncUsers() { + // Shouldn't be possible since tc.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) + } + } + newTC.fu = tc.fu + } else { + newMM, err := tc.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newTC.MemoryManager = newMM + // TODO: revisit when shmem is supported. + newTC.fu = futex.NewManager() + } + return newTC, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.tc.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.tc.MemoryManager +} + +// Futex returns t's futex manager. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Futex() *futex.Manager { + return t.tc.fu +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. 
+func (t *Task) SyscallTable() *SyscallTable { + return t.tc.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Stack() *arch.Stack { + return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} +} + +// LoadTaskImage loads filename into a new TaskContext. +// +// It takes several arguments: +// * mounts: MountNamespace to lookup filename in +// * root: Root to lookup filename under +// * wd: Working directory to lookup filename under +// * maxTraversals: maximum number of symlinks to follow +// * filename: path to binary to load +// * argv: Binary argv +// * envv: Binary envv +// * fs: Binary FeatureSet +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k) + defer m.DecUsers(ctx) + + os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. Yikes. + return nil, ErrNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskContext{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: futex.NewManager(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go new file mode 100644 index 000000000..2285847a2 --- /dev/null +++ b/pkg/sentry/kernel/task_exec.go @@ -0,0 +1,240 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the machinery behind the execve() syscall. In brief, a +// thread executes an execve() by killing all other threads in its thread +// group, assuming the leader's identity, and then switching process images. +// +// This design is effectively mandated by Linux. From ptrace(2): +// +// """ +// execve(2) under ptrace +// When one thread in a multithreaded process calls execve(2), the +// kernel destroys all other threads in the process, and resets the +// thread ID of the execing thread to the thread group ID (process ID). +// (Or, to put things another way, when a multithreaded process does an +// execve(2), at completion of the call, it appears as though the +// execve(2) occurred in the thread group leader, regardless of which +// thread did the execve(2).) This resetting of the thread ID looks +// very confusing to tracers: +// +// * All other threads stop in PTRACE_EVENT_EXIT stop, if the +// PTRACE_O_TRACEEXIT option was turned on. Then all other threads +// except the thread group leader report death as if they exited via +// _exit(2) with exit code 0. 
+// +// * The execing tracee changes its thread ID while it is in the +// execve(2). (Remember, under ptrace, the "pid" returned from +// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.) +// That is, the tracee's thread ID is reset to be the same as its +// process ID, which is the same as the thread group leader's thread +// ID. +// +// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC +// option was turned on. +// +// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop +// by this time, it appears to the tracer that the dead thread leader +// "reappears from nowhere". (Note: the thread group leader does not +// report death via WIFEXITED(status) until there is at least one +// other live thread. This eliminates the possibility that the +// tracer will see it dying and then reappearing.) If the thread +// group leader was still alive, for the tracer this may look as if +// thread group leader returns from a different system call than it +// entered, or even "returned from a system call even though it was +// not in any system call". If the thread group leader was not +// traced (or was traced by a different tracer), then during +// execve(2) it will appear as if it has become a tracee of the +// tracer of the execing tracee. +// +// All of the above effects are the artifacts of the thread ID change in +// the tracee. +// """ + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// execStop is a TaskStop that a task sets on itself when it wants to execve +// and is waiting for the other tasks in its thread group to exit first. +type execStop struct{} + +// Killable implements TaskStop.Killable. +func (*execStop) Killable() bool { return true } + +// Execve implements the execve(2) syscall by killing all other tasks in its +// thread group and switching to newTC. Execve always takes ownership of newTC. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + if t.tg.exiting || t.tg.execing != nil { + // We lost to a racing group-exit, kill, or exec from another thread + // and should just exit. + newTC.release() + return nil, syserror.EINTR + } + + // Cancel any racing group stops. + t.tg.endGroupStopLocked(false) + + // If the task has any siblings, they have to exit before the exec can + // continue. + t.tg.execing = t + if t.tg.tasks.Front() != t.tg.tasks.Back() { + // "[All] other threads except the thread group leader report death as + // if they exited via _exit(2) with exit code 0." - ptrace(2) + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if t != sibling { + sibling.killLocked() + } + } + // The last sibling to exit will wake t. + t.beginInternalStopLocked((*execStop)(nil)) + } + + return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil +} + +// The runSyscallAfterExecStop state continues execve(2) after all siblings of +// a thread in the execve syscall have exited. 
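// Editor's note (not part of the original change): run states like the one
// below form a small state machine driven by the task goroutine; each state's
// execute method does some work and returns the next state, or nil when the
// goroutine should stop. A minimal state, for illustration, assuming the
// taskRunState interface used throughout this file:
//
//	type runExample struct{}
//
//	func (*runExample) execute(t *Task) taskRunState {
//		// Perform work on the task goroutine here.
//		return (*runSyscallExit)(nil) // chain into syscall exit handling
//	}
//
// runSyscallAfterExecStop follows the same pattern.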
+type runSyscallAfterExecStop struct { + tc *TaskContext +} + +func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + t.tg.execing = nil + if t.killed() { + t.tg.pidns.owner.mu.Unlock() + r.tc.release() + return (*runInterrupt)(nil) + } + // We are the thread group leader now. Save our old thread ID for + // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this + // point it will get a PID of 0, but this is consistent with Linux. + oldTID := ThreadID(0) + if tracer := t.Tracer(); tracer != nil { + oldTID = tracer.tg.pidns.tids[t] + } + t.promoteLocked() + // "During an execve(2), the dispositions of handled signals are reset to + // the default; the dispositions of ignored signals are left unchanged. ... + // [The] signal mask is preserved across execve(2). ... [The] pending + // signal set is preserved across an execve(2)." - signal(7) + // + // Details: + // + // - If the thread group is sharing its signal handlers with another thread + // group via CLONE_SIGHAND, execve forces the signal handlers to be copied + // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal + // handlers, so we always make a copy. + // + // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, + // restorer (if present), and mask are always reset. (See Linux's + // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) + t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() + t.endStopCond.L = &t.tg.signalHandlers.mu + // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) + t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable} + // "The termination signal is reset to SIGCHLD (see clone(2))." + t.tg.terminationSignal = linux.SIGCHLD + // execed indicates that the process can no longer join a process group + // in some scenarios (namely, the parent call setpgid(2) on the child). + // See the JoinProcessGroup function in sessions.go for more context. + t.tg.execed = true + // Maximum RSS is preserved across execve(2). + t.updateRSSLocked() + // Restartable sequence state is discarded. + t.rseqPreempted = false + t.rseqCPUAddr = 0 + t.rseqCPU = -1 + t.tg.rscr.Store(&RSEQCriticalRegion{}) + t.tg.pidns.owner.mu.Unlock() + + // Remove FDs with the CloseOnExec flag set. + t.FDMap().RemoveIf(func(file *fs.File, flags FDFlags) bool { + return flags.CloseOnExec + }) + + // Switch to the new process. + t.MemoryManager().Deactivate() + t.mu.Lock() + // Update credentials to reflect the execve. This should precede switching + // MMs to ensure that dumpability has been reset first, if needed. + t.updateCredsForExecLocked() + t.tc.release() + t.tc = *r.tc + t.mu.Unlock() + t.unstopVforkParent() + // NOTE: All locks must be dropped prior to calling Activate. + t.MemoryManager().Activate() + + t.ptraceExec(oldTID) + return (*runSyscallExit)(nil) +} + +// promoteLocked makes t the leader of its thread group. If t is already the +// thread group leader, promoteLocked is a no-op. +// +// Preconditions: All other tasks in t's thread group, including the existing +// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must +// be locked for writing. +func (t *Task) promoteLocked() { + oldLeader := t.tg.leader + if t == oldLeader { + return + } + // Swap the leader's TIDs with the execing task's. The latter will be + // released when the old leader is reaped below. 
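	// Editor's note (not part of the original change): concretely, if the old
	// leader had TID 100 and the execing task had TID 103 in a given
	// namespace, then after the swap below the execing task becomes TID 100,
	// the process ID, while the old leader holds TID 103 until it is reaped.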
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent { + oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] + ns.tids[oldLeader] = oldTID + ns.tids[t] = leaderTID + ns.tasks[oldTID] = oldLeader + ns.tasks[leaderTID] = t + } + + // Inherit the old leader's start time. + oldStartTime := oldLeader.StartTime() + t.mu.Lock() + t.startTime = oldStartTime + t.mu.Unlock() + + t.tg.leader = t + t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) + t.updateLogPrefixLocked() + // Reap the original leader. If it has a tracer, detach it instead of + // waiting for it to acknowledge the original leader's death. + oldLeader.exitParentNotified = true + oldLeader.exitParentAcked = true + if tracer := oldLeader.Tracer(); tracer != nil { + delete(tracer.ptraceTracees, oldLeader) + oldLeader.forgetTracerLocked() + // Notify the tracer that it will no longer be receiving these events + // from the tracee. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) + } + oldLeader.exitNotifyLocked(false) +} diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go new file mode 100644 index 000000000..3d49ae350 --- /dev/null +++ b/pkg/sentry/kernel/task_exit.go @@ -0,0 +1,1139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the task exit cycle: +// +// - Tasks are asynchronously requested to exit with Task.Kill. +// +// - When able, the task goroutine enters the exit path starting from state +// runExit. +// +// - Other tasks observe completed exits with Task.Wait (which implements the +// wait*() family of syscalls). + +import ( + "errors" + "fmt" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// An ExitStatus is a value communicated from an exiting task or thread group +// to the party that reaps it. +type ExitStatus struct { + // Code is the numeric value passed to the call to exit or exit_group that + // caused the exit. If the exit was not caused by such a call, Code is 0. + Code int + + // Signo is the signal that caused the exit. If the exit was not caused by + // a signal, Signo is 0. + Signo int +} + +// Signaled returns true if the ExitStatus indicates that the exiting task or +// thread group was killed by a signal. +func (es ExitStatus) Signaled() bool { + return es.Signo != 0 +} + +// Status returns the numeric representation of the ExitStatus returned by e.g. +// the wait4() system call. +func (es ExitStatus) Status() uint32 { + return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) +} + +// ShellExitCode returns the numeric exit code that Bash would return for an +// exit status of es. 
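// Editor's illustrative examples (not part of the original change), using the
// wait-status encoding implemented by Status above:
//
//	ExitStatus{Code: 1}.Status()         // 0x0100: WEXITSTATUS(status) == 1
//	ExitStatus{Signo: 9}.Status()        // 0x0009: WTERMSIG(status) == SIGKILL
//	ExitStatus{Code: 1}.ShellExitCode()  // 1
//	ExitStatus{Signo: 9}.ShellExitCode() // 137, i.e. 128 + SIGKILL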
+func (es ExitStatus) ShellExitCode() int {
+	if es.Signaled() {
+		return 128 + es.Signo
+	}
+	return es.Code
+}
+
+// TaskExitState represents a step in the task exit path.
+//
+// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
+type TaskExitState int
+
+const (
+	// TaskExitNone indicates that the task has not begun exiting.
+	TaskExitNone TaskExitState = iota
+
+	// TaskExitInitiated indicates that the task goroutine has entered the exit
+	// path, and the task is no longer eligible to participate in group stops
+	// or group signal handling. TaskExitInitiated is analogous to Linux's
+	// PF_EXITING.
+	TaskExitInitiated
+
+	// TaskExitZombie indicates that the task has released its resources, and
+	// the task no longer prevents a sibling thread from completing execve.
+	TaskExitZombie
+
+	// TaskExitDead indicates that the task's thread IDs have been released,
+	// and the task no longer prevents its thread group leader from being
+	// reaped. ("Reaping" refers to the transitioning of a task from
+	// TaskExitZombie to TaskExitDead.)
+	TaskExitDead
+)
+
+// String implements fmt.Stringer.
+func (t TaskExitState) String() string {
+	switch t {
+	case TaskExitNone:
+		return "TaskExitNone"
+	case TaskExitInitiated:
+		return "TaskExitInitiated"
+	case TaskExitZombie:
+		return "TaskExitZombie"
+	case TaskExitDead:
+		return "TaskExitDead"
+	default:
+		return strconv.Itoa(int(t))
+	}
+}
+
+// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
+// thread-group-affecting side effects SIGKILL usually has.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) killLocked() {
+	// Clear killable stops.
+	if t.stop != nil && t.stop.Killable() {
+		t.endInternalStopLocked()
+	}
+	t.groupStopRequired = false
+	t.pendingSignals.enqueue(&arch.SignalInfo{
+		Signo: int32(linux.SIGKILL),
+		// Linux just sets SIGKILL in the pending signal bitmask without
+		// enqueueing an actual siginfo, such that
+		// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
+		Code: arch.SignalInfoUser,
+	})
+	t.interrupt()
+}
+
+// killed returns true if t has a SIGKILL pending. killed is analogous to
+// Linux's fatal_signal_pending().
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) killed() bool {
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	return t.killedLocked()
+}
+
+func (t *Task) killedLocked() bool {
+	return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
+}
+
+// PrepareExit indicates an exit with status es.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareExit(es ExitStatus) {
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	t.exitStatus = es
+}
+
+// PrepareGroupExit indicates a group exit with status es to t's thread group.
+//
+// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
+// does not tail-call do_exit(), and that it *does* set Task.exitStatus.
+// (Linux does not do so until within do_exit(), since it reuses exit_code for
+// ptrace.)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareGroupExit(es ExitStatus) {
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	if t.tg.exiting || t.tg.execing != nil {
+		// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
+ // this "group exit" is being executed by the killed sibling of an + // execing task, then Task.Execve never set t.tg.exitStatus, so it's + // still the zero value. This is consistent with Linux, both in intent + // ("all other threads ... report death as if they exited via _exit(2) + // with exit code 0" - ptrace(2), "execve under ptrace") and in + // implementation (compare fs/exec.c:de_thread() => + // kernel/signal.c:zap_other_threads() and + // kernel/exit.c:do_group_exit() => + // include/linux/sched.h:signal_group_exit()). + t.exitStatus = t.tg.exitStatus + return + } + t.tg.exiting = true + t.tg.exitStatus = es + t.exitStatus = es + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if sibling != t { + sibling.killLocked() + } + } +} + +// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill does not wait for tasks to exit. +// +// Kill has no analogue in Linux; it's provided for save/restore only. +func (ts *TaskSet) Kill(es ExitStatus) { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.Root.exiting = true + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + if !t.tg.exiting { + t.tg.exiting = true + t.tg.exitStatus = es + } + t.killLocked() + t.tg.signalHandlers.mu.Unlock() + } +} + +// advanceExitStateLocked checks that t's current exit state is oldExit, then +// sets it to newExit. If t's current exit state is not oldExit, +// advanceExitStateLocked panics. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { + if t.exitState != oldExit { + panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) + } + t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) + t.exitState = newExit +} + +// runExit is the entry point into the task exit path. +type runExit struct{} + +func (*runExit) execute(t *Task) taskRunState { + t.ptraceExit() + return (*runExitMain)(nil) +} + +type runExitMain struct{} + +func (*runExitMain) execute(t *Task) taskRunState { + lastExiter := t.exitThreadGroup() + + // If the task has a cleartid, and the thread group wasn't killed by a + // signal, handle that before releasing the MM. + if t.cleartid != 0 { + t.tg.signalHandlers.mu.Lock() + signaled := t.tg.exiting && t.tg.exitStatus.Signaled() + t.tg.signalHandlers.mu.Unlock() + if !signaled { + if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + t.Futex().Wake(uintptr(t.cleartid), ^uint32(0), 1) + } + // If the CopyOut fails, there's nothing we can do. + } + } + + // Deactivate the address space before releasing the MM. + t.Deactivate() + + // Update the max resident set size before releasing t.tc.mm. + t.tg.pidns.owner.mu.Lock() + t.updateRSSLocked() + t.tg.pidns.owner.mu.Unlock() + + // Release all of the task's resources. + t.mu.Lock() + t.tc.release() + t.tr.release() + t.mu.Unlock() + t.unstopVforkParent() + + // If this is the last task to exit from the thread group, release the + // thread group's resources. + if lastExiter { + t.tg.release() + } + + // Detach tracees. + t.exitPtrace() + + // Reparent the task's children. + t.exitChildren() + + // Don't tail-call runExitNotify, as exitChildren may have initiated a stop + // to wait for a PID namespace to die. + return (*runExitNotify)(nil) +} + +// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread +// group that it is no longer eligible to participate in group activities. 
It
+// returns true if t is the last task in its thread group to call
+// exitThreadGroup.
+func (t *Task) exitThreadGroup() bool {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	t.tg.signalHandlers.mu.Lock()
+	// Can't defer unlock: see below.
+
+	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
+	t.tg.activeTasks--
+	last := t.tg.activeTasks == 0
+
+	// Ensure that someone will handle the signals we can't.
+	t.setSignalMaskLocked(^linux.SignalSet(0))
+
+	// Check if this task's exit interacts with an initiated group stop.
+	if t.tg.groupStopPhase != groupStopInitiated {
+		t.tg.signalHandlers.mu.Unlock()
+		return last
+	}
+	if t.groupStopAcknowledged {
+		// Un-acknowledge the group stop.
+		t.tg.groupStopCount--
+		t.groupStopAcknowledged = false
+		// If the group stop wasn't complete before, then there is still at
+		// least one other task that hasn't acknowledged the group stop, so
+		// it is still not complete now.
+		t.tg.signalHandlers.mu.Unlock()
+		return last
+	}
+	if t.tg.groupStopCount != t.tg.activeTasks {
+		t.tg.signalHandlers.mu.Unlock()
+		return last
+	}
+	t.Debugf("Completing group stop")
+	t.tg.groupStopPhase = groupStopComplete
+	t.tg.groupStopWaitable = true
+	sig := t.tg.groupStopSignal
+	t.tg.groupContNotify = false
+	t.tg.groupContWaitable = false
+	// signalStop must be called with t's signal mutex unlocked.
+	t.tg.signalHandlers.mu.Unlock()
+	if t.tg.leader.parent != nil {
+		t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
+		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
+	}
+	return last
+}
+
+func (t *Task) exitChildren() {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	newParent := t.findReparentTargetLocked()
+	if newParent == nil {
+		// "If the init process of a PID namespace terminates, the kernel
+		// terminates all of the processes in the namespace via a SIGKILL
+		// signal." - pid_namespaces(7)
+		t.Debugf("Init process terminating, killing namespace")
+		t.tg.pidns.exiting = true
+		for other := range t.tg.pidns.tids {
+			if other.tg != t.tg {
+				other.tg.signalHandlers.mu.Lock()
+				other.sendSignalLocked(&arch.SignalInfo{
+					Signo: int32(linux.SIGKILL),
+				}, false /* group */)
+				other.tg.signalHandlers.mu.Unlock()
+			}
+		}
+		// TODO: The init process waits for all processes in the
+		// namespace to exit before completing its own exit
+		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
+		// other tasks in the namespace are dead, except possibly for this
+		// thread group's leader (which can't be reaped until this task exits).
+	}
+	// This is correct even if newParent is nil (it ensures that children don't
+	// wait for a parent to reap them).
+	for c := range t.children {
+		if sig := c.ParentDeathSignal(); sig != 0 {
+			siginfo := &arch.SignalInfo{
+				Signo: int32(sig),
+				Code:  arch.SignalInfoUser,
+			}
+			siginfo.SetPid(int32(c.tg.pidns.tids[t]))
+			siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
+			c.tg.signalHandlers.mu.Lock()
+			c.sendSignalLocked(siginfo, true /* group */)
+			c.tg.signalHandlers.mu.Unlock()
+		}
+		c.reparentLocked(newParent)
+		if newParent != nil {
+			newParent.children[c] = struct{}{}
+		}
+	}
+}
+
+// findReparentTargetLocked returns the task to which t's children should be
+// reparented. If no such task exists, findReparentTargetLocked returns nil.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) findReparentTargetLocked() *Task { + // Reparent to any sibling in the same thread group that hasn't begun + // exiting. + if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { + return t2 + } + // "A child process that is orphaned within the namespace will be + // reparented to [the init process for the namespace] ..." - + // pid_namespaces(7) + if init := t.tg.pidns.tasks[InitTID]; init != nil { + return init.tg.anyNonExitingTaskLocked() + } + return nil +} + +func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.exitState == TaskExitNone { + return t + } + } + return nil +} + +// reparentLocked changes t's parent. The new parent may be nil. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) reparentLocked(parent *Task) { + oldParent := t.parent + t.parent = parent + // If a thread group leader's parent changes, reset the thread group's + // termination signal to SIGCHLD and re-check exit notification. (Compare + // kernel/exit.c:reparent_leader().) + if t != t.tg.leader { + return + } + if oldParent == nil && parent == nil { + return + } + if oldParent != nil && parent != nil && oldParent.tg == parent.tg { + return + } + t.tg.terminationSignal = linux.SIGCHLD + if t.exitParentNotified && !t.exitParentAcked { + t.exitParentNotified = false + t.exitNotifyLocked(false) + } +} + +// When a task exits, other tasks in the system, notably the task's parent and +// ptracer, may want to be notified. The exit notification system ensures that +// interested tasks receive signals and/or are woken from blocking calls to +// wait*() syscalls; these notifications must be resolved before exiting tasks +// can be reaped and disappear from the system. +// +// Each task may have a parent task and/or a tracer task. If both a parent and +// a tracer exist, they may be the same task, different tasks in the same +// thread group, or tasks in different thread groups. (In the last case, Linux +// refers to the task as being ptrace-reparented due to an implementation +// detail; we avoid this terminology to avoid confusion.) +// +// A thread group is *empty* if all non-leader tasks in the thread group are +// dead, and the leader is either a zombie or dead. The exit of a thread group +// leader is never waitable - by either the parent or tracer - until the thread +// group is empty. +// +// There are a few ways for an exit notification to be resolved: +// +// - The exit notification may be acknowledged by a call to Task.Wait with +// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). +// +// - If the notified party is the parent, and the parent thread group is not +// also the tracer thread group, and the notification signal is SIGCHLD, the +// parent may explicitly ignore the notification (see quote in exitNotify). +// Note that it's possible for the notified party to ignore the signal in other +// cases, but the notification is only resolved under the above conditions. +// (Actually, there is one exception; see the last paragraph of the "leader, +// has tracer, tracer thread group is parent thread group" case below.) +// +// - If the notified party is the parent, and the parent does not exist, the +// notification is resolved as if ignored. (This is only possible in the +// sentry. In Linux, the only task / thread group without a parent is global +// init, and killing global init causes a kernel panic.) +// +// - If the notified party is a tracer, the tracer may detach the traced task. 
+// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) +// +// In addition, if the notified party is the parent, the parent may exit and +// cause the notifying task to be reparented to another thread group. This does +// not resolve the notification; instead, the notification must be resent to +// the new parent. +// +// The series of notifications generated for a given task's exit depend on +// whether it is a thread group leader; whether the task is ptraced; and, if +// so, whether the tracer thread group is the same as the parent thread group. +// +// - Non-leader, no tracer: No notification is generated; the task is reaped +// immediately. +// +// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer +// notification is resolved (by waiting or detaching), the task is reaped. (For +// non-leaders, whether the tracer and parent thread groups are the same is +// irrelevant.) +// +// - Leader, no tracer: The task remains a zombie, with no notification sent, +// until all other tasks in the thread group are dead. (In Linux terms, this +// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks +// are removed from their thread_group list in kernel/exit.c:release_task() => +// __exit_signal() => __unhash_process().) Then the thread group's termination +// signal is sent to the parent. When the parent notification is resolved (by +// waiting or ignoring), the task is reaped. +// +// - Leader, has tracer, tracer thread group is not parent thread group: +// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by +// waiting or detaching), and all other tasks in the thread group are dead, the +// thread group's termination signal is sent to the parent. (Note that the +// tracer cannot resolve the exit notification by waiting until the thread +// group is empty.) When the parent notification is resolved, the task is +// reaped. +// +// - Leader, has tracer, tracer thread group is parent thread group: +// +// If all other tasks in the thread group are dead, the thread group's +// termination signal is sent to the parent. At this point, the notification +// can only be resolved by waiting. If the parent detaches from the task as a +// tracer, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// If at least one task in the thread group is not dead, SIGCHLD is sent to the +// parent. At this point, the notification cannot be resolved at all; once the +// thread group becomes empty, it can be resolved only by waiting. If the +// parent detaches from the task as a tracer before all remaining tasks die, +// then exit notification proceeds as in the case where the leader never had a +// tracer. If the parent detaches from the task as a tracer after all remaining +// tasks die, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// In both of the above cases, when the parent detaches from the task as a +// tracer while the thread group is empty, whether or not the parent resolves +// the notification by ignoring it is based on the parent's SIGCHLD signal +// action, whether or not the thread group's termination signal is SIGCHLD +// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). 
+// There is one final wrinkle: A leader can become a non-leader due to a
+// sibling execve. In this case, the execing thread detaches the leader's
+// tracer (if one exists) and reaps the leader immediately. In Linux, this is
+// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
+
+type runExitNotify struct{}
+
+func (*runExitNotify) execute(t *Task) taskRunState {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
+	t.tg.liveTasks--
+	// Check if this completes a sibling's execve.
+	if t.tg.execing != nil && t.tg.liveTasks == 1 {
+		// execing blocks the addition of new tasks to the thread group, so
+		// the sole living task must be the execing one.
+		e := t.tg.execing
+		e.tg.signalHandlers.mu.Lock()
+		if _, ok := e.stop.(*execStop); ok {
+			e.endInternalStopLocked()
+		}
+		e.tg.signalHandlers.mu.Unlock()
+	}
+	t.exitNotifyLocked(false)
+	// The task goroutine will now exit.
+	return nil
+}
+
+// exitNotifyLocked is called after changes to t's state that affect exit
+// notification.
+//
+// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
+// thanks to Linux's haphazard implementation of this functionality, such cases
+// determine whether parent notifications are ignored based on the parent's
+// handling of SIGCHLD, regardless of what the exited task's thread group's
+// termination signal is.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
+	if t.exitState != TaskExitZombie {
+		return
+	}
+	if !t.exitTracerNotified {
+		t.exitTracerNotified = true
+		tracer := t.Tracer()
+		if tracer == nil {
+			t.exitTracerAcked = true
+		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
+			// Don't set exitParentNotified if t is non-leader, even if the
+			// tracer is in the parent thread group, so that if the parent
+			// detaches the following call to exitNotifyLocked passes through
+			// the !exitParentNotified case below and causes t to be reaped
+			// immediately.
+			//
+			// Tracer notification doesn't care about SIG_IGN/SA_NOCLDWAIT.
+			tracer.tg.signalHandlers.mu.Lock()
+			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
+			tracer.tg.signalHandlers.mu.Unlock()
+			// Wake EventTraceeStop waiters as well since this task will never
+			// ptrace-stop again.
+			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
+		} else {
+			// t is a leader and the tracer is in the parent thread group.
+			t.exitParentNotified = true
+			sig := linux.SIGCHLD
+			if t.tg.tasksCount == 1 {
+				sig = t.tg.terminationSignal
+			}
+			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
+			// (in Linux, the check in do_notify_parent() is gated by
+			// !tsk->ptrace.)
+			t.parent.tg.signalHandlers.mu.Lock()
+			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
+			t.parent.tg.signalHandlers.mu.Unlock()
+			// See below for rationale for this event mask.
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + if t.exitTracerAcked && !t.exitParentNotified { + if t != t.tg.leader { + t.exitParentNotified = true + t.exitParentAcked = true + } else if t.tg.tasksCount == 1 { + t.exitParentNotified = true + if t.parent == nil { + t.exitParentAcked = true + } else { + // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is + // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see + // sigaction(2)), then children that terminate do not become + // zombies and a call to wait() or waitpid() will block until all + // children have terminated, and then fail with errno set to + // ECHILD. (The original POSIX standard left the behavior of + // setting SIGCHLD to SIG_IGN unspecified. Note that even though + // the default disposition of SIGCHLD is "ignore", explicitly + // setting the disposition to SIG_IGN results in different + // treatment of zombie process children.) Linux 2.6 conforms to + // this specification." - wait(2) + // + // Some undocumented Linux-specific details: + // + // - All of the above is ignored if the termination signal isn't + // SIGCHLD. + // + // - SA_NOCLDWAIT causes the leader to be immediately reaped, but + // does not suppress the SIGCHLD. + signalParent := t.tg.terminationSignal.IsValid() + t.parent.tg.signalHandlers.mu.Lock() + if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { + if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { + if act.Handler == arch.SignalActIgnore { + t.exitParentAcked = true + signalParent = false + } else if act.Flags&arch.SignalFlagNoCldWait != 0 { + t.exitParentAcked = true + } + } + } + if signalParent { + t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */) + } + t.parent.tg.signalHandlers.mu.Unlock() + // If a task in the parent was waiting for a child group stop + // or continue, it needs to be notified of the exit, because + // there may be no remaining eligible tasks (so that wait + // should return ECHILD). + t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + } + if t.exitTracerAcked && t.exitParentAcked { + t.advanceExitStateLocked(TaskExitZombie, TaskExitDead) + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid := ns.tids[t] + delete(ns.tasks, tid) + delete(ns.tids, t) + } + t.tg.exitedCPUStats.Accumulate(t.CPUStats()) + t.tg.ioUsage.Accumulate(t.ioUsage) + t.tg.signalHandlers.mu.Lock() + t.tg.tasks.Remove(t) + if t.tg.lastTimerSignalTask == t { + t.tg.lastTimerSignalTask = nil + } + t.tg.tasksCount-- + tc := t.tg.tasksCount + t.tg.signalHandlers.mu.Unlock() + if tc == 1 && t != t.tg.leader { + // Our fromPtraceDetach doesn't matter here (in Linux terms, this + // is via a call to release_task()). + t.tg.leader.exitNotifyLocked(false) + } else if tc == 0 { + t.tg.processGroup.decRefWithParent(t.tg.parentPG()) + } + if t.parent != nil { + delete(t.parent.children, t) + t.parent = nil + } + } +} + +// Preconditions: The TaskSet mutex must be locked. 
+func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + } + info.SetPid(int32(receiver.tg.pidns.tids[t])) + info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + if t.exitStatus.Signaled() { + info.Code = arch.CLD_KILLED + info.SetStatus(int32(t.exitStatus.Signo)) + } else { + info.Code = arch.CLD_EXITED + info.SetStatus(int32(t.exitStatus.Code)) + } + // TODO: Set utime, stime. + return info +} + +// ExitStatus returns t's exit status, which is only guaranteed to be +// meaningful if t.ExitState() != TaskExitNone. +func (t *Task) ExitStatus() ExitStatus { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.exitStatus +} + +// ExitStatus returns the exit status that would be returned by a consuming +// wait*() on tg. +func (tg *ThreadGroup) ExitStatus() ExitStatus { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting { + return tg.exitStatus + } + return tg.leader.exitStatus +} + +// TerminationSignal returns the thread group's termination signal. +func (tg *ThreadGroup) TerminationSignal() linux.Signal { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.terminationSignal +} + +// Task events that can be waited for. +const ( + // EventExit represents an exit notification generated for a child thread + // group leader or a tracee under the conditions specified in the comment + // above runExitNotify. + EventExit waiter.EventMask = 1 << iota + + // EventChildGroupStop occurs when a child thread group completes a group + // stop (i.e. all tasks in the child thread group have entered a stopped + // state as a result of a group stop). + EventChildGroupStop + + // EventTraceeStop occurs when a task that is ptraced by a task in the + // notified thread group enters a ptrace stop (see ptrace(2)). + EventTraceeStop + + // EventGroupContinue occurs when a child thread group, or a thread group + // whose leader is ptraced by a task in the notified thread group, that had + // initiated or completed a group stop leaves the group stop, due to the + // child thread group or any task in the child thread group being sent + // SIGCONT. + EventGroupContinue +) + +// WaitOptions controls the behavior of Task.Wait. +type WaitOptions struct { + // If SpecificTID is non-zero, only events from the task with thread ID + // SpecificTID are eligible to be waited for. SpecificTID is resolved in + // the PID namespace of the waiter (the method receiver of Task.Wait). If + // no such task exists, or that task would not otherwise be eligible to be + // waited for by the waiting task, then there are no waitable tasks and + // Wait will return ECHILD. + SpecificTID ThreadID + + // If SpecificPGID is non-zero, only events from ThreadGroups with a + // matching ProcessGroupID are eligible to be waited for. (Same + // constraints as SpecificTID apply.) + SpecificPGID ProcessGroupID + + // Terminology note: Per waitpid(2), "a clone child is one which delivers + // no signal, or a signal other than SIGCHLD to its parent upon + // termination." In Linux, termination signal is technically a per-task + // property rather than a per-thread-group property. 
However, clone() + // forces no termination signal for tasks created with CLONE_THREAD, and + // execve() resets the termination signal to SIGCHLD, so all + // non-group-leader threads have no termination signal and are therefore + // "clone tasks". + + // If NonCloneTasks is true, events from non-clone tasks are eligible to be + // waited for. + NonCloneTasks bool + + // If CloneTasks is true, events from clone tasks are eligible to be waited + // for. + CloneTasks bool + + // Events is a bitwise combination of the events defined above that specify + // what events are of interest to the call to Wait. + Events waiter.EventMask + + // If ConsumeEvent is true, the Wait should consume the event such that it + // cannot be returned by a future Wait. Note that if a task exit is + // consumed in this way, in most cases the task will be reaped. + ConsumeEvent bool + + // If BlockInterruptErr is not nil, Wait will block until either an event + // is available or there are no tasks that could produce a waitable event; + // if that blocking is interrupted, Wait returns BlockInterruptErr. If + // BlockInterruptErr is nil, Wait will not block. + BlockInterruptErr error +} + +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace) bool { + if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { + return false + } + if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { + return false + } + if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { + return o.NonCloneTasks + } + return o.CloneTasks +} + +// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. +// waitpid(WNOHANG)) that find no waitable events, but determine that waitable +// events may exist in the future. (In contrast, if a non-blocking or blocking +// Wait determines that there are no tasks that can produce a waitable event, +// Task.Wait returns ECHILD.) +var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") + +// WaitResult contains information about a waited-for event. +type WaitResult struct { + // Task is the task that reported the event. + Task *Task + + // TID is the thread ID of Task in the PID namespace of the task that + // called Wait (that is, the method receiver of the call to Task.Wait). TID + // is provided because consuming exit waits cause the thread ID to be + // deallocated. + TID ThreadID + + // UID is the real UID of Task in the user namespace of the task that + // called Wait. + UID auth.UID + + // Event is exactly one of the events defined above. + Event waiter.EventMask + + // Status is the numeric status associated with the event. + Status uint32 +} + +// Wait waits for an event from a thread group that is a child of t's thread +// group, or a task in such a thread group, or a task that is ptraced by t, +// subject to the options specified in opts. +func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { + if opts.BlockInterruptErr == nil { + return t.waitOnce(opts) + } + w, ch := waiter.NewChannelEntry(nil) + t.tg.eventQueue.EventRegister(&w, opts.Events) + defer t.tg.eventQueue.EventUnregister(&w) + for { + wr, err := t.waitOnce(opts) + if err != ErrNoWaitableEvent { + // This includes err == nil. 
+ return wr, err + } + if err := t.Block(ch); err != nil { + return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + } + } +} + +func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { + anyWaitableTasks := false + + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + + // Without the (unimplemented) __WNOTHREAD flag, a task can wait on the + // children and tracees of any task in the same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, nil + } + } + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. (Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, nil + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, nil + } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns) { + continue + } + // Non-leaders do notify tracers on exit. + if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, nil + } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, nil + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, nil + } + } + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { + if asPtracer && !target.exitTracerNotified { + return nil + } + if !asPtracer && !target.exitParentNotified { + return nil + } + // Zombied thread group leaders are never waitable until their thread group + // is otherwise empty. Usually this is caught by the + // target.exitParentNotified check above, but if t is both (in the thread + // group of) target's tracer and parent, asPtracer may be true. 
+ if target == target.tg.leader && target.tg.tasksCount != 1 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + status := target.exitStatus.Status() + if !opts.ConsumeEvent { + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } + } + // Surprisingly, the exit status reported by a non-consuming wait can + // differ from that reported by a consuming wait; the latter will return + // the group exit code if one is available. + if target.tg.exiting { + status = target.tg.exitStatus.Status() + } + // t may be (in the thread group of) target's parent, tracer, or both. We + // don't need to check for !exitTracerAcked because tracees are detached + // here, and we don't need to check for !exitParentAcked because zombies + // will be reaped here. + if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { + target.exitTracerAcked = true + target.ptraceTracer.Store((*Task)(nil)) + delete(t.ptraceTracees, target) + } + if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { + target.exitParentAcked = true + if target == target.tg.leader { + // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, + // and won't until after target.exitNotifyLocked() (maybe). Include + // target.CPUStats() explicitly. This is consistent with Linux, + // which accounts an exited task's cputime to its thread group in + // kernel/exit.c:release_task() => __exit_signal(), and uses + // thread_group_cputime_adjusted() in wait_task_zombie(). + t.tg.childCPUStats.Accumulate(target.CPUStats()) + t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) + t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) + // Update t's child max resident set size. The size will be the maximum + // of this thread's size and all its childrens' sizes. + if t.tg.childMaxRSS < target.tg.maxRSS { + t.tg.childMaxRSS = target.tg.maxRSS + } + if t.tg.childMaxRSS < target.tg.childMaxRSS { + t.tg.childMaxRSS = target.tg.childMaxRSS + } + } + } + target.exitNotifyLocked(false) + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } +} + +// updateRSSLocked updates t.tg.maxRSS. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) updateRSSLocked() { + if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { + t.tg.maxRSS = mmMaxRSS + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupStopWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + sig := target.tg.groupStopSignal + if opts.ConsumeEvent { + target.tg.groupStopWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + // There is no name for these status constants. + Status: (uint32(sig)&0xff)<<8 | 0x7f, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. 
+func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupContWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + if opts.ConsumeEvent { + target.tg.groupContWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventGroupContinue, + Status: 0xffff, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + if target.ptraceCode == 0 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + code := target.ptraceCode + if opts.ConsumeEvent { + target.ptraceCode = 0 + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventTraceeStop, + Status: uint32(code)<<8 | 0x7f, + } +} + +// ExitState returns t's current progress through the exit path. +func (t *Task) ExitState() TaskExitState { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.exitState +} + +// ParentDeathSignal returns t's parent death signal. +func (t *Task) ParentDeathSignal() linux.Signal { + t.mu.Lock() + defer t.mu.Unlock() + return t.parentDeathSignal +} + +// SetParentDeathSignal sets t's parent death signal. +func (t *Task) SetParentDeathSignal(sig linux.Signal) { + t.mu.Lock() + defer t.mu.Unlock() + t.parentDeathSignal = sig +} diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go new file mode 100644 index 000000000..a51fa9d7e --- /dev/null +++ b/pkg/sentry/kernel/task_identity.go @@ -0,0 +1,557 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials returns t's credentials by value. +func (t *Task) Credentials() auth.Credentials { + t.mu.Lock() + defer t.mu.Unlock() + return *t.creds // Copy out with lock held. +} + +// UserNamespace returns the user namespace associated with the task. +func (t *Task) UserNamespace() *auth.UserNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.UserNamespace +} + +// HasCapabilityIn checks if the task has capability cp in user namespace ns. +func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapabilityIn(cp, ns) +} + +// HasCapability checks if the task has capability cp in its user namespace. 
+func (t *Task) HasCapability(cp linux.Capability) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapability(cp) +} + +// SetUID implements the semantics of setuid(2). +func (t *Task) SetUID(uid auth.UID) error { + // setuid considers -1 to be invalid. + if !uid.Ok() { + return syserror.EINVAL + } + t.mu.Lock() + defer t.mu.Unlock() + kuid := t.creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + // "setuid() sets the effective user ID of the calling process. If the + // effective UID of the caller is root (more precisely: if the caller has + // the CAP_SETUID capability), the real UID and saved set-user-ID are also + // set." - setuid(2) + if t.creds.HasCapability(linux.CAP_SETUID) { + t.setKUIDsUncheckedLocked(kuid, kuid, kuid) + return nil + } + // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID + // capability) and uid does not match the real UID or saved set-user-ID of + // the calling process." + if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID { + return syserror.EPERM + } + t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID) + return nil +} + +// SetREUID implements the semantics of setreuid(2). +func (t *Task) SetREUID(r, e auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Supplying a value of -1 for either the real or effective user ID forces + // the system to leave that ID unchanged." - setreuid(2) + newR := t.creds.RealKUID + if r.Ok() { + newR = t.creds.UserNamespace.MapToKUID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := t.creds.EffectiveKUID + if e.Ok() { + newE = t.creds.UserNamespace.MapToKUID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !t.creds.HasCapability(linux.CAP_SETUID) { + // "Unprivileged processes may only set the effective user ID to the + // real user ID, the effective user ID, or the saved set-user-ID." + if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID { + return syserror.EPERM + } + // "Unprivileged users may only set the real user ID to the real user + // ID or the effective user ID." + if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID { + return syserror.EPERM + } + } + // "If the real user ID is set (i.e., ruid is not -1) or the effective user + // ID is set to a value not equal to the previous real user ID, the saved + // set-user-ID will be set to the new effective user ID." + newS := t.creds.SavedKUID + if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) { + newS = newE + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESUID implements the semantics of the setresuid(2) syscall. +func (t *Task) SetRESUID(r, e, s auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Unprivileged user processes may change the real UID, effective UID, and + // saved set-user-ID, each to one of: the current real UID, the current + // effective UID or the current saved set-user-ID. Privileged processes (on + // Linux, those having the CAP_SETUID capability) may set the real UID, + // effective UID, and saved set-user-ID to arbitrary values. If one of the + // arguments equals -1, the corresponding value is not changed." 
- + // setresuid(2) + var err error + newR := t.creds.RealKUID + if r.Ok() { + newR, err = t.creds.UseUID(r) + if err != nil { + return err + } + } + newE := t.creds.EffectiveKUID + if e.Ok() { + newE, err = t.creds.UseUID(e) + if err != nil { + return err + } + } + newS := t.creds.SavedKUID + if s.Ok() { + newS, err = t.creds.UseUID(s) + if err != nil { + return err + } + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// Preconditions: t.mu must be locked. +func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { + root := t.creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID + t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS + + // "1. If one or more of the real, effective or saved set user IDs was + // previously 0, and as a result of the UID changes all of these IDs have a + // nonzero value, then all capabilities are cleared from the permitted and + // effective capability sets." - capabilities(7) + if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { + // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's + // "keep capabilities" flag, which determines whether the thread's permitted + // capability set is cleared when a change is made to the + // thread's user IDs such that the thread's real UID, effective + // UID, and saved set-user-ID all become nonzero when at least + // one of them previously had the value 0. By default, the + // permitted capability set is cleared when such a change is + // made; setting the "keep capabilities" flag prevents it from + // being cleared." (A thread's effective capability set is always + // cleared when such a credential change is made, + // regardless of the setting of the "keep capabilities" flag.) + if !t.creds.KeepCaps { + t.creds.PermittedCaps = 0 + t.creds.EffectiveCaps = 0 + } + } + // """ + // 2. If the effective user ID is changed from 0 to nonzero, then all + // capabilities are cleared from the effective set. + // + // 3. If the effective user ID is changed from nonzero to 0, then the + // permitted set is copied to the effective set. + // """ + if oldE == root && newE != root { + t.creds.EffectiveCaps = 0 + } else if oldE != root && newE == root { + t.creds.EffectiveCaps = t.creds.PermittedCaps + } + // "4. If the filesystem user ID is changed from 0 to nonzero (see + // setfsuid(2)), then the following capabilities are cleared from the + // effective set: ..." + // (filesystem UIDs aren't implemented, nor are any of the capabilities in + // question) + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + if oldE != newE { + t.parentDeathSignal = 0 + } +} + +// SetGID implements the semantics of setgid(2). +func (t *Task) SetGID(gid auth.GID) error { + if !gid.Ok() { + return syserror.EINVAL + } + t.mu.Lock() + defer t.mu.Unlock() + kgid := t.creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + if t.creds.HasCapability(linux.CAP_SETGID) { + t.setKGIDsUncheckedLocked(kgid, kgid, kgid) + return nil + } + if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID { + return syserror.EPERM + } + t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID) + return nil +} + +// SetREGID implements the semantics of setregid(2). 
+func (t *Task) SetREGID(r, e auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + newR := t.creds.RealKGID + if r.Ok() { + newR = t.creds.UserNamespace.MapToKGID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := t.creds.EffectiveKGID + if e.Ok() { + newE = t.creds.UserNamespace.MapToKGID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !t.creds.HasCapability(linux.CAP_SETGID) { + if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID { + return syserror.EPERM + } + if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID { + return syserror.EPERM + } + } + newS := t.creds.SavedKGID + if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) { + newS = newE + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESGID implements the semantics of the setresgid(2) syscall. +func (t *Task) SetRESGID(r, e, s auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + var err error + newR := t.creds.RealKGID + if r.Ok() { + newR, err = t.creds.UseGID(r) + if err != nil { + return err + } + } + newE := t.creds.EffectiveKGID + if e.Ok() { + newE, err = t.creds.UseGID(e) + if err != nil { + return err + } + } + newS := t.creds.SavedKGID + if s.Ok() { + newS, err = t.creds.UseGID(s) + if err != nil { + return err + } + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { + oldE := t.creds.EffectiveKGID + t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + if oldE != newE { + t.parentDeathSignal = 0 + } +} + +// SetExtraGIDs attempts to change t's supplemental groups. All IDs are +// interpreted as being in t's user namespace. +func (t *Task) SetExtraGIDs(gids []auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + if !t.creds.HasCapability(linux.CAP_SETGID) { + return syserror.EPERM + } + kgids := make([]auth.KGID, len(gids)) + for i, gid := range gids { + kgid := t.creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + kgids[i] = kgid + } + t.creds.ExtraKGIDs = kgids + return nil +} + +// SetCapabilitySets attempts to change t's permitted, inheritable, and +// effective capability sets. +func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Permitted: This is a limiting superset for the effective capabilities + // that the thread may assume." - capabilities(7) + if effective & ^permitted != 0 { + return syserror.EPERM + } + // "It is also a limiting superset for the capabilities that may be added + // to the inheritable set by a thread that does not have the CAP_SETPCAP + // capability in its effective set." + if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) { + return syserror.EPERM + } + // "If a thread drops a capability from its permitted set, it can never + // reacquire that capability (unless it execve(2)s ..." + if permitted & ^t.creds.PermittedCaps != 0 { + return syserror.EPERM + } + // "... if a capability is not in the bounding set, then a thread can't add + // this capability to its inheritable set, even if it was in its permitted + // capabilities ..." 
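+	// For example (hypothetical values): if CAP_NET_ADMIN has been dropped
+	// from the bounding set and is not already inheritable, then a request
+	// that adds CAP_NET_ADMIN to the inheritable set fails the check below
+	// with EPERM, even if CAP_NET_ADMIN is still in the permitted set.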
+ if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { + return syserror.EPERM + } + t.creds.PermittedCaps = permitted + t.creds.InheritableCaps = inheritable + t.creds.EffectiveCaps = effective + return nil +} + +// DropBoundingCapability attempts to drop capability cp from t's capability +// bounding set. +func (t *Task) DropBoundingCapability(cp linux.Capability) error { + t.mu.Lock() + defer t.mu.Unlock() + if !t.creds.HasCapability(linux.CAP_SETPCAP) { + return syserror.EPERM + } + t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + return nil +} + +// SetUserNamespace attempts to move c into ns. +func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { + t.mu.Lock() + defer t.mu.Unlock() + + // "A process reassociating itself with a user namespace must have the + // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) + // + // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN + // in ns (by rule 3 in auth.Credentials.HasCapability). + if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + return syserror.EPERM + } + + t.creds.UserNamespace = ns + // "The child process created by clone(2) with the CLONE_NEWUSER flag + // starts out with a complete set of capabilities in the new user + // namespace. Likewise, a process that creates a new user namespace using + // unshare(2) or joins an existing user namespace using setns(2) gains a + // full set of capabilities in that namespace." + t.creds.PermittedCaps = auth.AllCapabilities + t.creds.InheritableCaps = 0 + t.creds.EffectiveCaps = auth.AllCapabilities + t.creds.BoundingCaps = auth.AllCapabilities + // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER + // flag sets the "securebits" flags (see capabilities(7)) to their default + // values (all flags disabled) in the child (for clone(2)) or caller (for + // unshare(2), or setns(2)." - user_namespaces(7) + t.creds.KeepCaps = false + + return nil +} + +// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. +func (t *Task) SetKeepCaps(k bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.creds.KeepCaps = k +} + +// updateCredsForExec updates t.creds to reflect an execve(). +// +// NOTE: We currently do not implement privileged executables +// (set-user/group-ID bits and file capabilities). This allows us to make a lot +// of simplifying assumptions: +// +// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which +// disables the features we don't support anyway, is always set. This +// drastically simplifies this function. +// +// - We don't implement AT_SECURE, because no_new_privs always being set means +// that the conditions that require AT_SECURE never arise. (Compare Linux's +// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) +// +// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since +// seccomp-bpf is also allowed if the task has no_new_privs set. +// +// - Task.ptraceAttach does not serialize with execve as it does in Linux, +// since no_new_privs being set has the same effect as the presence of an +// unprivileged tracer. +// +// Preconditions: t.mu must be locked. +func (t *Task) updateCredsForExecLocked() { + // """ + // During an execve(2), the kernel calculates the new capabilities of + // the process using the following algorithm: + // + // P'(permitted) = (P(inheritable) & F(inheritable)) | + // (F(permitted) & cap_bset) + // + // P'(effective) = F(effective) ? 
P'(permitted) : 0 + // + // P'(inheritable) = P(inheritable) [i.e., unchanged] + // + // where: + // + // P denotes the value of a thread capability set before the + // execve(2) + // + // P' denotes the value of a thread capability set after the + // execve(2) + // + // F denotes a file capability set + // + // cap_bset is the value of the capability bounding set + // + // ... + // + // In order to provide an all-powerful root using capability sets, during + // an execve(2): + // + // 1. If a set-user-ID-root program is being executed, or the real user ID + // of the process is 0 (root) then the file inheritable and permitted sets + // are defined to be all ones (i.e. all capabilities enabled). + // + // 2. If a set-user-ID-root program is being executed, then the file + // effective bit is defined to be one (enabled). + // + // The upshot of the above rules, combined with the capabilities + // transformations described above, is that when a process execve(2)s a + // set-user-ID-root program, or when a process with an effective UID of 0 + // execve(2)s a program, it gains all capabilities in its permitted and + // effective capability sets, except those masked out by the capability + // bounding set. + // """ - capabilities(7) + // (ambient capability sets omitted) + // + // As the last paragraph implies, the case of "a set-user-ID root program + // is being executed" also includes the case where (namespace) root is + // executing a non-set-user-ID program; the actual check is just based on + // the effective user ID. + var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 + fileEffective := false + root := t.creds.UserNamespace.MapToKUID(auth.RootUID) + if t.creds.EffectiveKUID == root || t.creds.RealKUID == root { + newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps + if t.creds.EffectiveKUID == root { + fileEffective = true + } + } + + // Now we enter poorly-documented, somewhat confusing territory. (The + // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds + // is not very helpful.) My reading of it is: + // + // If at least one of the following is true: + // + // A1. The execing task is ptraced, and the tracer did not have + // CAP_SYS_PTRACE in the execing task's user namespace at the time of + // PTRACE_ATTACH. + // + // A2. The execing task shares its FS context with at least one task in + // another thread group. + // + // A3. The execing task has no_new_privs set. + // + // AND at least one of the following is true: + // + // B1. The new effective user ID (which may come from set-user-ID, or be the + // execing task's existing effective user ID) is not equal to the task's + // real UID. + // + // B2. The new effective group ID (which may come from set-group-ID, or be + // the execing task's existing effective group ID) is not equal to the + // task's real GID. + // + // B3. The new permitted capability set contains capabilities not in the + // task's permitted capability set. + // + // Then: + // + // C1. Limit the new permitted capability set to the task's permitted + // capability set. + // + // C2. If either the task does not have CAP_SETUID in its user namespace, or + // the task has no_new_privs set, force the new effective UID and GID to + // the task's real UID and GID. + // + // But since no_new_privs is always set (A3 is always true), this becomes + // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 + // is a no-op. So we can just do C1 and C2 unconditionally. 
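+	// To illustrate the simplification with concrete (hypothetical) values:
+	// a caller with RealKUID 1000 and EffectiveKUID 0 (namespace root) took
+	// the branch above, so newPermitted = InheritableCaps | BoundingCaps and
+	// fileEffective is true; C2 below then forces the effective IDs back to
+	// the real IDs, and C1 intersects newPermitted with the existing
+	// permitted set. A caller with no root IDs leaves newPermitted as 0, so
+	// both the permitted and effective sets end up cleared.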
+	if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
+		t.creds.EffectiveKUID = t.creds.RealKUID
+		t.creds.EffectiveKGID = t.creds.RealKGID
+		t.parentDeathSignal = 0
+	}
+	// (Saved set-user-ID is always set to the new effective user ID, and saved
+	// set-group-ID is always set to the new effective group ID, regardless of
+	// the above.)
+	t.creds.SavedKUID = t.creds.RealKUID
+	t.creds.SavedKGID = t.creds.RealKGID
+	t.creds.PermittedCaps &= newPermitted
+	if fileEffective {
+		t.creds.EffectiveCaps = t.creds.PermittedCaps
+	} else {
+		t.creds.EffectiveCaps = 0
+	}
+
+	// prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
+	// calls to execve(2).
+	t.creds.KeepCaps = false
+
+	// "The bounding set is inherited at fork(2) from the thread's parent, and
+	// is preserved across an execve(2)". So we're done.
+}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
new file mode 100644
index 000000000..18efacb19
--- /dev/null
+++ b/pkg/sentry/kernel/task_log.go
@@ -0,0 +1,137 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"sort"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+	// maxStackDebugBytes is the maximum number of user stack bytes that may be
+	// printed by debugDumpStack.
+	maxStackDebugBytes = 1024
+)
+
+// Infof logs a formatted info message by calling log.Infof.
+func (t *Task) Infof(fmt string, v ...interface{}) {
+	if log.IsLogging(log.Info) {
+		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+	}
+}
+
+// Warningf logs a formatted warning message by calling log.Warningf.
+func (t *Task) Warningf(fmt string, v ...interface{}) {
+	if log.IsLogging(log.Warning) {
+		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+	}
+}
+
+// Debugf logs a formatted debug message, prefixed with the task ID, by
+// calling log.Debugf.
+func (t *Task) Debugf(fmt string, v ...interface{}) {
+	if log.IsLogging(log.Debug) {
+		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+	}
+}
+
+// IsLogging returns true iff this level is being logged.
+func (t *Task) IsLogging(level log.Level) bool {
+	return log.IsLogging(level)
+}
+
+// DebugDumpState logs task state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) DebugDumpState() {
+	t.debugDumpRegisters()
+	t.debugDumpStack()
+	if mm := t.MemoryManager(); mm != nil {
+		t.Debugf("Mappings:\n%s", mm)
+	}
+	t.Debugf("FDMap:\n%s", t.FDMap())
+}
+
+// debugDumpRegisters logs register state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpRegisters() { + if !t.IsLogging(log.Debug) { + return + } + regmap, err := t.Arch().RegisterMap() + if err != nil { + t.Debugf("Registers: %v", err) + } else { + t.Debugf("Registers:") + var regs []string + for reg := range regmap { + regs = append(regs, reg) + } + sort.Strings(regs) + for _, reg := range regs { + t.Debugf("%-8s = %016x", reg, regmap[reg]) + } + } +} + +// debugDumpStack logs user stack contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpStack() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application stack dump.") + return + } + t.Debugf("Stack:") + start := usermem.Addr(t.Arch().Stack()) + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + +// updateLogPrefix updates the task's cached log prefix to reflect its +// current thread ID. +// +// Preconditions: The task's owning TaskSet.mu must be locked. +func (t *Task) updateLogPrefixLocked() { + // Use the task's TID in the root PID namespace for logging. + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) +} diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go new file mode 100644 index 000000000..4df2e53d3 --- /dev/null +++ b/pkg/sentry/kernel/task_net.go @@ -0,0 +1,35 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +// IsNetworkNamespaced returns true if t is in a non-root network namespace. +func (t *Task) IsNetworkNamespaced() bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns +} + +// NetworkContext returns the network stack used by the task. NetworkContext +// may return nil if no network stack is available. +func (t *Task) NetworkContext() inet.Stack { + if t.IsNetworkNamespaced() { + return nil + } + return t.k.networkStack +} diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go new file mode 100644 index 000000000..e529f0c2d --- /dev/null +++ b/pkg/sentry/kernel/task_resources.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// TaskResources is the subset of a task's data provided by its creator that is +// not provided by the loader. +type TaskResources struct { + // SignalMask is the set of signals whose delivery is currently blocked. + // + // FIXME: Determine if we also need RealSignalMask + SignalMask linux.SignalSet + + // FSContext is the filesystem context. + *FSContext + + // FDMap provides access to files to the task. + *FDMap + + // Tracks abstract sockets that are in use. + AbstractSockets *AbstractSocketNamespace +} + +// newTaskResources returns a new TaskResources, taking an additional reference +// on fdm. +func newTaskResources(fdm *FDMap, fc *FSContext) *TaskResources { + fdm.IncRef() + return &TaskResources{ + FDMap: fdm, + FSContext: fc, + AbstractSockets: NewAbstractSocketNamespace(), + } +} + +// release releases all resources held by the TaskResources. release is called +// by the task when it exits. +func (tr *TaskResources) release() { + tr.FDMap.DecRef() + tr.FDMap = nil + tr.FSContext.DecRef() + tr.FSContext = nil + tr.AbstractSockets = nil +} + +// Fork returns a duplicate of tr. +// +// FIXME: Preconditions: When tr is owned by a Task, that task's +// signal mutex must be locked, or Fork must be called by the task's goroutine. +func (tr *TaskResources) Fork(shareFiles bool, shareFSContext bool) *TaskResources { + var fdmap *FDMap + if shareFiles { + fdmap = tr.FDMap + fdmap.IncRef() + } else { + fdmap = tr.FDMap.Fork() + } + + var fsc *FSContext + if shareFSContext { + fsc = tr.FSContext + fsc.IncRef() + } else { + fsc = tr.FSContext.Fork() + } + + return &TaskResources{ + SignalMask: tr.SignalMask, + FDMap: fdmap, + FSContext: fsc, + AbstractSockets: tr.AbstractSockets, + } +} + +// FDMap returns t's FDMap. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) FDMap() *FDMap { + return t.tr.FDMap +} + +// FSContext returns t's FSContext. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) FSContext() *FSContext { + return t.tr.FSContext +} + +// MountNamespace returns t's MountNamespace. MountNamespace does not take an additional +// reference on the returned MountNamespace. +func (t *Task) MountNamespace() *fs.MountNamespace { + return t.k.mounts +} + +// AbstractSockets returns t's AbstractSocketNamespace. +func (t *Task) AbstractSockets() *AbstractSocketNamespace { + return t.tr.AbstractSockets +} + +// IsChrooted returns true if the root directory of t's FSContext is not the +// root directory of t's MountNamespace. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. 
+func (t *Task) IsChrooted() bool {
+	realRoot := t.k.mounts.Root()
+	defer realRoot.DecRef()
+	return t.tr.FSContext.root != realRoot
+}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
new file mode 100644
index 000000000..94ce5582b
--- /dev/null
+++ b/pkg/sentry/kernel/task_run.go
@@ -0,0 +1,346 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"bytes"
+	"runtime"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils to avoid unnecessary allocation.
+type taskRunState interface {
+	// execute executes the code associated with this state over the given task
+	// and returns the following state. If execute returns nil, the task
+	// goroutine should exit.
+	//
+	// It is valid to tail-call a following state's execute to avoid the
+	// overhead of converting the following state to an interface object and
+	// checking for stops, provided that the tail-call cannot recurse.
+	execute(*Task) taskRunState
+}
+
+// run runs the task goroutine.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace
+// to make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
+func (t *Task) run(threadID uintptr) {
+	// Construct t.blockingTimer here. We do this here because we can't
+	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
+	// kernel.timekeeper.SetClocks() hasn't been called yet.
+	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+	defer t.blockingTimer.Destroy()
+	t.blockingTimerChan = blockingTimerChan
+
+	// Activate our address space.
+	t.Activate()
+	// The corresponding t.Deactivate occurs in the exit path
+	// (runExitMain.execute) so that when
+	// Platform.CooperativelySharesAddressSpace() == true, we give up the
+	// AddressSpace before the task goroutine finishes executing.
+
+	// Ensure that thread group timers for execution time reflect that this
+	// task now exists.
+ t.tg.tm.kick() + + // If this is a newly-started task, it should check for participation in + // group stops. If this is a task resuming after restore, it was + // interrupted by saving. In either case, the task is initially + // interrupted. + t.interruptSelf() + + for { + // Explanation for this ordering: + // + // - A freshly-started task that is stopped should not do anything + // before it enters the stop. + // + // - If taskRunState.execute returns nil, the task goroutine should + // exit without checking for a stop. + // + // - Task.Start won't start Task.run if t.runState is nil, so this + // ordering is safe. + t.doStop() + t.runState = t.runState.execute(t) + if t.runState == nil { + t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) + t.goroutineStopped.Done() + t.tg.liveGoroutines.Done() + t.tg.pidns.owner.liveGoroutines.Done() + t.tg.pidns.owner.runningGoroutines.Done() + + // Keep argument alive because stack trace for dead variables may not be correct. + runtime.KeepAlive(threadID) + return + } + } +} + +// doStop is called by Task.run to block until the task is not stopped. +func (t *Task) doStop() { + if atomic.LoadInt32(&t.stopCount) == 0 { + return + } + t.Deactivate() + // NOTE: t.Activate() must be called without any locks held, so + // this defer must precede the defer for unlocking the signal mutex. + defer t.Activate() + t.accountTaskGoroutineEnter(TaskGoroutineStopped) + defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.runningGoroutines.Add(-1) + defer t.tg.pidns.owner.runningGoroutines.Add(1) + t.goroutineStopped.Add(-1) + defer t.goroutineStopped.Add(1) + for t.stopCount > 0 { + t.endStopCond.Wait() + } +} + +// The runApp state checks for interrupts before executing untrusted +// application code. +type runApp struct{} + +func (*runApp) execute(t *Task) taskRunState { + if t.interrupted() { + // Checkpointing instructs tasks to stop by sending an interrupt, so we + // must check for stops before entering runInterrupt (instead of + // tail-calling it). + return (*runInterrupt)(nil) + } + + // We're about to switch to the application again. If there's still a + // unhandled SyscallRestartErrno that wasn't translated to an EINTR, + // restart the syscall that was interrupted. If there's a saved signal + // mask, restore it. (Note that restoring the saved signal mask may unblock + // a pending signal, causing another interruption, but that signal should + // not interact with the interrupted syscall.) + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == ERESTART_RESTARTBLOCK { + t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscallWithRestartBlock() + } else { + t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscall() + } + } + t.haveSyscallReturn = false + } + if t.haveSavedSignalMask { + t.SetSignalMask(t.savedSignalMask) + t.haveSavedSignalMask = false + if t.interrupted() { + return (*runInterrupt)(nil) + } + } + + // Apply restartable sequences. 
+ if t.rseqPreempted { + t.rseqPreempted = false + if t.rseqCPUAddr != 0 { + if err := t.rseqCopyOutCPU(); err != nil { + t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(sigPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + } + t.rseqInterrupt() + } + + // Check if we need to enable single-stepping. Tracers expect that the + // kernel preserves the value of the single-step flag set by PTRACE_SETREGS + // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this + // includes our ptrace platform, by the way), so we should only clear the + // single-step flag if we're responsible for setting it. (clearSinglestep + // is therefore analogous to Linux's TIF_FORCED_TF.) + // + // Strictly speaking, we should also not clear the single-step flag if we + // single-step through an instruction that sets the single-step flag + // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their + // own TF. (Famous last words, I know.) + clearSinglestep := false + if t.hasTracer() { + t.tg.pidns.owner.mu.RLock() + if t.ptraceSinglestep { + clearSinglestep = !t.Arch().SingleStep() + t.Arch().SetSingleStep() + } + t.tg.pidns.owner.mu.RUnlock() + } + + t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) + info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + + if clearSinglestep { + t.Arch().ClearSingleStep() + } + + switch err { + case nil: + // Handle application system call. + return t.doSyscall() + + case platform.ErrContextInterrupt: + // Interrupted by platform.Context.Interrupt(). Re-enter the run + // loop to figure out why. + return (*runApp)(nil) + + case platform.ErrContextSignal: + // Looks like a signal has been delivered to us. If it's a synchronous + // signal (SEGV, SIGBUS, etc.), it should be sent to the application + // thread that received it. + sig := linux.Signal(info.Signo) + + // Was it a fault that we should handle internally? If so, this wasn't + // an application-generated signal and we should continue execution + // normally. + if at.Any() { + addr := usermem.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + if err == nil { + // The fault was handled appropriately. + // We can resume running the application. + return (*runApp)(nil) + } + + // Is this a vsyscall that we need emulate? + if at.Execute { + if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + return t.doVsyscall(addr, sysno) + } + } + + // The JVM will trigger these errors constantly, so don't + // spam logs with this error. + if err == syserror.EFAULT || err == syserror.EPERM { + t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + } else { + t.Warningf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + } + t.DebugDumpState() + + // Continue to signal handling. + // + // Convert a BusError error to a SIGBUS from a SIGSEGV. All + // other info bits stay the same (address, etc.). + if _, ok := err.(*memmap.BusError); ok { + sig = linux.SIGBUS + info.Signo = int32(linux.SIGBUS) + } + } + + switch sig { + case linux.SIGILL: + // N.B. The debug stuff here is arguably + // expensive. Don't fret. This gets called + // about 5 times for a typical application, if + // that. + t.Debugf("SIGILL @ %x", t.Arch().IP()) + + // Is this a CPUID instruction? 
+ expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + break + } + + // Treat it like any other synchronous signal. + fallthrough + + case linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: + // Synchronous signal. Send it to ourselves. Assume the signal is + // legitimate and force it (work around the signal being ignored or + // blocked) like Linux does. Conveniently, this is even the correct + // behavior for SIGTRAP from single-stepping. + t.forceSignal(linux.Signal(sig), false /* unconditional */) + t.SendSignal(info) + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case linux.SIGPROF: + // It's a profiling interrupt: there's not much + // we can do. We've already paid a decent cost + // by intercepting the signal, at this point we + // simply ignore it. + + default: + // Asynchronous signal. Let the system deal with it. + t.k.sendExternalSignal(info, "application") + } + + return (*runApp)(nil) + + case platform.ErrContextCPUPreempted: + // Ensure that RSEQ critical sections are interrupted and per-thread + // CPU values are updated before the next platform.Context.Switch(). + t.rseqPreempted = true + return (*runApp)(nil) + + default: + // What happened? Can't continue. + t.Warningf("Unexpected SwitchToApp error: %v", err) + t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)}) + return (*runExit)(nil) + } +} + +// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. +func (t *Task) waitGoroutineStoppedOrExited() { + t.goroutineStopped.Wait() +} + +// WaitExited blocks until all task goroutines in tg have exited. +// +// WaitExited does not correspond to anything in Linux; it's provided so that +// external callers of Kernel.CreateProcess can wait for the created thread +// group to terminate. +func (tg *ThreadGroup) WaitExited() { + tg.liveGoroutines.Wait() +} + +// Yield yields the processor for the calling task. +func (t *Task) Yield() { + atomic.AddUint64(&t.yieldCount, 1) + runtime.Gosched() +} diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go new file mode 100644 index 000000000..b50139077 --- /dev/null +++ b/pkg/sentry/kernel/task_sched.go @@ -0,0 +1,329 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// CPU scheduling, real and fake. 
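+//
+// This file does not implement an actual scheduler; it tracks just enough
+// state (TaskGoroutineSchedInfo) to report CPU usage and
+// /proc/[pid]/status-style scheduling state, and to emulate CPU affinity and
+// a virtualized CPU number for the application.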
+ +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskGoroutineState is a coarse representation of the current execution +// status of a kernel.Task goroutine. +type TaskGoroutineState int + +const ( + // TaskGoroutineNonexistent indicates that the task goroutine has either + // not yet been created by Task.Start() or has returned from Task.run(). + // This must be the zero value for TaskGoroutineState. + TaskGoroutineNonexistent TaskGoroutineState = iota + + // TaskGoroutineRunningSys indicates that the task goroutine is executing + // sentry code. + TaskGoroutineRunningSys + + // TaskGoroutineRunningApp indicates that the task goroutine is executing + // application code. + TaskGoroutineRunningApp + + // TaskGoroutineBlockedInterruptible indicates that the task goroutine is + // blocked in Task.block(), and hence may be woken by Task.interrupt() + // (e.g. due to signal delivery). + TaskGoroutineBlockedInterruptible + + // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is + // stopped outside of Task.block() and Task.doStop(), and hence cannot be + // woken by Task.interrupt(). + TaskGoroutineBlockedUninterruptible + + // TaskGoroutineStopped indicates that the task goroutine is blocked in + // Task.doStop(). TaskGoroutineStopped is similar to + // TaskGoroutineBlockedUninterruptible, but is a separate state to make it + // possible to determine when Task.stop is meaningful. + TaskGoroutineStopped +) + +// TaskGoroutineSchedInfo contains task goroutine scheduling state which must +// be read and updated atomically. +type TaskGoroutineSchedInfo struct { + // Timestamp was the value of Kernel.cpuClock when this + // TaskGoroutineSchedInfo was last updated. + Timestamp uint64 + + // State is the current state of the task goroutine. + State TaskGoroutineState + + // UserTicks is the amount of time the task goroutine has spent executing + // its associated Task's application code, in units of linux.ClockTick. + UserTicks uint64 + + // SysTicks is the amount of time the task goroutine has spent executing in + // the sentry, in units of linux.ClockTick. + SysTicks uint64 +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.gosched.State = state + t.goschedSeq.EndWrite() +} + +// Preconditions: The caller must be running on the task goroutine, and leaving +// a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). +func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != state { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. 
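+	// Only time spent executing application code is charged to UserTicks
+	// here; time spent in the sentry is charged to SysTicks by
+	// accountTaskGoroutineEnter, and time spent blocked or stopped is
+	// charged to neither.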
+ if state == TaskGoroutineRunningApp { + t.gosched.UserTicks += now - t.gosched.Timestamp + } + t.gosched.Timestamp = now + t.gosched.State = TaskGoroutineRunningSys + t.goschedSeq.EndWrite() +} + +// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. +// Most clients should use t.CPUStats() instead. +func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { + return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) +} + +// CPUStats returns the CPU usage statistics of t. +func (t *Task) CPUStats() usage.CPUStats { + return t.cpuStatsAt(t.k.CPUClockNow()) +} + +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making +// the returned stats non-monotonic. +func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { + tsched := t.TaskGoroutineSchedInfo() + if tsched.Timestamp < now { + // Update stats to reflect execution since the last update to + // t.gosched. + switch tsched.State { + case TaskGoroutineRunningSys: + tsched.SysTicks += now - tsched.Timestamp + case TaskGoroutineRunningApp: + tsched.UserTicks += now - tsched.Timestamp + } + } + return usage.CPUStats{ + UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)), + VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), + } +} + +// CPUStats returns the combined CPU usage statistics of all past and present +// threads in tg. +func (tg *ThreadGroup) CPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + // Hack to get a pointer to the Kernel. + if tg.leader == nil { + // Per comment on tg.leader, this is only possible if nothing in the + // ThreadGroup has ever executed anyway. + return usage.CPUStats{} + } + now := tg.leader.k.CPUClockNow() + stats := tg.exitedCPUStats + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + stats.Accumulate(t.cpuStatsAt(now)) + } + return stats +} + +// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return +// resource usage statistics for all children of [tg] that have terminated and +// been waited for. These statistics will include the resources used by +// grandchildren, and further removed descendants, if all of the intervening +// descendants waited on their terminated children." +func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.childCPUStats +} + +// StateStatus returns a string representation of the task's current state, +// appropriate for /proc/[pid]/status. +func (t *Task) StateStatus() string { + switch s := t.TaskGoroutineSchedInfo().State; s { + case TaskGoroutineNonexistent: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.exitState { + case TaskExitZombie: + return "Z (zombie)" + case TaskExitDead: + return "X (dead)" + default: + // The task goroutine can't exit before passing through + // runExitNotify, so this indicates that the task has been created, + // but the task goroutine hasn't yet started. 
The Linux equivalent + // is struct task_struct::state == TASK_NEW + // (kernel/fork.c:copy_process() => + // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is + // masked out by TASK_REPORT for /proc/[pid]/status, leaving only + // TASK_RUNNING. + return "R (running)" + } + case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + return "R (running)" + case TaskGoroutineBlockedInterruptible: + return "S (sleeping)" + case TaskGoroutineStopped: + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + switch t.stop.(type) { + case *groupStop: + return "T (stopped)" + case *ptraceStop: + return "t (tracing stop)" + } + fallthrough + case TaskGoroutineBlockedUninterruptible: + // This is the name Linux uses for TASK_UNINTERRUPTIBLE and + // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): + // fs/proc/array.c:task_state_array. + return "D (disk sleep)" + default: + panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) + } +} + +// CPUMask returns a copy of t's allowed CPU mask. +func (t *Task) CPUMask() sched.CPUSet { + t.mu.Lock() + defer t.mu.Unlock() + return t.allowedCPUMask.Copy() +} + +// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of +// mask. +// +// Preconditions: mask.Size() == +// sched.CPUSetSize(t.Kernel().ApplicationCores()). +func (t *Task) SetCPUMask(mask sched.CPUSet) error { + if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { + panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) + } + + // Remove CPUs in mask above Kernel.applicationCores. + mask.ClearAbove(t.k.applicationCores) + + // Ensure that at least 1 CPU is still allowed. + if mask.NumCPUs() == 0 { + return syserror.EINVAL + } + + if t.k.useHostCores { + // No-op; pretend the mask was immediately changed back. + return nil + } + + t.tg.pidns.owner.mu.RLock() + rootTID := t.tg.pidns.owner.Root.tids[t] + t.tg.pidns.owner.mu.RUnlock() + + t.mu.Lock() + defer t.mu.Unlock() + t.allowedCPUMask = mask + atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) + return nil +} + +// CPU returns the cpu id for a given task. +func (t *Task) CPU() int32 { + if t.k.useHostCores { + return int32(hostcpu.GetCPU()) + } + + return atomic.LoadInt32(&t.cpu) +} + +// assignCPU returns the virtualized CPU number for the task with global TID +// tid and allowedCPUMask allowed. +func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { + // To pretend that threads are evenly distributed to allowed CPUs, choose n + // to be less than the number of CPUs in allowed ... + n := int(tid) % int(allowed.NumCPUs()) + // ... then pick the nth CPU in allowed. + allowed.ForEachCPU(func(c uint) { + if n--; n == 0 { + cpu = int32(c) + } + }) + return cpu +} + +// Niceness returns t's niceness. +func (t *Task) Niceness() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness +} + +// Priority returns t's priority. +func (t *Task) Priority() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness + 20 +} + +// SetNiceness sets t's niceness to n. +func (t *Task) SetNiceness(n int) { + t.mu.Lock() + defer t.mu.Unlock() + t.niceness = n +} + +// NumaPolicy returns t's current numa policy. +func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + return t.numaPolicy, t.numaNodeMask +} + +// SetNumaPolicy sets t's numa policy. 
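+// The policy and node mask are recorded as-is on the task and reported back
+// by NumaPolicy; no validation is performed here.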
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + t.numaPolicy = policy + t.numaNodeMask = nodeMask +} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go new file mode 100644 index 000000000..2340256b0 --- /dev/null +++ b/pkg/sentry/kernel/task_signals.go @@ -0,0 +1,1056 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file defines the behavior of task signal handling. + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SignalAction is an internal signal action. +type SignalAction int + +// Available signal actions. +// Note that although we refer the complete set internally, +// the application is only capable of using the Default and +// Ignore actions from the system call interface. +const ( + SignalActionTerm SignalAction = iota + SignalActionCore + SignalActionStop + SignalActionIgnore + SignalActionHandler +) + +// Default signal handler actions. Note that for most signals, +// (except SIGKILL and SIGSTOP) these can be overridden by the app. +var defaultActions = map[linux.Signal]SignalAction{ + // POSIX.1-1990 standard. + linux.SIGHUP: SignalActionTerm, + linux.SIGINT: SignalActionTerm, + linux.SIGQUIT: SignalActionCore, + linux.SIGILL: SignalActionCore, + linux.SIGABRT: SignalActionCore, + linux.SIGFPE: SignalActionCore, + linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects + linux.SIGSEGV: SignalActionCore, + linux.SIGPIPE: SignalActionTerm, + linux.SIGALRM: SignalActionTerm, + linux.SIGTERM: SignalActionTerm, + linux.SIGUSR1: SignalActionTerm, + linux.SIGUSR2: SignalActionTerm, + linux.SIGCHLD: SignalActionIgnore, + linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects + linux.SIGSTOP: SignalActionStop, + linux.SIGTSTP: SignalActionStop, + linux.SIGTTIN: SignalActionStop, + linux.SIGTTOU: SignalActionStop, + // POSIX.1-2001 standard. + linux.SIGBUS: SignalActionCore, + linux.SIGPROF: SignalActionTerm, + linux.SIGSYS: SignalActionCore, + linux.SIGTRAP: SignalActionCore, + linux.SIGURG: SignalActionIgnore, + linux.SIGVTALRM: SignalActionTerm, + linux.SIGXCPU: SignalActionCore, + linux.SIGXFSZ: SignalActionCore, + // The rest on linux. + linux.SIGSTKFLT: SignalActionTerm, + linux.SIGIO: SignalActionTerm, + linux.SIGPWR: SignalActionTerm, + linux.SIGWINCH: SignalActionIgnore, +} + +// computeAction figures out what to do given a signal number +// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and SIGKILL always results in a SignalActionTerm. +// Signal 0 is always ignored as many programs use it for various internal functions +// and don't expect it to do anything. 
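+// (Signal 0 is the null signal used by kill(2) and friends purely to probe
+// for a process's existence and the caller's permission to signal it.)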
+// +// In the event the signal is not one of these, act.Handler determines what +// happens next. +// If act.Handler is: +// 0, the default action is taken; +// 1, the signal is ignored; +// anything else, the function returns SignalActionHandler. +func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { + switch sig { + case linux.SIGSTOP: + return SignalActionStop + case linux.SIGKILL: + return SignalActionTerm + case linux.Signal(0): + return SignalActionIgnore + } + + switch act.Handler { + case arch.SignalActDefault: + return defaultActions[sig] + case arch.SignalActIgnore: + return SignalActionIgnore + default: + return SignalActionHandler + } +} + +// UnblockableSignals contains the set of signals which cannot be blocked. +var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) + +// StopSignals is the set of signals whose default action is SignalActionStop. +var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) + +// dequeueSignalLocked returns a pending unmasked signal. If there are no +// pending unmasked signals, dequeueSignalLocked returns nil. +// +// Preconditions: t.tg.signalHandlers.mu must be locked. +func (t *Task) dequeueSignalLocked() *arch.SignalInfo { + if info := t.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + return info + } + if info := t.tg.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + return info + } + return nil +} + +// TakeSignal returns a pending signal not blocked by mask. Signal handlers are +// not affected. If there are no pending signals not blocked by mask, +// TakeSignal returns a nil SignalInfo. +func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.pendingSignals.dequeue(mask); info != nil { + return info + } + if info := t.tg.pendingSignals.dequeue(mask); info != nil { + return info + } + return nil +} + +// discardSpecificLocked removes all instances of the given signal from all +// signal queues in tg. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { + tg.pendingSignals.discardSpecific(sig) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.pendingSignals.discardSpecific(sig) + } +} + +// PendingSignals returns the set of pending signals. +func (t *Task) PendingSignals() linux.SignalSet { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet +} + +// deliverSignal delivers the given signal and returns the following run state. +func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { + sigact := computeAction(linux.Signal(info.Signo), act) + + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + // Signals that are ignored, cause a thread group stop, or + // terminate the thread group do not interact with interrupted + // syscalls; in Linux terms, they are never returned to the signal + // handling path from get_signal => get_signal_to_deliver. The + // behavior of an interrupted syscall is determined by the first + // signal that is actually handled (by userspace). 
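+			// Hence only SignalActionHandler resolves the restart decision
+			// here: ERESTARTSYS is restarted only if the handler was
+			// installed with SA_RESTART, while ERESTARTNOHAND and
+			// ERESTART_RESTARTBLOCK always turn into EINTR once a handler
+			// runs.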
+ if sigact == SignalActionHandler { + switch { + case sre == ERESTARTNOHAND: + fallthrough + case sre == ERESTART_RESTARTBLOCK: + fallthrough + case (sre == ERESTARTSYS && !act.IsRestart()): + t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1))) + default: + t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().RestartSyscall() + } + } + } + } + + switch sigact { + case SignalActionTerm, SignalActionCore: + // "Default action is to terminate the process." - signal(7) + t.Debugf("Signal %d: terminating thread group", info.Signo) + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + return (*runExit)(nil) + + case SignalActionStop: + // "Default action is to stop the process." + t.initiateGroupStop(info) + + case SignalActionIgnore: + // "Default action is to ignore the signal." + t.Debugf("Signal %d: ignored", info.Signo) + + case SignalActionHandler: + // Try to deliver the signal to the user-configured handler. + t.Debugf("Signal %d: delivering to handler", info.Signo) + if err := t.deliverSignalToHandler(info, act); err != nil { + t.Warningf("Failed to deliver signal %+v to user handler: %v", info, err) + // Send a forced SIGSEGV. If the signal that couldn't be delivered + // was a SIGSEGV, force the handler to SIG_DFL. + t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + } + + default: + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + } + return (*runInterrupt)(nil) +} + +// deliverSignalToHandler changes the task's userspace state to enter the given +// user-configured handler for the given signal. +func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { + // Signal delivery to an application handler interrupts restartable + // sequences. + t.rseqInterrupt() + + // Are executing on the main stack, + // or the provided alternate stack? + sp := usermem.Addr(t.Arch().Stack()) + + // N.B. This is a *copy* of the alternate stack that the user's signal + // handler expects to see in its ucontext (even if it's not in use). + alt := t.signalStack + if act.IsOnStack() && alt.IsEnabled() { + alt.SetOnStack() + if !t.OnSignalStack(alt) { + sp = usermem.Addr(alt.Top()) + } + } + + // Set up the signal handler. If we have a saved signal mask, the signal + // handler should run with the current mask, but sigreturn should restore + // the saved one. + st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + mask := t.tr.SignalMask + if t.haveSavedSignalMask { + mask = t.savedSignalMask + } + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { + return err + } + t.haveSavedSignalMask = false + + // Add our signal mask. + newMask := t.tr.SignalMask | act.Mask + if !act.IsNoDefer() { + newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) + } + t.SetSignalMask(newMask) + + return nil +} + +var ctrlResume = &SyscallControl{ignoreReturn: true} + +// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if +// rt is true). +func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { + st := t.Stack() + sigset, err := t.Arch().SignalRestore(st, rt) + if err != nil { + return nil, err + } + + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. 
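+	// The mask comes from the untrusted signal frame, so silently drop the
+	// unblockable signals rather than failing, as Linux does.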
+	t.SetSignalMask(sigset &^ UnblockableSignals)
+
+	// TODO: sys_rt_sigreturn also calls restore_altstack from
+	// uc.stack, allowing the signal handler to implicitly mutate the signal
+	// stack.
+
+	return ctrlResume, nil
+}
+
+// SendSignal sends the given signal to t.
+//
+// The following errors may be returned:
+//
+//	syserror.ESRCH - The task has exited.
+//	syserror.EINVAL - The signal is not valid.
+//	syserror.EAGAIN - The signal is realtime, and cannot be queued.
+//
+func (t *Task) SendSignal(info *arch.SignalInfo) error {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	return t.sendSignalLocked(info, false /* group */)
+}
+
+// SendGroupSignal sends the given signal to t's thread group.
+func (t *Task) SendGroupSignal(info *arch.SignalInfo) error {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	return t.sendSignalLocked(info, true /* group */)
+}
+
+// SendSignal sends the given signal to tg, using tg's leader to determine if
+// the signal is blocked.
+func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+	return tg.leader.sendSignalLocked(info, true /* group */)
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) onCPULocked(includeSys bool) bool {
+	// Task is exiting.
+	if t.exitState != TaskExitNone {
+		return false
+	}
+
+	switch t.TaskGoroutineSchedInfo().State {
+	case TaskGoroutineRunningSys:
+		return includeSys
+	case TaskGoroutineRunningApp:
+		return true
+	default:
+		return false
+	}
+}
+
+// SendTimerSignal mimics the process timer signal delivery behavior in Linux:
+// signals are delivered to the thread that triggers the timer expiration (see
+// kernel/time/posix-cpu-timers.c:check_process_timers()). This means that
+// 1) the thread is running on a CPU at the time, and
+// 2) a thread that runs more frequently will get more of those signals.
+//
+// We approximate this behavior by selecting a running task in a round-robin
+// fashion. Statistically, a thread running more often should have a higher
+// probability of being selected.
+func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// Find the next running thread.
+	var t *Task
+	if tg.lastTimerSignalTask == nil {
+		t = tg.tasks.Front()
+	} else {
+		t = tg.lastTimerSignalTask.Next()
+	}
+
+	// Iterate from lastTimerSignalTask.Next() to the last task in the task list.
+	for t != nil {
+		if t.onCPULocked(includeSys) {
+			tg.lastTimerSignalTask = t
+			return t.sendSignalLocked(info, true /* group */)
+		}
+		t = t.Next()
+	}
+
+	// t is nil when we reach here. If lastTimerSignalTask is not nil, iterate
+	// from Front to lastTimerSignalTask.
+	if tg.lastTimerSignalTask != nil {
+		for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() {
+			if t.onCPULocked(includeSys) {
+				tg.lastTimerSignalTask = t
+				return t.sendSignalLocked(info, true /* group */)
+			}
+		}
+	}
+
+	// No running threads? Just try the leader.
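+	// The signal is still enqueued as a group signal, so it will be handled
+	// by whichever task next dequeues it with the signal unblocked.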
+ tg.lastTimerSignalTask = tg.leader + return tg.leader.sendSignalLocked(info, true /* group */) +} + +func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + if t.exitState == TaskExitDead { + return syserror.ESRCH + } + sig := linux.Signal(info.Signo) + if sig == 0 { + return nil + } + if !sig.IsValid() { + return syserror.EINVAL + } + + // Signal side effects apply even if the signal is ultimately discarded. + t.tg.applySignalSideEffectsLocked(sig) + + // TODO: "Only signals for which the "init" process has established a + // signal handler can be sent to the "init" process by other members of the + // PID namespace. This restriction applies even to privileged processes, + // and prevents other members of the PID namespace from accidentally + // killing the "init" process." - pid_namespaces(7). We don't currently do + // this for child namespaces, though we should; we also don't do this for + // the root namespace (the same restriction applies to global init on + // Linux), where whether or not we should is much murkier. In practice, + // most sandboxed applications are not prepared to function as an init + // process. + + // Unmasked, ignored signals are discarded without being queued, unless + // they will be visible to a tracer. Even for group signals, it's the + // originally-targeted task's signal mask and tracer that matter; compare + // Linux's kernel/signal.c:__send_signal() => prepare_signal() => + // sig_ignored(). + ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore + if linux.SignalSetOf(sig)&t.tr.SignalMask == 0 && ignored && !t.hasTracer() { + t.Debugf("Discarding ignored signal %d", sig) + return nil + } + + q := &t.pendingSignals + if group { + q = &t.tg.pendingSignals + } + if !q.enqueue(info) { + if sig.IsRealtime() { + return syserror.EAGAIN + } + t.Debugf("Discarding duplicate signal %d", sig) + return nil + } + + // Find a receiver to notify. Note that the task we choose to notify, if + // any, may not be the task that actually dequeues and handles the signal; + // e.g. a racing signal mask change may cause the notified task to become + // ineligible, or a racing sibling task may dequeue the signal first. + if t.canReceiveSignalLocked(sig) { + t.Debugf("Notified of signal %d", sig) + t.interrupt() + return nil + } + if group { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.Debugf("Notified of group signal %d", sig) + nt.interrupt() + return nil + } + } + t.Debugf("No task notified of signal %d", sig) + return nil +} + +func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { + switch { + case linux.SignalSetOf(sig)&StopSignals != 0: + // Stop signals cause all prior SIGCONT to be discarded. (This is + // despite the fact this has little effect since SIGCONT's most + // important effect is applied when the signal is sent in the branch + // below, not when the signal is delivered.) + tg.discardSpecificLocked(linux.SIGCONT) + case sig == linux.SIGCONT: + // "The SIGCONT signal has a side effect of waking up (all threads of) + // a group-stopped process. This side effect happens before + // signal-delivery-stop. The tracer can't suppress this side effect (it + // can only suppress signal injection, which only causes the SIGCONT + // handler to not be executed in the tracee, if such a handler is + // installed." - ptrace(2) + tg.endGroupStopLocked(true) + case sig == linux.SIGKILL: + // "SIGKILL does not generate signal-delivery-stop and therefore the + // tracer can't suppress it. 
SIGKILL kills even within system calls + // (syscall-exit-stop is not generated prior to death by SIGKILL)." - + // ptrace(2) + // + // Note that this differs from ThreadGroup.requestExit in that it + // ignores tg.execing. + if !tg.exiting { + tg.exiting = true + tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + } + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.killLocked() + } + } +} + +// canReceiveSignalLocked returns true if t should be interrupted to receive +// the given signal. canReceiveSignalLocked is analogous to Linux's +// kernel/signal.c:wants_signal(), but see below for divergences. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // - Do not choose tasks that are blocking the signal. + if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + return false + } + // - No need to check Task.exitState, as the exit path sets every bit in the + // signal mask when it transitions from TaskExitNone to TaskExitInitiated. + // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the + // task group via applySignalSideEffects => killLocked. + // - Do not choose stopped tasks, which cannot handle signals. + if t.stop != nil { + return false + } + // - TODO: No special case for when t is also the sending task, + // because the identity of the sender is unknown. + // - Do not choose tasks that have already been interrupted, as they may be + // busy handling another signal. + if len(t.interruptChan) != 0 { + return false + } + return true +} + +// findSignalReceiverLocked returns a task in tg that should be interrupted to +// receive the given signal. If no such task exists, findSignalReceiverLocked +// returns nil. +// +// Linux actually records curr_target to balance the group signal targets. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.canReceiveSignalLocked(sig) { + return t + } + } + return nil +} + +// forceSignal ensures that the task is not ignoring or blocking the given +// signal. If unconditional is true, forceSignal takes action even if the +// signal isn't being ignored or blocked. +func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.forceSignalLocked(sig, unconditional) +} + +func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { + blocked := linux.SignalSetOf(sig)&t.tr.SignalMask != 0 + act := t.tg.signalHandlers.actions[sig] + ignored := act.Handler == arch.SignalActIgnore + if blocked || ignored || unconditional { + act.Handler = arch.SignalActDefault + t.tg.signalHandlers.actions[sig] = act + if blocked { + t.setSignalMaskLocked(t.tr.SignalMask &^ linux.SignalSetOf(sig)) + } + } +} + +// SignalMask returns a copy of t's signal mask. +func (t *Task) SignalMask() linux.SignalSet { + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.tr.SignalMask))) +} + +// SetSignalMask sets t's signal mask. +// +// Preconditions: SetSignalMask can only be called by the task goroutine. +// t.exitState < TaskExitZombie. +func (t *Task) SetSignalMask(mask linux.SignalSet) { + // By precondition, t prevents t.tg from completing an execve and mutating + // t.tg.signalHandlers, so we can skip the TaskSet mutex. 
+ t.tg.signalHandlers.mu.Lock() + t.setSignalMaskLocked(mask) + t.tg.signalHandlers.mu.Unlock() +} + +// Preconditions: The signal mutex must be locked. +func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { + oldMask := t.tr.SignalMask + atomic.StoreUint64((*uint64)(&t.tr.SignalMask), uint64(mask)) + + // If the new mask blocks any signals that were not blocked by the old + // mask, and at least one such signal is pending in tg.pendingSignals, and + // t has been woken, it could be the case that t was woken to handle that + // signal, but will no longer do so as a result of its new signal mask, so + // we have to pick a replacement. + blocked := mask &^ oldMask + blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet + if blockedGroupPending != 0 && t.interrupted() { + linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.interrupt() + return + } + }) + // We have to re-issue the interrupt consumed by t.interrupted() since + // it might have been for a different reason. + t.interruptSelf() + } + + // Conversely, if the new mask unblocks any signals that were blocked by + // the old mask, and at least one such signal is pending, we may now need + // to handle that signal. + unblocked := oldMask &^ mask + unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) + if unblockedPending != 0 { + t.interruptSelf() + } +} + +// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's +// comment). +// +// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { + t.savedSignalMask = mask + t.haveSavedSignalMask = true +} + +// SignalStack returns the task-private signal stack. +func (t *Task) SignalStack() arch.SignalStack { + return t.signalStack +} + +// OnSignalStack returns true if, when the task resumes running, it will run on +// the task-private signal stack. +func (t *Task) OnSignalStack(s arch.SignalStack) bool { + sp := usermem.Addr(t.Arch().Stack()) + return usermem.Addr(s.Addr) <= sp && sp < usermem.Addr(s.Addr+s.Size) +} + +// SetSignalStack sets the task-private signal stack and clears the +// SignalStackFlagDisable, since we have a signal stack. +func (t *Task) SetSignalStack(alt arch.SignalStack) error { + // Mask out irrelevant parts: only disable matters. + alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + return nil +} + +// SetSignalAct atomically sets the thread group's signal action for signal sig +// to *actptr (if actptr is not nil) and returns the old signal action. +func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { + if !sig.IsValid() { + return arch.SignalAct{}, syserror.EINVAL + } + + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + sh := tg.signalHandlers + sh.mu.Lock() + defer sh.mu.Unlock() + oldact := sh.actions[sig] + if actptr != nil { + if sig == linux.SIGKILL || sig == linux.SIGSTOP { + return oldact, syserror.EINVAL + } + + act := *actptr + act.Mask &^= UnblockableSignals + sh.actions[sig] = act + // From POSIX, by way of Linux: + // + // "Setting a signal action to SIG_IGN for a signal that is pending + // shall cause the pending signal to be discarded, whether or not it is + // blocked." 
+ // + // "Setting a signal action to SIG_DFL for a signal that is pending and + // whose default action is to ignore the signal (for example, SIGCHLD), + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + if computeAction(sig, act) == SignalActionIgnore { + tg.discardSpecificLocked(sig) + } + } + return oldact, nil +} + +// CopyOutSignalAct converts the given SignalAct into an architecture-specific +// type and then copies it out to task memory. +func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { + n := t.Arch().NewSignalAct() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalAct copies an architecture-specific sigaction type from task +// memory and then converts it into a SignalAct. +func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { + n := t.Arch().NewSignalAct() + var s arch.SignalAct + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// CopyOutSignalStack converts the given SignalStack into an +// architecture-specific type and then copies it out to task memory. +func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { + n := t.Arch().NewSignalStack() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalStack copies an architecture-specific stack_t from task memory +// and then converts it into a SignalStack. +func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { + n := t.Arch().NewSignalStack() + var s arch.SignalStack + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// groupStop is a TaskStop placed on tasks that have received a stop signal +// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from +// the ptrace man page.) +type groupStop struct{} + +// Killable implements TaskStop.Killable. +func (*groupStop) Killable() bool { return true } + +type groupStopPhase int + +const ( + // groupStopNone indicates that a thread group is not in, or attempting to + // enter or leave, a group stop. + groupStopNone groupStopPhase = iota + + // groupStopDequeued indicates that at least one task in a thread group has + // dequeued a stop signal (or dequeued any signal and entered a + // signal-delivery-stop as a result, which allows ptrace to change the + // signal into a stop signal), but temporarily dropped the signal mutex + // without initiating the group stop. + // + // groupStopDequeued is analogous to JOBCTL_STOP_DEQUEUED in Linux. + groupStopDequeued + + // groupStopInitiated indicates that a task in a thread group has initiated + // a group stop, but not all tasks in the thread group have acknowledged + // entering the group stop. + // + // groupStopInitiated is represented by JOBCTL_STOP_PENDING && + // !SIGNAL_STOP_STOPPED in Linux. + groupStopInitiated + + // groupStopComplete indicates that all tasks in a thread group have + // acknowledged entering the group stop, and the last one to do so has + // notified the thread group's parent. + // + // groupStopComplete is represented by JOBCTL_STOP_PENDING && + // SIGNAL_STOP_STOPPED in Linux. + groupStopComplete +) + +// initiateGroupStop attempts to initiate a group stop based on a +// previously-dequeued stop signal. +// +// Preconditions: The caller must be running on the task goroutine. 
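+// The caller must have already dequeued the stop signal and set
+// tg.groupStopPhase to groupStopDequeued; if the phase has since changed, or
+// the thread group is exiting or execing, the stop is dropped.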
+func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.tg.groupStopPhase != groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing signal", info.Signo) + return + } + if t.tg.exiting { + t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) + return + } + if t.tg.execing != nil { + t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) + return + } + t.Debugf("Signal %d: stopping thread group", info.Signo) + t.tg.groupStopPhase = groupStopInitiated + t.tg.groupStopSignal = linux.Signal(info.Signo) + t.tg.groupStopCount = 0 + for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { + t2.groupStopRequired = true + t2.groupStopAcknowledged = false + t2.interrupt() + } +} + +// endGroupStopLocked ensures that all prior stop signals received by tg are +// not stopping tg and will not stop tg in the future. If broadcast is true, +// parent and tracer notification will be scheduled if appropriate. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { + // Discard all previously-queued stop signals. + linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) + + if tg.groupStopPhase != groupStopNone { + tg.leader.Debugf("Ending group stop currently in phase %d", tg.groupStopPhase) + if tg.groupStopPhase == groupStopInitiated || tg.groupStopPhase == groupStopComplete { + tg.groupStopSignal = 0 + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() + } + } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so + // that one of the continuing tasks does so. (Linux does + // something similar.) The reason we do this is to keep locking + // sane. In order to send a signal to the parent, we need to + // lock its signal mutex, but we're already holding tg's signal + // mutex, and the TaskSet mutex must be locked for writing for + // us to hold two signal mutexes. Since we don't want to + // require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by + // releasing the mutexes we're already holding, just let the + // continuing thread group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = tg.groupStopPhase == groupStopInitiated + tg.groupContWaitable = true + } + } + // If groupStopPhase was groupStopDequeued, setting it to groupStopNone + // will cause following calls to initiateGroupStop to recognize that + // the group stop has been cancelled. + tg.groupStopPhase = groupStopNone + } +} + +// signalStop sends a signal to t's thread group of a new group stop, group +// continue, or ptrace stop, if appropriate. code and status are set in the +// signal sent to tg, if any. +// +// Preconditions: The TaskSet mutex must be locked (for reading or writing). 
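+// The SIGCHLD is suppressed if t's SIGCHLD handler is SIG_IGN or was
+// installed with SA_NOCLDSTOP; compare Linux's
+// kernel/signal.c:do_notify_parent_cldstop().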
+func (t *Task) signalStop(target *Task, code int32, status int32) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] + if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { + sigchld := &arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + Code: code, + } + sigchld.SetPid(int32(t.tg.pidns.tids[target])) + sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetStatus(status) + // TODO: Set utime, stime. + t.sendSignalLocked(sigchld, true /* group */) + } +} + +// The runInterrupt state handles conditions indicated by interrupts. +type runInterrupt struct{} + +func (*runInterrupt) execute(t *Task) taskRunState { + // Interrupts are de-duplicated (if t is interrupted twice before + // t.interrupted() is called, t.interrupted() will only return true once), + // so early exits from this function must re-enter the runInterrupt state + // to check for more interrupt-signaled conditions. + + t.tg.signalHandlers.mu.Lock() + + // Did we just leave a group stop? + if t.tg.groupContNotify { + t.tg.groupContNotify = false + sig := t.tg.groupStopSignal + intr := t.tg.groupContInterrupted + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.RLock() + // For consistency with Linux, if the parent and (thread group + // leader's) tracer are in the same thread group, deduplicate + // notifications. + notifyParent := t.tg.leader.parent != nil + if tracer := t.tg.leader.ptraceTracer.Load().(*Task); tracer != nil { + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + } + // Sending CLD_STOPPED to the tracer doesn't really make any sense; + // the thread group leader may have already entered the stop and + // notified its tracer accordingly. But it's consistent with + // Linux... + if intr { + tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + if !notifyParent { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) + } else { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) + } + } else { + tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.tg.eventQueue.Notify(EventGroupContinue) + } + } + if notifyParent { + // If groupContInterrupted, do as Linux does and pretend the group + // stop completed just before it ended. The theoretical behavior in + // this case would be to send a SIGCHLD indicating the completed + // stop, followed by a SIGCHLD indicating the continue. However, + // SIGCHLD is a standard signal, so the latter would always be + // dropped. Hence sending only the former is equivalent. + if intr { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) + } else { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) + } + } + t.tg.pidns.owner.mu.RUnlock() + return (*runInterrupt)(nil) + } + + // Do we need to enter a group stop? 
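+	// If so, acknowledge it, notify the tracer and/or parent once the last
+	// task in the thread group has acknowledged, and enter a groupStop
+	// (unless we have already been killed).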
+ if t.groupStopRequired { + t.groupStopRequired = false + sig := t.tg.groupStopSignal + notifyParent := false + if !t.groupStopAcknowledged { + t.groupStopAcknowledged = true + t.tg.groupStopCount++ + if t.tg.groupStopCount == t.tg.activeTasks { + t.Debugf("Completing group stop") + notifyParent = true + t.tg.groupStopPhase = groupStopComplete + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + } + } + // Drop the signal mutex so we can take the TaskSet mutex. + t.tg.signalHandlers.mu.Unlock() + + t.tg.pidns.owner.mu.RLock() + if t.tg.leader.parent == nil { + notifyParent = false + } + if tracer := t.Tracer(); tracer != nil { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + // For consistency with Linux, if the parent and tracer are in the + // same thread group, deduplicate notification signals. + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) + } else { + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + } + } else { + t.tg.signalHandlers.mu.Lock() + if !t.killedLocked() { + t.beginInternalStopLocked((*groupStop)(nil)) + } + t.tg.signalHandlers.mu.Unlock() + } + if notifyParent { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + t.tg.pidns.owner.mu.RUnlock() + + return (*runInterrupt)(nil) + } + + // Are there signals pending? + if info := t.dequeueSignalLocked(); info != nil { + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone { + // Indicate that we've dequeued a stop signal before + // unlocking the signal mutex; initiateGroupStop will check + // that the phase hasn't changed (or is at least another + // "stop signal dequeued" phase) after relocking it. + t.tg.groupStopPhase = groupStopDequeued + } + if t.ptraceSignalLocked(info) { + // Dequeueing the signal action must wait until after the + // signal-delivery-stop ends since the tracer can change or + // suppress the signal. + t.tg.signalHandlers.mu.Unlock() + return (*runInterruptAfterSignalDeliveryStop)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) + } + + t.tg.signalHandlers.mu.Unlock() + return (*runApp)(nil) +} + +type runInterruptAfterSignalDeliveryStop struct{} + +func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + // Can't defer unlock: deliverSignal must be called without holding TaskSet + // mutex. + sig := linux.Signal(t.ptraceCode) + defer func() { + t.ptraceSiginfo = nil + }() + if !sig.IsValid() { + t.tg.pidns.owner.mu.Unlock() + return (*runInterrupt)(nil) + } + info := t.ptraceSiginfo + if sig != linux.Signal(info.Signo) { + info.Signo = int32(sig) + info.Errno = 0 + info.Code = arch.SignalInfoUser + // pid isn't a valid field for all signal numbers, but Linux + // doesn't care (kernel/signal.c:ptrace_signal()). + // + // Linux uses t->parent for the tid and uid here, which is the tracer + // if it hasn't detached or the real parent otherwise. + parent := t.parent + if tracer := t.Tracer(); tracer != nil { + parent = tracer + } + if parent == nil { + // Tracer has detached and t was created by Kernel.CreateProcess(). + // Pretend the parent is in an ancestor PID + user namespace. 
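+			// A PID of 0 and the overflow UID are what a process in a
+			// descendant PID + user namespace would observe for such a
+			// parent.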
+ info.SetPid(0) + info.SetUid(int32(auth.OverflowUID)) + } else { + info.SetPid(int32(t.tg.pidns.tids[parent])) + info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } + } + t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.Unlock() + // If the signal is masked, re-queue it. + if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + t.sendSignalLocked(info, false /* group */) + t.tg.signalHandlers.mu.Unlock() + return (*runInterrupt)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go new file mode 100644 index 000000000..801cb3395 --- /dev/null +++ b/pkg/sentry/kernel/task_start.go @@ -0,0 +1,252 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskConfig defines the configuration of a new Task (see below). +type TaskConfig struct { + // Kernel is the owning Kernel. + *Kernel + + // Parent is the new task's parent. Parent may be nil. + Parent *Task + + // ThreadGroup is the ThreadGroup the new task belongs to. + *ThreadGroup + + // TaskContext is the TaskContext of the new task. + *TaskContext + + // TaskResources is the TaskResources of the new task. + *TaskResources + + // Credentials is the Credentials of the new task. + Credentials *auth.Credentials + + // Niceness is the niceness of the new task. + Niceness int + + // If NetworkNamespaced is true, the new task should observe a non-root + // network namespace. + NetworkNamespaced bool + + // AllowedCPUMask contains the cpus that this task can run on. + AllowedCPUMask sched.CPUSet + + // UTSNamespace is the UTSNamespace of the new task. + UTSNamespace *UTSNamespace + + // IPCNamespace is the IPCNamespace of the new task. + IPCNamespace *IPCNamespace +} + +// NewTask creates a new task defined by TaskConfig. +// Whether or not NewTask is successful, it takes ownership of both TaskContext +// and TaskResources of the TaskConfig. +// +// NewTask does not start the returned task; the caller must call Task.Start. +func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { + t, err := ts.newTask(cfg) + if err != nil { + cfg.TaskContext.release() + cfg.TaskResources.release() + return nil, err + } + return t, nil +} + +// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext +// and TaskResources of the TaskConfig if it succeeds. 
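+//
+// newTask can only fail before the new task is made visible to the rest of
+// the system (while checking for a racing group exit or execve and while
+// allocating thread IDs); once TIDs have been assigned there is no rollback.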
+func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { + tg := cfg.ThreadGroup + tc := cfg.TaskContext + t := &Task{ + taskNode: taskNode{ + tg: tg, + parent: cfg.Parent, + children: make(map[*Task]struct{}), + }, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + tr: *cfg.TaskResources, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + creds: cfg.Credentials, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + rseqCPU: -1, + futexWaiter: futex.NewWaiter(), + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.ptraceTracer.Store((*Task)(nil)) + // We don't construct t.blockingTimer until Task.run(); see that function + // for justification. + + // Make the new task (and possibly thread group) visible to the rest of + // the system atomically. + ts.mu.Lock() + defer ts.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting || tg.execing != nil { + // If the caller is in the same thread group, then what we return + // doesn't matter too much since the caller will exit before it returns + // to userspace. If the caller isn't in the same thread group, then + // we're in uncharted territory and can return whatever we want. + return nil, syserror.EINTR + } + if err := ts.assignTIDsLocked(t); err != nil { + return nil, err + } + // Below this point, newTask is expected not to fail (there is no rollback + // of assignTIDsLocked or any of the following). + + // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. + // This is the earliest point at which we can do so (since t now has thread + // IDs). + t.updateLogPrefixLocked() + + if t.parent != nil { + t.parent.children[t] = struct{}{} + } + + if tg.leader == nil { + // New thread group. + tg.leader = t + if parentPG := tg.parentPG(); parentPG == nil { + tg.createSession() + } else { + // Inherit the process group. + parentPG.incRefWithParent(parentPG) + tg.processGroup = parentPG + } + } + tg.tasks.PushBack(t) + tg.tasksCount++ + tg.liveTasks++ + tg.activeTasks++ + + // Propagate external TaskSet stops to the new task. + t.stopCount = ts.stopCount + + t.mu.Lock() + defer t.mu.Unlock() + + t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) + + t.startTime = t.k.RealtimeClock().Now() + + return t, nil +} + +// assignTIDsLocked ensures that new task t is visible in all PID namespaces in +// which it should be visible. +// +// Preconditions: ts.mu must be locked for writing. +func (ts *TaskSet) assignTIDsLocked(t *Task) error { + type allocatedTID struct { + ns *PIDNamespace + tid ThreadID + } + var allocatedTIDs []allocatedTID + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid, err := ns.allocateTID() + if err != nil { + // Failure. Remove the tids we already allocated in descendant + // namespaces. + for _, a := range allocatedTIDs { + delete(a.ns.tasks, a.tid) + delete(a.ns.tids, t) + } + return err + } + ns.tasks[tid] = t + ns.tids[t] = tid + allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) + } + return nil +} + +// allocateTID returns an unused ThreadID from ns. +// +// Preconditions: ns.owner.mu must be locked for writing. 
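+// TIDs are handed out in increasing order starting just after the previous
+// allocation, wrapping around to InitTID+1 after TasksLimit; EAGAIN is
+// returned only if every TID in the namespace is already in use.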
+func (ns *PIDNamespace) allocateTID() (ThreadID, error) { + if ns.exiting { + // "In this case, a subsequent fork(2) into this PID namespace will + // fail with the error ENOMEM; it is not possible to create a new + // processes [sic] in a PID namespace whose init process has + // terminated." - pid_namespaces(7) + return 0, syserror.ENOMEM + } + tid := ns.last + for { + // Next. + tid++ + if tid > TasksLimit { + tid = InitTID + 1 + } + + // Is it available? + _, ok := ns.tasks[tid] + if !ok { + ns.last = tid + return tid, nil + } + + // Did we do a full cycle? + if tid == ns.last { + // No tid available. + return 0, syserror.EAGAIN + } + } +} + +// Start starts the task goroutine. Start must be called exactly once for each +// task returned by NewTask. +// +// 'tid' must be the task's TID in the root PID namespace and it's used for +// debugging purposes only (set as parameter to Task.run to make it visible +// in stack dumps). +func (t *Task) Start(tid ThreadID) { + // If the task was restored, it may be "starting" after having already exited. + if t.runState == nil { + return + } + t.goroutineStopped.Add(1) + t.tg.liveGoroutines.Add(1) + t.tg.pidns.owner.liveGoroutines.Add(1) + t.tg.pidns.owner.runningGoroutines.Add(1) + + // Task is now running in system mode. + t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) + + // Use the task's TID in the root PID namespace to make it visible in stack dumps. + go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops +} diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go new file mode 100644 index 000000000..feaf6cae4 --- /dev/null +++ b/pkg/sentry/kernel/task_stop.go @@ -0,0 +1,226 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements task stops, which represent the equivalent of Linux's +// uninterruptible sleep states in a way that is compatible with save/restore. +// Task stops comprise both internal stops (which form part of the task's +// "normal" control flow) and external stops (which do not); see README.md for +// details. +// +// There are multiple interfaces for interacting with stops because there are +// multiple cases to consider: +// +// - A task goroutine can begin a stop on its associated task (e.g. a +// vfork() syscall stopping the calling task until the child task releases its +// MM). In this case, calling Task.interrupt is both unnecessary (the task +// goroutine obviously cannot be blocked in Task.block or executing application +// code) and undesirable (as it may spuriously interrupt a in-progress +// syscall). +// +// Beginning internal stops in this case is implemented by +// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing, +// there are no instances of this case that begin external stops, except for +// autosave; however, autosave terminates the sentry without ending the +// external stop, so the spurious interrupt is moot. 
+// +// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all +// tasks being stopped in preparation for state checkpointing). If the task +// goroutine may be in Task.block or executing application code, it must be +// interrupted by Task.interrupt for it to actually enter the stop; since, +// strictly speaking, we have no way of determining this, we call +// Task.interrupt unconditionally. +// +// Beginning external stops in this case is implemented by +// Task.BeginExternalStop. As of this writing, there are no instances of this +// case that begin internal stops. +// +// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an +// exiting task resuming a sibling task that has been blocked in an execve() +// syscall waiting for other tasks to exit). In this case, Task.endStopCond +// must be notified to kick the task goroutine out of Task.doStop. +// +// Ending internal stops in this case is implemented by +// Task.endInternalStopLocked. Ending external stops in this case is +// implemented by Task.EndExternalStop. +// +// - Hypothetically, a task goroutine can end an internal stop on its +// associated task. As of this writing, there are no instances of this case. +// However, any instances of this case could still use the above functions, +// since notifying Task.endStopCond would be unnecessary but harmless. + +import ( + "fmt" + "sync/atomic" +) + +// A TaskStop is a condition visible to the task control flow graph that +// prevents a task goroutine from running or exiting, i.e. an internal stop. +// +// NOTE: Most TaskStops don't contain any data; they're +// distinguished by their type. The obvious way to implement such a TaskStop +// is: +// +// type groupStop struct{} +// func (groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop(groupStop{}) +// +// However, this doesn't work because the state package can't serialize values, +// only pointers. Furthermore, the correctness of save/restore depends on the +// ability to pass a TaskStop to endInternalStop that will compare equal to the +// TaskStop that was passed to beginInternalStop, even if a save/restore cycle +// occurred between the two. As a result, the current idiom is to always use a +// typecast nil for data-free TaskStops: +// +// type groupStop struct{} +// func (*groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop((*groupStop)(nil)) +// +// This is pretty gross, but the alternatives seem grosser. +type TaskStop interface { + // Killable returns true if Task.Kill should end the stop prematurely. + // Killable is analogous to Linux's TASK_WAKEKILL. + Killable() bool +} + +// beginInternalStop indicates the start of an internal stop that applies to t. +// +// Preconditions: The task must not already be in an internal stop (i.e. t.stop +// == nil). The caller must be running on the task goroutine. +func (t *Task) beginInternalStop(s TaskStop) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginInternalStopLocked(s) +} + +// Preconditions: The signal mutex must be locked. All preconditions for +// Task.beginInternalStop also apply. 
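Concretely, a data-free stop written in the typecast-nil idiom described above might look like the following. The exampleStop name is purely illustrative, and the commented call sites are assumed to run with the locks documented for the respective functions.

    // A stop with no data of its own: only the (pointer) type identifies it.
    type exampleStop struct{}

    // Killable implements TaskStop.Killable.
    func (*exampleStop) Killable() bool { return true }

    // The task goroutine enters the stop with a typed nil pointer:
    //
    //     t.beginInternalStop((*exampleStop)(nil))
    //
    // Whoever ends it first checks, under the signal mutex, that the stop it
    // expects is the one actually in effect:
    //
    //     t.tg.signalHandlers.mu.Lock()
    //     if _, ok := t.stop.(*exampleStop); ok {
    //         t.endInternalStopLocked()
    //     }
    //     t.tg.signalHandlers.mu.Unlock()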
+func (t *Task) beginInternalStopLocked(s TaskStop) { + if t.stop != nil { + panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) + } + t.Debugf("Entering internal stop %#v", s) + t.stop = s + t.beginStopLocked() +} + +// endInternalStopLocked indicates the end of an internal stop that applies to +// t. endInternalStopLocked does not wait for the task to resume. +// +// The caller is responsible for ensuring that the internal stop they expect +// actually applies to t; this requires holding the signal mutex which protects +// t.stop, which is why there is no endInternalStop that locks the signal mutex +// for you. +// +// Preconditions: The signal mutex must be locked. The task must be in an +// internal stop (i.e. t.stop != nil). +func (t *Task) endInternalStopLocked() { + if t.stop == nil { + panic("Attempting to leave non-existent internal stop") + } + t.Debugf("Leaving internal stop %#v", t.stop) + t.stop = nil + t.endStopLocked() +} + +// BeginExternalStop indicates the start of an external stop that applies to t. +// BeginExternalStop does not wait for t's task goroutine to stop. +func (t *Task) BeginExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginStopLocked() + t.interrupt() +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task +// goroutine to resume. +func (t *Task) EndExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endStopLocked() +} + +// beginStopLocked increments t.stopCount to indicate that a new internal or +// external stop applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) beginStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 { + // Most likely overflow. + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } +} + +// endStopLocked decerements t.stopCount to indicate that an existing internal +// or external stop no longer applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) endStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } else if newval == 0 { + t.endStopCond.Signal() + } +} + +// BeginExternalStop indicates the start of an external stop that applies to +// all current and future tasks in ts. BeginExternalStop does not wait for +// task goroutines to stop. +func (ts *TaskSet) BeginExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount++ + if ts.stopCount <= 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.beginStopLocked() + t.tg.signalHandlers.mu.Unlock() + t.interrupt() + } +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task +// goroutines to resume. 
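The stop machinery above reduces to a counter plus a condition variable: every stop increments t.stopCount, ending the last one signals t.endStopCond, and the stopped task goroutine waits for the count to hit zero. A self-contained toy of that shape (not the Task API) is:

    package main

    import (
        "fmt"
        "sync"
        "sync/atomic"
    )

    // stopper mimics the shape of Task.stopCount and Task.endStopCond.
    type stopper struct {
        mu        sync.Mutex // stands in for the signal mutex
        stopCount int32
        cond      *sync.Cond
    }

    func newStopper() *stopper {
        s := &stopper{}
        s.cond = sync.NewCond(&s.mu)
        return s
    }

    // beginStopLocked and endStopLocked require s.mu, like their Task analogues.
    func (s *stopper) beginStopLocked() { atomic.AddInt32(&s.stopCount, 1) }

    func (s *stopper) endStopLocked() {
        if atomic.AddInt32(&s.stopCount, -1) == 0 {
            s.cond.Signal()
        }
    }

    // waitWhileStopped plays the role of Task.doStop: block while any stop applies.
    func (s *stopper) waitWhileStopped() {
        s.mu.Lock()
        defer s.mu.Unlock()
        for atomic.LoadInt32(&s.stopCount) > 0 {
            s.cond.Wait()
        }
    }

    func main() {
        s := newStopper()
        s.mu.Lock()
        s.beginStopLocked() // an external stop begins
        s.mu.Unlock()

        go func() {
            s.mu.Lock()
            s.endStopLocked() // the stop ends; the waiter is signaled
            s.mu.Unlock()
        }()

        s.waitWhileStopped()
        fmt.Println("resumed")
    }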
+func (ts *TaskSet) EndExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount-- + if ts.stopCount < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.endStopLocked() + t.tg.signalHandlers.mu.Unlock() + } +} diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go new file mode 100644 index 000000000..79f4ff60c --- /dev/null +++ b/pkg/sentry/kernel/task_syscall.go @@ -0,0 +1,434 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "os" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel +// include/linux/errno.h. These errnos are never returned to userspace +// directly, but are used to communicate the expected behavior of an +// interrupted syscall from the syscall to signal handling. +type SyscallRestartErrno int + +// These numeric values are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errnos, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = SyscallRestartErrno(512) + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = SyscallRestartErrno(513) + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = SyscallRestartErrno(514) + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) +) + +// Error implements error.Error. +func (e SyscallRestartErrno) Error() string { + // Descriptions are borrowed from strace. 
+ switch e { + case ERESTARTSYS: + return "to be restarted if SA_RESTART is set" + case ERESTARTNOINTR: + return "to be restarted" + case ERESTARTNOHAND: + return "to be restarted if no handler" + case ERESTART_RESTARTBLOCK: + return "interrupted by signal" + default: + return "(unknown interrupt error)" + } +} + +// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { + switch int(rv) { + case -int(ERESTARTSYS): + return ERESTARTSYS, true + case -int(ERESTARTNOINTR): + return ERESTARTNOINTR, true + case -int(ERESTARTNOHAND): + return ERESTARTNOHAND, true + case -int(ERESTART_RESTARTBLOCK): + return ERESTART_RESTARTBLOCK, true + default: + return 0, false + } +} + +// SyscallRestartBlock represents the restart block for a syscall restartable +// with a custom function. It encapsulates the state required to restart a +// syscall across a S/R. +type SyscallRestartBlock interface { + Restart(t *Task) (uintptr, error) +} + +// SyscallControl is returned by syscalls to control the behavior of +// Task.doSyscallInvoke. +type SyscallControl struct { + // next is the state that the task goroutine should switch to. If next is + // nil, the task goroutine should continue to syscall exit as usual. + next taskRunState + + // If ignoreReturn is true, Task.doSyscallInvoke should not store any value + // in the task's syscall return value register. + ignoreReturn bool +} + +var ( + // CtrlDoExit is returned by the implementations of the exit and exit_group + // syscalls to enter the task exit path directly, skipping syscall exit + // tracing. + CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} + + // ctrlStopAndReinvokeSyscall is returned by syscalls using the external + // feature before syscall execution. This causes Task.doSyscallInvoke + // to return runSyscallReinvoke, allowing Task.run to check for stops + // before immediately re-invoking the syscall (skipping the re-checking + // of seccomp filters and ptrace which would confuse userspace + // tracing). + ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} + + // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at + // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather + // than tail-calling it, allowing stops to be checked before syscall exit. + ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} +) + +func (t *Task) invokeExternal() { + t.BeginExternalStop() + go func() { // S/R-SAFE: External control flow. + defer t.EndExternalStop() + t.SyscallTable().External(t.Kernel()) + }() +} + +func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { + s := t.SyscallTable() + + fe := s.FeatureEnable.Word(sysno) + + var straceContext interface{} + if bits.IsAnyOn32(fe, StraceEnableBits) { + straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) + } + + if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { + t.invokeExternal() + // Ensure we check for stops, then invoke the syscall again. + ctrl = ctrlStopAndReinvokeSyscall + } else { + fn := s.Lookup(sysno) + if fn != nil { + // Call our syscall implementation. + rval, ctrl, err = fn(t, args) + } else { + // Use the missing function if not found. 
+ rval, err = t.SyscallTable().Missing(t, sysno, args) + } + } + + if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { + t.invokeExternal() + // Don't reinvoke the syscall. + } + + if bits.IsAnyOn32(fe, StraceEnableBits) { + s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) + } + + return +} + +// doSyscall is the entry point for an invocation of a system call specified by +// the current state of t's registers. +// +// The syscall path is very hot; avoid defer. +func (t *Task) doSyscall() taskRunState { + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + + // Tracers expect to see this between when the task traps into the kernel + // to perform a syscall and when the syscall is actually invoked. + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + + // Check seccomp filters. The nil check is for performance (as seccomp use + // is rare), not needed for correctness. + if t.syscallFilters != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + case seccompResultDeny: + t.Debugf("Syscall %d: denied by seccomp", sysno) + return (*runSyscallExit)(nil) + case seccompResultAllow: + // ok + case seccompResultKill: + t.Debugf("Syscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + case seccompResultTrace: + t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) + return (*runSyscallAfterPtraceEventSeccomp)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doSyscallEnter(sysno, args) +} + +type runSyscallAfterPtraceEventSeccomp struct{} + +func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - + // ptrace(2) + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "The tracer can skip the system call by changing the syscall number to + // -1." 
- Documentation/prctl/seccomp_filter.txt + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil).execute(t) + } + args := t.Arch().SyscallArgs() + return t.doSyscallEnter(sysno, args) +} + +func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { + if next, ok := t.ptraceSyscallEnter(); ok { + return next + } + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallAfterSyscallEnterStop struct{} + +func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(sigPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil) + } + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallAfterSysemuStop struct{} + +func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(sigPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + return (*runSyscallExit)(nil).execute(t) +} + +func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + + if ctrl != nil { + if !ctrl.ignoreReturn { + t.Arch().SetReturn(rval) + } + if ctrl.next != nil { + return ctrl.next + } + } else if err != nil { + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.haveSyscallReturn = true + } else { + t.Arch().SetReturn(rval) + } + + return (*runSyscallExit)(nil).execute(t) +} + +type runSyscallReinvoke struct{} + +func (*runSyscallReinvoke) execute(t *Task) taskRunState { + if t.killed() { + // It's possible that since the last execution, the task has + // been forcible killed. Invoking the system call here could + // result in an infinite loop if it is again preempted by an + // external stop and reinvoked. + return (*runInterrupt)(nil) + } + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallExit struct{} + +func (*runSyscallExit) execute(t *Task) taskRunState { + t.ptraceSyscallExit() + return (*runApp)(nil) +} + +// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as +// indicated by an execution fault at address addr. doVsyscall returns the +// task's next run state. +func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + // Grab the caller up front, to make sure there's a sensible stack. + caller := t.Arch().Native(uintptr(0)) + if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + return (*runApp)(nil) + } + + // For _vsyscalls_, there is no need to translate System V calling convention + // to syscall ABI because they both use RDI, RSI, and RDX for the first three + // arguments and none of the vsyscalls uses more than two arguments. 
+ args := t.Arch().SyscallArgs() + if t.syscallFilters != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { + case seccompResultDeny: + t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) + return (*runApp)(nil) + case seccompResultAllow: + // ok + case seccompResultTrace: + t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) + return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doVsyscallInvoke(sysno, args, caller) +} + +type runVsyscallAfterPtraceEventSeccomp struct { + addr usermem.Addr + sysno uintptr + caller interface{} +} + +func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "... the syscall may not be changed to another system call using the + // orig_rax register. It may only be changed to -1 order [sic] to skip the + // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - + // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip + // causes do_exit(SIGSYS), and changing sp is ignored. + if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + } + if sysno == ^uintptr(0) { + return (*runApp)(nil) + } + return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) +} + +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + if ctrl != nil { + t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(0) + } else if err == nil { + t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(uintptr(rval)) + } else { + t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) + if err == syserror.EFAULT { + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + // A return is not emulated in this case. + return (*runApp)(nil) + } + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + } + t.Arch().SetIP(t.Arch().Value(caller)) + t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) + return (*runApp)(nil) +} + +// ExtractErrno extracts an integer error number from the error. +// The syscall number is purely for context in the error case. Use -1 if +// syscall number is unknown. +func (t *Task) ExtractErrno(err error, sysno int) int { + switch err := err.(type) { + case nil: + return 0 + case syscall.Errno: + return int(err) + case SyscallRestartErrno: + return int(err) + case *memmap.BusError: + // Bus errors may generate SIGBUS, but for syscalls they still + // return EFAULT. See case in task_run.go where the fault is + // handled (and the SIGBUS is delivered). 
+ return int(syscall.EFAULT) + case *os.PathError: + return t.ExtractErrno(err.Err, sysno) + case *os.LinkError: + return t.ExtractErrno(err.Err, sysno) + case *os.SyscallError: + return t.ExtractErrno(err.Err, sysno) + default: + if errno, ok := syserror.TranslateError(err); ok { + return int(errno) + } + } + panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) +} diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go new file mode 100644 index 000000000..82ef858a1 --- /dev/null +++ b/pkg/sentry/kernel/task_test.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" +) + +func TestTaskCPU(t *testing.T) { + for _, test := range []struct { + mask sched.CPUSet + tid ThreadID + cpu int32 + }{ + { + mask: []byte{0xff}, + tid: 1, + cpu: 0, + }, + { + mask: []byte{0xff}, + tid: 10, + cpu: 1, + }, + { + // more than 8 cpus. + mask: []byte{0xff, 0xff}, + tid: 10, + cpu: 9, + }, + { + // missing the first cpu. + mask: []byte{0xfe}, + tid: 1, + cpu: 1, + }, + { + mask: []byte{0xfe}, + tid: 10, + cpu: 3, + }, + { + // missing the fifth cpu. + mask: []byte{0xef}, + tid: 10, + cpu: 2, + }, + } { + assigned := assignCPU(test.mask, test.tid) + if test.cpu != assigned { + t.Errorf("assignCPU(%v, %v) got %v, want %v", test.mask, test.tid, assigned, test.cpu) + } + } + +} diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go new file mode 100644 index 000000000..7a62ab674 --- /dev/null +++ b/pkg/sentry/kernel/task_usermem.go @@ -0,0 +1,298 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// _MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be silently truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var _MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) + +// Activate ensures that the task has an active address space. +func (t *Task) Activate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Activate(); err != nil { + panic("unable to activate mm: " + err.Error()) + } + } +} + +// Deactivate relinquishes the task's active address space. 
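Every expectation in TestTaskCPU above is consistent with assignCPU choosing the ((tid-1) mod n)-th allowed CPU, where n is the number of set bits in the mask. assignCPU itself is not shown in this hunk, so the standalone re-derivation below is only a guess that happens to reproduce the table:

    package main

    import "fmt"

    // pickCPU returns the ((tid-1) mod n)-th set bit of mask, where n is the
    // number of set bits; a guessed rule that matches the TestTaskCPU vectors.
    func pickCPU(mask []byte, tid int32) int32 {
        var allowed []int32
        for i, b := range mask {
            for bit := uint(0); bit < 8; bit++ {
                if (b>>bit)&1 != 0 {
                    allowed = append(allowed, int32(8*i)+int32(bit))
                }
            }
        }
        return allowed[(int(tid)-1)%len(allowed)]
    }

    func main() {
        fmt.Println(pickCPU([]byte{0xfe}, 10)) // 3, as in the test table
        fmt.Println(pickCPU([]byte{0xef}, 10)) // 2
    }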
+func (t *Task) Deactivate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Deactivate(); err != nil { + panic("unable to deactivate mm: " + err.Error()) + } + } +} + +// CopyIn copies a fixed-size value or slice of fixed-size values in from the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { + return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInBytes is a fast version of CopyIn if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOut copies a fixed-size value or slice of fixed-size values out to the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not writeable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { + return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOutBytes is a fast version of CopyOut if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInString copies a NUL-terminated string of length at most maxlen in from +// the task's memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInVector copies a NULL-terminated vector of strings from the task's +// memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// maxElemSize is the maximum size of each individual element. +// +// maxTotalSize is the maximum total length of all elements plus the total +// number of elements. For example, the following strings correspond to +// the following set of sizes: +// +// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) +// { "abc" } => 4 (3 for length, 1 for elements) +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { + var v []string + for { + argAddr := t.Arch().Native(0) + if _, err := t.CopyIn(addr, argAddr); err != nil { + return v, err + } + if t.Arch().Value(argAddr) == 0 { + break + } + // Each string has a zero terminating byte counted, so copying out a string + // requires at least one byte of space. Also, see the calculation below. 
+ if maxTotalSize <= 0 { + return nil, syserror.ENOMEM + } + thisMax := maxElemSize + if maxTotalSize < thisMax { + thisMax = maxTotalSize + } + arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + if err != nil { + return v, err + } + v = append(v, arg) + addr += usermem.Addr(t.Arch().Width()) + maxTotalSize -= len(arg) + 1 + } + return v, nil +} + +// CopyOutIovecs converts src to an array of struct iovecs and copies it to the +// memory mapped at addr. +// +// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for ; !src.IsEmpty(); src = src.Tail() { + ar := src.Head() + usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + default: + return syserror.ENOSYS + } + + return nil +} + +// CopyInIovecs copies an array of numIovecs struct iovecs from the memory +// mapped at addr, converts them to usermem.AddrRanges, and returns them as a +// usermem.AddrRangeSeq. +// +// CopyInIovecs shares the following properties with Linux's +// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): +// +// - If the length of any AddrRange would exceed the range of an ssize_t, +// CopyInIovecs returns EINVAL. +// +// - If the length of any AddrRange would cause its end to overflow, +// CopyInIovecs returns EFAULT. +// +// - The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the +// combined length of all AddrRanges would otherwise exceed this amount, ranges +// beyond _MAX_RW_COUNT are silently truncated. +// +// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { + if numIovecs == 0 { + return usermem.AddrRangeSeq{}, nil + } + + var dst []usermem.AddrRange + if numIovecs > 1 { + dst = make([]usermem.AddrRange, 0, numIovecs) + } + + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := 0; i < numIovecs; i++ { + if _, err := t.CopyInBytes(addr, b); err != nil { + return usermem.AddrRangeSeq{}, err + } + + base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) + length := usermem.ByteOrder.Uint64(b[8:16]) + if length > math.MaxInt64 { + return usermem.AddrRangeSeq{}, syserror.EINVAL + } + ar, ok := base.ToRange(length) + if !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + if numIovecs == 1 { + // Special case to avoid allocating dst. + return usermem.AddrRangeSeqOf(ar).TakeFirst(_MAX_RW_COUNT), nil + } + dst = append(dst, ar) + + addr += itemLen + } + + default: + return usermem.AddrRangeSeq{}, syserror.ENOSYS + } + + // Truncate to _MAX_RW_COUNT. 
+ var total uint64 + for i := range dst { + dstlen := uint64(dst[i].Length()) + if rem := uint64(_MAX_RW_COUNT) - total; rem < dstlen { + dst[i].End -= usermem.Addr(dstlen - rem) + dstlen = rem + } + total += dstlen + } + + return usermem.AddrRangeSeqFromSlice(dst), nil +} + +// SingleIOSequence returns a usermem.IOSequence representing [addr, +// addr+length) in t's address space. If length exceeds _MAX_RW_COUNT, it is +// silently truncated. +// +// SingleIOSequence is analogous to Linux's +// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and +// write syscalls in Linux do not use import_single_range(), but are still +// truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if length > _MAX_RW_COUNT { + length = _MAX_RW_COUNT + } + ar, ok := addr.ToRange(uint64(length)) + if !ok { + return usermem.IOSequence{}, syserror.EFAULT + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: usermem.AddrRangeSeqOf(ar), + Opts: opts, + }, nil +} + +// IovecsIOSequence returns a usermem.IOSequence representing the array of +// iovcnt struct iovecs at addr in t's address space. opts applies to the +// returned IOSequence, not the reading of the struct iovec array. +// +// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). +// +// Preconditions: As for Task.CopyInIovecs. +func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return usermem.IOSequence{}, syserror.EINVAL + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return usermem.IOSequence{}, err + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: ars, + Opts: opts, + }, nil +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go new file mode 100644 index 000000000..8fffd3446 --- /dev/null +++ b/pkg/sentry/kernel/thread_group.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// A ThreadGroup is a logical grouping of tasks that has widespread +// significance to other kernel features (e.g. signal handling). ("Thread +// groups" are usually called "processes" in userspace documentation.) +// +// ThreadGroup is a superset of Linux's struct signal_struct. +type ThreadGroup struct { + threadGroupNode + + // signalHandlers is the set of signal handlers used by every task in this + // thread group. (signalHandlers may also be shared with other thread + // groups.) 
+ // + // signalHandlers.mu (hereafter "the signal mutex") protects state related + // to signal handling, as well as state that usually needs to be atomic + // with signal handling, for all ThreadGroups and Tasks using + // signalHandlers. (This is analogous to Linux's use of struct + // sighand_struct::siglock.) + // + // The signalHandlers pointer can only be mutated during an execve + // (Task.finishExec). Consequently, when it's possible for a task in the + // thread group to be completing an execve, signalHandlers is protected by + // the owning TaskSet.mu. Otherwise, it is possible to read the + // signalHandlers pointer without synchronization. In particular, + // completing an execve requires that all other tasks in the thread group + // have exited, so task goroutines do not need the owning TaskSet.mu to + // read the signalHandlers pointer of their thread groups. + signalHandlers *SignalHandlers + + // pendingSignals is the set of pending signals that may be handled by any + // task in this thread group. + // + // pendingSignals is protected by the signal mutex. + pendingSignals pendingSignals + + // lastTimerSignalTask records the last task we deliver a process timer signal to. + // Please see SendTimerSignal for more details. + // + // lastTimerSignalTask is protected by the signal mutex. + lastTimerSignalTask *Task + + // groupStopPhase indicates the state of a group stop in progress on the + // thread group, if any. + // + // groupStopPhase is protected by the signal mutex. + groupStopPhase groupStopPhase + + // groupStopSignal is the signal that caused a group stop to be initiated. + // groupStopSignal is only meaningful if groupStopPhase is + // groupStopInitiated or groupStopComplete. + // + // groupStopSignal is protected by the signal mutex. + groupStopSignal linux.Signal + + // groupStopCount is the number of non-exited tasks in the thread group + // that have acknowledged an initiated group stop. groupStopCount is only + // meaningful if groupStopPhase is groupStopInitiated. + // + // groupStopCount is protected by the signal mutex. + groupStopCount int + + // If groupStopWaitable is true, the thread group is indicating a waitable + // group stop event (as defined by EventChildGroupStop). + // + // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set + // and group_exit_code being non-zero. + // + // groupStopWaitable is protected by the signal mutex. + groupStopWaitable bool + + // If groupContNotify is true, then a SIGCONT has recently ended a group + // stop on this thread group, and the first task to observe it should + // notify its parent. + // + // groupContNotify is protected by the signal mutex. + groupContNotify bool + + // If groupContNotify is true, groupContInterrupted is true iff SIGCONT + // ended a group stop in phase groupStopInitiated. If groupContNotify is + // false, groupContInterrupted is meaningless. + // + // Analogues in Linux: + // + // - groupContNotify && groupContInterrupted is represented by + // SIGNAL_CLD_STOPPED. + // + // - groupContNotify && !groupContInterrupted is represented by + // SIGNAL_CLD_CONTINUED. + // + // - !groupContNotify is represented by neither flag being set. + // + // groupContInterrupted is protected by the signal mutex. + groupContInterrupted bool + + // If groupContWaitable is true, the thread group is indicating a waitable + // continue event (as defined by EventGroupContinue). + // + // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. 
+ // + // groupContWaitable is protected by the signal mutex. + groupContWaitable bool + + // exiting is true if all tasks in the ThreadGroup should exit. exiting is + // analogous to Linux's SIGNAL_GROUP_EXIT. + // + // exiting is protected by the signal mutex. exiting can only transition + // from false to true. + exiting bool + + // exitStatus is the thread group's exit status. + // + // While exiting is false, exitStatus is protected by the signal mutex. + // When exiting becomes true, exitStatus becomes immutable. + exitStatus ExitStatus + + // terminationSignal is the signal that this thread group's leader will + // send to its parent when it exits. + // + // terminationSignal is protected by the TaskSet mutex. + terminationSignal linux.Signal + + // liveGoroutines is the number of non-exited task goroutines in the thread + // group. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // tm contains process timers. TimerManager fields are immutable. + tm TimerManager + + // exitedCPUStats is the CPU usage for all exited tasks in the thread + // group. exitedCPUStats is protected by the TaskSet mutex. + exitedCPUStats usage.CPUStats + + // childCPUStats is the CPU usage of all joined descendants of this thread + // group. childCPUStats is protected by the TaskSet mutex. + childCPUStats usage.CPUStats + + // ioUsage is the I/O usage for all exited tasks in the thread group. + // The ioUsage pointer is immutable. + ioUsage *usage.IO + + // maxRSS is the historical maximum resident set size of the thread group, updated when: + // + // - A task in the thread group exits, since after all tasks have + // exited the MemoryManager is no longer reachable. + // + // - The thread group completes an execve, since this changes + // MemoryManagers. + // + // maxRSS is protected by the TaskSet mutex. + maxRSS uint64 + + // childMaxRSS is the maximum resident set size in bytes of all joined + // descendants of this thread group. + // + // childMaxRSS is protected by the TaskSet mutex. + childMaxRSS uint64 + + // Resource limits for this ThreadGroup. The limits pointer is immutable. + limits *limits.LimitSet + + // processGroup is the processGroup for this thread group. + // + // processGroup is protected by the TaskSet mutex. + processGroup *ProcessGroup + + // execed indicates an exec has occurred since creation. This will be + // set by finishExec, and new TheadGroups will have this field cleared. + // When execed is set, the processGroup may no longer be changed. + // + // execed is protected by the TaskSet mutex. + execed bool + + // rscr is the thread group's RSEQ critical region. + rscr atomic.Value `state:".(*RSEQCriticalRegion)"` +} + +// NewThreadGroup returns a new, empty thread group in PID namespace ns. The +// thread group leader will send its parent terminationSignal when it exits. +// The new thread group isn't visible to the system until a task has been +// created inside of it by a successful call to TaskSet.NewTask. 
+func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { + tg := &ThreadGroup{ + threadGroupNode: threadGroupNode{ + pidns: ns, + }, + signalHandlers: sh, + terminationSignal: terminationSignal, + ioUsage: &usage.IO{}, + limits: limits, + } + tg.tm = newTimerManager(tg, monotonicClock) + tg.rscr.Store(&RSEQCriticalRegion{}) + return tg +} + +// saveRscr is invoked by stateify. +func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { + return tg.rscr.Load().(*RSEQCriticalRegion) +} + +// loadRscr is invoked by stateify. +func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) { + tg.rscr.Store(rscr) +} + +// SignalHandlers returns the signal handlers used by tg. +// +// Preconditions: The caller must provide the synchronization required to read +// tg.signalHandlers, as described in the field's comment. +func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { + return tg.signalHandlers +} + +// Timer returns tg's timers. +func (tg *ThreadGroup) Timer() *TimerManager { + return &tg.tm +} + +// Limits returns tg's limits. +func (tg *ThreadGroup) Limits() *limits.LimitSet { + return tg.limits +} + +// release releases the thread group's resources. +func (tg *ThreadGroup) release() { + // This must be done without holding the TaskSet mutex since thread group + // timers call SendSignal with Timer.mu locked. + tg.tm.destroy() +} + +// forEachChildThreadGroupLocked iterates over all child ThreadGroups. +// +// Precondition: TaskSet.mu must be held. +func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + for child := range t.children { + if child == child.tg.leader { + fn(child.tg) + } + } + } +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go new file mode 100644 index 000000000..440da9dad --- /dev/null +++ b/pkg/sentry/kernel/threads.go @@ -0,0 +1,443 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TasksLimit is the maximum number of threads for an untrusted application. +// Linux doesn't really limit this directly, rather it is limited by total +// memory size, stacks allocated and a global maximum. There's no real reason +// for us to limit it either, (esp. since threads are backed by go routines), +// and we would expect to hit resource limits long before hitting this number. +// However, for correctness, we still check that the user doesn't exceed this +// number. +// +// Note that because of the way futexes are implemented, there *are* in fact +// serious restrictions on valid thread IDs. They are limited to 2^30 - 1 +// (kernel/fork.c:MAX_THREADS). +const TasksLimit = (1 << 16) + +// ThreadID is a generic thread identifier.
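The rscr field and its save/load hooks rely on the usual atomic.Value pattern: readers Load a pointer with no lock held, and writers publish a brand-new immutable value rather than mutating in place. A self-contained sketch of that pattern (toy types, not RSEQCriticalRegion itself):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // region stands in for RSEQCriticalRegion: a small record that is replaced
    // wholesale instead of being mutated.
    type region struct {
        start, end uintptr
    }

    type group struct {
        // rscr always holds a *region; readers Load it without a lock and
        // writers Store a fresh value, as ThreadGroup.rscr does.
        rscr atomic.Value
    }

    func newGroup() *group {
        g := &group{}
        g.rscr.Store(&region{}) // seed with an empty region, like NewThreadGroup
        return g
    }

    func (g *group) setRegion(start, end uintptr) {
        g.rscr.Store(&region{start: start, end: end})
    }

    func (g *group) currentRegion() *region {
        return g.rscr.Load().(*region)
    }

    func main() {
        g := newGroup()
        g.setRegion(0x1000, 0x1040)
        r := g.currentRegion()
        fmt.Printf("%#x-%#x\n", r.start, r.end)
    }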
+type ThreadID int32 + +// String returns a decimal representation of the ThreadID. +func (tid ThreadID) String() string { + return fmt.Sprintf("%d", tid) +} + +// InitTID is the TID given to the first task added to each PID namespace. The +// thread group led by InitTID is called the namespace's init process. The +// death of a PID namespace's init process causes all tasks visible in that +// namespace to be killed. +const InitTID ThreadID = 1 + +// A TaskSet comprises all tasks in a system. +type TaskSet struct { + // mu protects all relationships betweens tasks and thread groups in the + // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) + mu sync.RWMutex `state:"nosave"` + + // Root is the root PID namespace, in which all tasks in the TaskSet are + // visible. The Root pointer is immutable. + Root *PIDNamespace + + // sessions is the set of all sessions. + sessions sessionList + + // stopCount is the number of active external stops applicable to all tasks + // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been + // paired with a call to TaskSet.EndExternalStop). stopCount is protected + // by mu. + // + // stopCount is not saved for the same reason as Task.stopCount; it is + // always reset to zero after restore. + stopCount int32 `state:"nosave"` + + // liveGoroutines is the number of non-exited task goroutines in the + // TaskSet. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // runningGoroutines is the number of running task goroutines in the + // TaskSet. + // + // runningGoroutines is not saved; its counter value is required to be zero + // at time of save (but note that this is not necessarily the same thing as + // sync.WaitGroup's zero value). + runningGoroutines sync.WaitGroup `state:"nosave"` +} + +// newTaskSet returns a new, empty TaskSet. +func newTaskSet() *TaskSet { + ts := &TaskSet{} + ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace()) + return ts +} + +// forEachThreadGroupLocked applies f to each thread group in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { + for t := range ts.Root.tids { + if t == t.tg.leader { + f(t.tg) + } + } +} + +// A PIDNamespace represents a PID namespace, a bimap between thread IDs and +// tasks. See the pid_namespaces(7) man page for further details. +// +// N.B. A task is said to be visible in a PID namespace if the PID namespace +// contains a thread ID that maps to that task. +type PIDNamespace struct { + // owner is the TaskSet that this PID namespace belongs to. The owner + // pointer is immutable. + owner *TaskSet + + // parent is the PID namespace of the process that created this one. If + // this is the root PID namespace, parent is nil. The parent pointer is + // immutable. + // + // Invariant: All tasks that are visible in this namespace are also visible + // in all ancestor namespaces. + parent *PIDNamespace + + // userns is the user namespace with which this PID namespace is + // associated. Privileged operations on this PID namespace must have + // appropriate capabilities in userns. The userns pointer is immutable. + userns *auth.UserNamespace + + // The following fields are protected by owner.mu. + + // last is the last ThreadID to be allocated in this namespace. 
+ last ThreadID + + // tasks is a mapping from ThreadIDs in this namespace to tasks visible in + // the namespace. + tasks map[ThreadID]*Task + + // tids is a mapping from tasks visible in this namespace to their + // identifiers in this namespace. + tids map[*Task]ThreadID + + // sessions is a mapping from SessionIDs in this namespace to sessions + // visible in the namespace. + sessions map[SessionID]*Session + + // sids is a mapping from sessions visible in this namespace to their + // identifiers in this namespace. + sids map[*Session]SessionID + + // processGroups is a mapping from ProcessGroupIDs in this namespace to + // process groups visible in the namespace. + processGroups map[ProcessGroupID]*ProcessGroup + + // pgids is a mapping from process groups visible in this namespace to + // their identifiers in this namespace. + pgids map[*ProcessGroup]ProcessGroupID + + // exiting indicates that the namespace's init process is exiting or has + // exited. + exiting bool +} + +func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { + return &PIDNamespace{ + owner: ts, + parent: parent, + userns: userns, + tasks: make(map[ThreadID]*Task), + tids: make(map[*Task]ThreadID), + sessions: make(map[SessionID]*Session), + sids: make(map[*Session]SessionID), + processGroups: make(map[ProcessGroupID]*ProcessGroup), + pgids: make(map[*ProcessGroup]ProcessGroupID), + } +} + +// NewChild returns a new, empty PID namespace that is a child of ns. Authority +// over the new PID namespace is controlled by userns. +func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(ns.owner, ns, userns) +} + +// TaskWithID returns the task with thread ID tid in PID namespace ns. If no +// task has that TID, TaskWithID returns nil. +func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tasks[tid] +} + +// ThreadGroupWithID returns the thread group lead by the task with thread ID +// tid in PID namespace ns. If no task has that TID, or if the task with that +// TID is not a thread group leader, ThreadGroupWithID returns nil. +func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + t := ns.tasks[tid] + if t == nil { + return nil + } + if t != t.tg.leader { + return nil + } + return t.tg +} + +// IDOfTask returns the TID assigned to the given task in PID namespace ns. If +// the task is not visible in that namespace, IDOfTask returns 0. (This return +// value is significant in some cases, e.g. getppid() is documented as +// returning 0 if the caller's parent is in an ancestor namespace and +// consequently not visible to the caller.) If the task is nil, IDOfTask returns +// 0. +func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tids[t] +} + +// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. +// If the task is not visible in that namespace, IDOfThreadGroup returns 0. +func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tids[tg.leader] +} + +// Tasks returns a snapshot of the tasks in ns. 
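A short usage fragment for the lookup API above (not from the change itself; ns and childNS are two *PIDNamespace values and tid a ThreadID obtained elsewhere):

    // Resolve a TID to its thread group, then map the group back into another
    // (e.g. child) PID namespace.
    if tg := ns.ThreadGroupWithID(tid); tg != nil {
        // IDOfThreadGroup returns 0 if tg's leader is not visible in childNS.
        childTID := childNS.IDOfThreadGroup(tg)
        fmt.Printf("tid %v in the child namespace: %v\n", tid, childTID)
    }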
+func (ns *PIDNamespace) Tasks() []*Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + tasks := make([]*Task, 0, len(ns.tasks)) + for t := range ns.tids { + tasks = append(tasks, t) + } + return tasks +} + +// ThreadGroups returns a snapshot of the thread groups in ns. +func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + var tgs []*ThreadGroup + for t := range ns.tids { + if t == t.tg.leader { + tgs = append(tgs, t.tg) + } + } + return tgs +} + +// UserNamespace returns the user namespace associated with PID namespace ns. +func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { + return ns.userns +} + +// A threadGroupNode defines the relationship between a thread group and the +// rest of the system. Conceptually, threadGroupNode is data belonging to the +// owning TaskSet, as if TaskSet contained a field `nodes +// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, +// threadGroupNode is embedded in the ThreadGroup it represents. +// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose +// threadGroupEntry's methods on ThreadGroup to make it implement +// threadGroupLinker.) +type threadGroupNode struct { + // pidns is the PID namespace containing the thread group and all of its + // member tasks. The pidns pointer is immutable. + pidns *PIDNamespace + + // eventQueue is notified whenever a event of interest to Task.Wait occurs + // in a child of this thread group, or a ptrace tracee of a task in this + // thread group. Events are defined in task_exit.go. + // + // Note that we cannot check and save this wait queue similarly to other + // wait queues, as the queue will not be empty by the time of saving, due + // to the wait sourced from Exec(). + eventQueue waiter.Queue `state:"nosave"` + + // leader is the thread group's leader, which is the oldest task in the + // thread group; usually the last task in the thread group to call + // execve(), or if no such task exists then the first task in the thread + // group, which was created by a call to fork() or clone() without + // CLONE_THREAD. Once a thread group has been made visible to the rest of + // the system by TaskSet.newTask, leader is never nil. + // + // Note that it's possible for the leader to exit without causing the rest + // of the thread group to exit; in such a case, leader will still be valid + // and non-nil, but leader will not be in tasks. + // + // leader is protected by the TaskSet mutex. + leader *Task + + // If execing is not nil, it is a task in the thread group that has killed + // all other tasks so that it can become the thread group leader and + // perform an execve. (execing may already be the thread group leader.) + // + // execing is analogous to Linux's signal_struct::group_exit_task. + // + // execing is protected by the TaskSet mutex. + execing *Task + + // tasks is all tasks in the thread group that have not yet been reaped. + // + // tasks is protected by both the TaskSet mutex and the signal mutex: + // Mutating tasks requires locking the TaskSet mutex for writing *and* + // locking the signal mutex. Reading tasks requires locking the TaskSet + // mutex *or* locking the signal mutex. + tasks taskList + + // tasksCount is the number of tasks in the thread group that have not yet + // been reaped; equivalently, tasksCount is the number of tasks in tasks. + // + // tasksCount is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. 
+ tasksCount int + + // liveTasks is the number of tasks in the thread group that have not yet + // reached TaskExitZombie. + // + // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). + liveTasks int + + // activeTasks is the number of tasks in the thread group that have not yet + // reached TaskExitInitiated. + // + // activeTasks is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + activeTasks int +} + +// PIDNamespace returns the PID namespace containing tg. +func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { + return tg.pidns +} + +// TaskSet returns the TaskSet containing tg. +func (tg *ThreadGroup) TaskSet() *TaskSet { + return tg.pidns.owner +} + +// Leader returns tg's leader. +func (tg *ThreadGroup) Leader() *Task { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.leader +} + +// Count returns the number of non-exited threads in the group. +func (tg *ThreadGroup) Count() int { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + var count int + for t := tg.tasks.Front(); t != nil; t = t.Next() { + count++ + } + return count +} + +// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for +// all tasks in tg. +func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + var tasks []ThreadID + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if id, ok := pidns.tids[t]; ok { + tasks = append(tasks, id) + } + } + return tasks +} + +// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader +// is dead, ID returns 0. +func (tg *ThreadGroup) ID() ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.pidns.tids[tg.leader] +} + +// A taskNode defines the relationship between a task and the rest of the +// system. The comments on threadGroupNode also apply to taskNode. +type taskNode struct { + // tg is the thread group that this task belongs to. The tg pointer is + // immutable. + tg *ThreadGroup `state:"wait"` + + // taskEntry links into tg.tasks. Note that this means that + // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread + // group. See threadGroupNode.tasks for synchronization info. + taskEntry + + // parent is the task's parent. parent may be nil. + // + // parent is protected by the TaskSet mutex. + parent *Task + + // children is this task's children. + // + // children is protected by the TaskSet mutex. + children map[*Task]struct{} + + // If childPIDNamespace is not nil, all new tasks created by this task will + // be members of childPIDNamespace rather than this one. (As a corollary, + // this task becomes unable to create sibling tasks in the same thread + // group.) + // + // childPIDNamespace is exclusive to the task goroutine. + childPIDNamespace *PIDNamespace +} + +// ThreadGroup returns the thread group containing t. +func (t *Task) ThreadGroup() *ThreadGroup { + return t.tg +} + +// PIDNamespace returns the PID namespace containing t. +func (t *Task) PIDNamespace() *PIDNamespace { + return t.tg.pidns +} + +// TaskSet returns the TaskSet containing t. +func (t *Task) TaskSet() *TaskSet { + return t.tg.pidns.owner +} + +// Timekeeper returns the system Timekeeper. +func (t *Task) Timekeeper() *Timekeeper { + return t.k.timekeeper +} + +// Parent returns t's parent. +func (t *Task) Parent() *Task { + return t.parent +} + +// ThreadID returns t's thread ID in its own PID namespace. 
If the task is +// dead, ThreadID returns 0. +func (t *Task) ThreadID() ThreadID { + return t.tg.pidns.IDOfTask(t) +} diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD new file mode 100644 index 000000000..84f31b2dc --- /dev/null +++ b/pkg/sentry/kernel/time/BUILD @@ -0,0 +1,32 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "time_state", + srcs = [ + "time.go", + ], + out = "time_state.go", + package = "time", +) + +go_library( + name = "time", + srcs = [ + "context.go", + "time.go", + "time_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go new file mode 100644 index 000000000..ac4dc01d8 --- /dev/null +++ b/pkg/sentry/kernel/time/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the time package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRealtimeClock is a Context.Value key for the current real time. + CtxRealtimeClock contextID = iota +) + +// RealtimeClockFromContext returns the real time clock associated with context +// ctx. +func RealtimeClockFromContext(ctx context.Context) Clock { + if v := ctx.Value(CtxRealtimeClock); v != nil { + return v.(Clock) + } + return nil +} + +// NowFromContext returns the current real time associated with context ctx. +func NowFromContext(ctx context.Context) Time { + if clk := RealtimeClockFromContext(ctx); clk != nil { + return clk.Now() + } + panic("encountered context without RealtimeClock") +} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go new file mode 100644 index 000000000..c223c2f19 --- /dev/null +++ b/pkg/sentry/kernel/time/time.go @@ -0,0 +1,649 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package time defines the Timer type, which provides a periodic timer that +// works by sampling a user-provided clock. 
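The context.go file above uses the usual context-key pattern: an unexported key type prevents collisions with other packages, and a typed accessor hides the type assertion (NowFromContext additionally panics if no clock was attached). A standalone sketch of the same pattern, using the standard library context package rather than the sentry's, is shown below; the time package whose doc comment appears just above then begins.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// clock is a stand-in for the package's Clock interface; only Now is shown.
type clock interface {
	Now() time.Time
}

type realClock struct{}

func (realClock) Now() time.Time { return time.Now() }

// ctxKey plays the role of contextID: an unexported type means no other
// package can accidentally collide with this key.
type ctxKey int

const ctxRealtimeClock ctxKey = iota

// realtimeClockFromContext mirrors RealtimeClockFromContext: a typed accessor
// that hides the type assertion and returns nil when no clock was attached.
func realtimeClockFromContext(ctx context.Context) clock {
	if v := ctx.Value(ctxRealtimeClock); v != nil {
		return v.(clock)
	}
	return nil
}

func main() {
	ctx := context.WithValue(context.Background(), ctxRealtimeClock, realClock{})
	if c := realtimeClockFromContext(ctx); c != nil {
		fmt.Println("now:", c.Now())
	}
}
```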
+package time + +import ( + "fmt" + "math" + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Events that may be generated by a Clock. +const ( + // ClockEventSet occurs when a Clock undergoes a discontinuous change. + ClockEventSet waiter.EventMask = 1 << iota + + // ClockEventRateIncrease occurs when the rate at which a Clock advances + // increases significantly, such that values returned by previous calls to + // Clock.WallTimeUntil may be too large. + ClockEventRateIncrease +) + +// Time represents an instant in time with nanosecond precision. +// +// Time may represent time with respect to any clock and may not have any +// meaning in the real world. +type Time struct { + ns int64 +} + +var ( + // MinTime is the zero time instant, the lowest possible time that can + // be represented by Time. + MinTime = Time{ns: math.MinInt64} + + // MaxTime is the highest possible time that can be represented by + // Time. + MaxTime = Time{ns: math.MaxInt64} + + // ZeroTime represents the zero time in an unspecified Clock's domain. + ZeroTime = Time{ns: 0} +) + +const ( + // MinDuration is the minimum duration representable by time.Duration. + MinDuration = time.Duration(math.MinInt64) + + // MaxDuration is the maximum duration representable by time.Duration. + MaxDuration = time.Duration(math.MaxInt64) +) + +// FromNanoseconds returns a Time representing the point ns nanoseconds after +// an unspecified Clock's zero time. +func FromNanoseconds(ns int64) Time { + return Time{ns} +} + +// FromSeconds returns a Time representing the point s seconds after an +// unspecified Clock's zero time. +func FromSeconds(s int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + return Time{s * 1e9} +} + +// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real +// time Unix clock domain. +func FromUnix(s int64, ns int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + t := s * 1e9 + if t > math.MaxInt64-ns { + return MaxTime + } + return Time{t + ns} +} + +// FromTimespec converts from Linux Timespec to Time. +func FromTimespec(ts linux.Timespec) Time { + return Time{ts.ToNsecCapped()} +} + +// FromTimeval converts a Linux Timeval to Time. +func FromTimeval(tv linux.Timeval) Time { + return Time{tv.ToNsecCapped()} +} + +// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock +// domain. If t represents walltime, this is nanoseconds since the Unix epoch. +func (t Time) Nanoseconds() int64 { + return t.ns +} + +// Seconds returns seconds elapsed since the zero time in t's Clock domain. If +// t represents walltime, this is seconds since Unix epoch. +func (t Time) Seconds() int64 { + return t.Nanoseconds() / time.Second.Nanoseconds() +} + +// Timespec converts Time to a Linux timespec. +func (t Time) Timespec() linux.Timespec { + return linux.NsecToTimespec(t.Nanoseconds()) +} + +// Unix returns the (seconds, nanoseconds) representation of t such that +// seconds*1e9 + nanoseconds = t. +func (t Time) Unix() (s int64, ns int64) { + s = t.ns / 1e9 + ns = t.ns % 1e9 + return +} + +// TimeT converts Time to a Linux time_t. +func (t Time) TimeT() linux.TimeT { + return linux.NsecToTimeT(t.Nanoseconds()) +} + +// Timeval converts Time to a Linux timeval. +func (t Time) Timeval() linux.Timeval { + return linux.NsecToTimeval(t.Nanoseconds()) +} + +// Add adds the duration of d to t. 
+func (t Time) Add(d time.Duration) Time { + if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { + return MaxTime + } + if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { + return MinTime + } + return Time{int64(t.ns) + d.Nanoseconds()} +} + +// AddTime adds the duration of u to t. +func (t Time) AddTime(u Time) Time { + return t.Add(time.Duration(u.ns)) +} + +// Equal reports whether the two times represent the same instant in time. +func (t Time) Equal(u Time) bool { + return t.ns == u.ns +} + +// Before reports whether the instant t is before the instant u. +func (t Time) Before(u Time) bool { + return t.ns < u.ns +} + +// After reports whether the instant t is after the instant u. +func (t Time) After(u Time) bool { + return t.ns > u.ns +} + +// Sub returns the duration of t - u. +// +// N.B. This measure may not make sense for every Time returned by ktime.Clock. +// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to +// estimate that wall time. +func (t Time) Sub(u Time) time.Duration { + dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond + switch { + case u.Add(dur).Equal(t): + return dur + case t.Before(u): + return MinDuration + default: + return MaxDuration + } +} + +// IsMin returns whether t represents the lowest possible time instant. +func (t Time) IsMin() bool { + return t == MinTime +} + +// IsZero returns whether t represents the zero time instant in t's Clock domain. +func (t Time) IsZero() bool { + return t == ZeroTime +} + +// String returns the time represented in nanoseconds as a string. +func (t Time) String() string { + return fmt.Sprintf("%dns", t.Nanoseconds()) +} + +// A Clock is an abstract time source. +type Clock interface { + // Now returns the current time in nanoseconds according to the Clock. + Now() Time + + // WallTimeUntil returns the estimated wall time until Now will return a + // value greater than or equal to t, given that a recent call to Now + // returned now. If t has already passed, WallTimeUntil may return 0 or a + // negative value. + // + // WallTimeUntil must be abstract to support Clocks that do not represent + // wall time (e.g. thread group execution timers). Clocks that represent + // wall times may embed the WallRateClock type to obtain an appropriate + // trivial implementation of WallTimeUntil. + // + // WallTimeUntil is used to determine when associated Timers should next + // check for expirations. Returning too small a value may result in + // spurious Timer goroutine wakeups, while returning too large a value may + // result in late expirations. Implementations should usually err on the + // side of underestimating. + WallTimeUntil(t, now Time) time.Duration + + // Waitable methods may be used to subscribe to Clock events. Waiters will + // not be preserved by Save and must be re-established during restore. + // + // Since Clock events are transient, implementations of + // waiter.Waitable.Readiness should return 0. + waiter.Waitable +} + +// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the +// same rate as wall time. +type WallRateClock struct{} + +// WallTimeUntil implements Clock.WallTimeUntil. +func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { + return t.Sub(now) +} + +// NoClockEvents implements waiter.Waitable for Clocks that do not generate +// events. +type NoClockEvents struct{} + +// Readiness implements waiter.Waitable.Readiness. 
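A Clock implementation therefore needs Now, WallTimeUntil, and the waiter.Waitable methods, and the two helper types above make the common cases free to write. As an illustration, a test-only clock that advances only when explicitly stepped could embed both helpers as sketched below (manualClock and advance are hypothetical names, not part of this package); NoClockEvents' Readiness method, declared after its comment above, follows.

```go
package faketime

import (
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
)

// manualClock is a hypothetical Clock that only advances when a test calls
// advance. It elapses at the wall rate whenever it does advance, so
// WallRateClock's trivial WallTimeUntil applies, and it never generates
// clock events, so NoClockEvents supplies the waiter.Waitable methods.
type manualClock struct {
	now int64 // nanoseconds; unsynchronized, test-only

	ktime.WallRateClock
	ktime.NoClockEvents
}

// Now implements Clock.Now.
func (c *manualClock) Now() ktime.Time {
	return ktime.FromNanoseconds(c.now)
}

// advance moves the clock forward by ns nanoseconds.
func (c *manualClock) advance(ns int64) {
	c.now += ns
}
```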
+func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (NoClockEvents) EventUnregister(e *waiter.Entry) { +} + +// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and +// defining waiter.Waitable.Readiness as required by Clock. +type ClockEventsQueue struct { + waiter.Queue +} + +// Readiness implements waiter.Waitable.Readiness. +func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// A TimerListener receives expirations from a Timer. +type TimerListener interface { + // Notify is called when its associated Timer expires. exp is the number of + // expirations. + // + // Notify is called with the associated Timer's mutex locked, so Notify + // must not take any locks that precede Timer.mu in lock order. + // + // Preconditions: exp > 0. + Notify(exp uint64) + + // Destroy is called when the timer is destroyed. + Destroy() +} + +// Setting contains user-controlled mutable Timer properties. +type Setting struct { + // Enabled is true if the timer is running. + Enabled bool + + // Next is the time in nanoseconds of the next expiration. + Next Time + + // Period is the time in nanoseconds between expirations. If Period is + // zero, the timer will not automatically restart after expiring. + // + // Invariant: Period >= 0. + Period time.Duration +} + +// SettingFromSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as a time relative to c.Now(). +func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + if value < 0 { + return Setting{}, syserror.EINVAL + } + if value == 0 { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: c.Now().Add(value), + Period: interval, + }, nil +} + +// SettingFromAbsSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as an absolute time. +func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { + if value.Before(ZeroTime) { + return Setting{}, syserror.EINVAL + } + if value.IsZero() { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: value, + Period: interval, + }, nil +} + +// SpecFromSetting converts a timestamp and a Setting to a (relative value, +// interval) pair, as used by most Linux syscalls that return a struct +// itimerval or struct itimerspec. +func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { + if !s.Enabled { + return 0, s.Period + } + return s.Next.Sub(now), s.Period +} + +// advancedTo returns an updated Setting and a number of expirations after +// the associated Clock indicates a time of now. +// +// Settings may be created by successive calls to advancedTo with decreasing +// values of now (i.e. time may appear to go backward). Supporting this is +// required to support non-monotonic clocks, as well as allowing +// Timer.clock.Now() to be called without holding Timer.mu. 
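SettingFromSpec pins a relative (value, interval) pair to an absolute Next using a clock reading, and SpecFromSetting reverses that at a later reading, which is the shape most itimer-style interfaces need. The following self-contained sketch compresses the round trip, omitting error handling and the absolute-time variant and using lower-case stand-in names; advancedTo, whose comment appears just above, is defined immediately after it.

```go
package main

import (
	"fmt"
	"time"
)

// setting mirrors the Setting fields used by the conversion helpers above.
type setting struct {
	enabled bool
	next    int64 // ns, absolute in the clock's domain
	period  time.Duration
}

// settingFromSpec mirrors SettingFromSpec: value is interpreted relative to now.
func settingFromSpec(now int64, value, interval time.Duration) setting {
	if value == 0 {
		return setting{period: interval}
	}
	return setting{enabled: true, next: now + value.Nanoseconds(), period: interval}
}

// specFromSetting mirrors SpecFromSetting: recover the remaining relative value.
func specFromSetting(now int64, s setting) (time.Duration, time.Duration) {
	if !s.enabled {
		return 0, s.period
	}
	return time.Duration(s.next-now) * time.Nanosecond, s.period
}

func main() {
	s := settingFromSpec(100, 50*time.Nanosecond, 10*time.Nanosecond)
	fmt.Println(s.enabled, s.next, s.period) // true 150 10ns
	v, p := specFromSetting(120, s)
	fmt.Println(v, p) // 30ns 10ns
}
```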
+func (s Setting) advancedTo(now Time) (Setting, uint64) { + if !s.Enabled { + return s, 0 + } + if s.Next.After(now) { + return s, 0 + } + if s.Period == 0 { + s.Enabled = false + return s, 1 + } + exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) + s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) + return s, exp +} + +// Timer is an optionally-periodic timer driven by sampling a user-specified +// Clock. Timer's semantics support the requirements of Linux's interval timers +// (setitimer(2), timer_create(2), timerfd_create(2)). +// +// Timers should be created using NewTimer and must be cleaned up by calling +// Timer.Destroy when no longer used. +type Timer struct { + // clock is the time source. clock is immutable. + clock Clock + + // listener is notified of expirations. listener is immutable. + listener TimerListener + + // mu protects the following mutable fields. + mu sync.Mutex `state:"nosave"` + + // setting is the timer setting. setting is protected by mu. + setting Setting + + // paused is true if the Timer is paused. paused is protected by mu. + paused bool + + // kicker is used to wake the Timer goroutine. The kicker pointer is + // immutable, but its state is protected by mu. + kicker *time.Timer `state:"nosave"` + + // entry is registered with clock.EventRegister. entry is immutable. + // + // Per comment in Clock, entry must be re-registered after restore; per + // comment in Timer.Load, this is done in Timer.Resume. + entry waiter.Entry `state:"nosave"` + + // events is the channel that will be notified whenever entry receives an + // event. It is also closed by Timer.Destroy to instruct the Timer + // goroutine to exit. + events chan struct{} `state:"nosave"` +} + +// timerTickEvents are Clock events that require the Timer goroutine to Tick +// prematurely. +const timerTickEvents = ClockEventSet | ClockEventRateIncrease + +// NewTimer returns a new Timer that will obtain time from clock and send +// expirations to listener. The Timer is initially stopped and has no first +// expiration or period configured. +func NewTimer(clock Clock, listener TimerListener) *Timer { + t := &Timer{ + clock: clock, + listener: listener, + } + t.init() + return t +} + +// After waits for the duration to elapse according to clock and then sends a +// notification on the returned channel. The timer is started immediately and +// will fire exactly once. The second return value is the start time used with +// the duration. +// +// Callers must call Timer.Destroy. +func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { + notifier, tchan := NewChannelNotifier() + t := NewTimer(clock, notifier) + now := clock.Now() + + t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: now.Add(duration), + }) + return t, now, tchan +} + +// init initializes Timer state that is not preserved across save/restore. If +// init has already been called, calling it again is a no-op. +// +// Preconditions: t.mu must be locked, or the caller must have exclusive access +// to t. +func (t *Timer) init() { + if t.kicker != nil { + return + } + // If t.kicker is nil, the Timer goroutine can't be running, so we can't + // race with it. + t.kicker = time.NewTimer(0) + t.entry, t.events = waiter.NewChannelEntry(nil) + t.clock.EventRegister(&t.entry, timerTickEvents) + go t.runGoroutine() // S/R-SAFE: synchronized by t.mu +} + +// Destroy releases resources owned by the Timer. A Destroyed Timer must not be +// used again; in particular, a Destroyed Timer should not be Saved. 
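For an enabled periodic setting, advancedTo charges one expiration for reaching Next plus one more for every whole Period elapsed since, then moves Next past now, so a sampling-based timer that wakes up late still reports every expiration it missed. A small standalone mirror of that arithmetic follows (advance is an illustrative name, not this package's API); Destroy, documented just above, comes after it.

```go
package main

import "fmt"

// advance mirrors the expiration arithmetic in advancedTo for an enabled,
// periodic setting: one expiration for reaching next, plus one for every
// whole period that has elapsed since then.
func advance(next, period, now int64) (newNext int64, exp uint64) {
	if now < next {
		return next, 0
	}
	exp = 1 + uint64(now-next)/uint64(period)
	return next + period*int64(exp), exp
}

func main() {
	// Next=10ns, Period=5ns, sampled at now=27ns: expirations at 10, 15, 20
	// and 25 have all passed, so exp=4 and the next expiration moves to 30ns.
	next, exp := advance(10, 5, 27)
	fmt.Println(next, exp) // 30 4
}
```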
+func (t *Timer) Destroy() { + // Stop the Timer, ensuring that the Timer goroutine will not call + // t.kicker.Reset, before calling t.kicker.Stop. + t.mu.Lock() + t.setting.Enabled = false + t.mu.Unlock() + t.kicker.Stop() + // Unregister t.entry, ensuring that the Clock will not send to t.events, + // before closing t.events to instruct the Timer goroutine to exit. + t.clock.EventUnregister(&t.entry) + close(t.events) + t.listener.Destroy() +} + +func (t *Timer) runGoroutine() { + for { + select { + case <-t.kicker.C: + case _, ok := <-t.events: + if !ok { + // Channel closed by Destroy. + return + } + } + t.Tick() + } +} + +// Tick requests that the Timer immediately check for expirations and +// re-evaluate when it should next check for expirations. +func (t *Timer) Tick() { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + return + } + s, exp := t.setting.advancedTo(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) +} + +// Pause pauses the Timer, ensuring that it does not generate any further +// expirations until Resume is called. If the Timer is already paused, Pause +// has no effect. +func (t *Timer) Pause() { + t.mu.Lock() + defer t.mu.Unlock() + t.paused = true + // t.kicker may be nil if we were restored but never resumed. + if t.kicker != nil { + t.kicker.Stop() + } +} + +// Resume ends the effect of Pause. If the Timer is not paused, Resume has no +// effect. +func (t *Timer) Resume() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.paused { + return + } + t.paused = false + + // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume + // because save/restore will restore Timers before + // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed + // by a kernel.Timekeeper then the Timer goroutine will panic if it calls + // t.clock.Now(). + t.init() + + // Kick the Timer goroutine in case it was already initialized, but the + // Timer goroutine was sleeping. + t.kicker.Reset(0) +} + +// Get returns a snapshot of the Timer's current Setting and the time +// (according to the Timer's Clock) at which the snapshot was taken. +// +// Preconditions: The Timer must not be paused (since its Setting cannot +// be advanced to the current time while it is paused.) +func (t *Timer) Get() (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) + } + s, exp := t.setting.advancedTo(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) + return now, s +} + +// Swap atomically changes the Timer's Setting and returns the Timer's previous +// Setting and the time (according to the Timer's Clock) at which the snapshot +// was taken. Setting s.Enabled to true starts the Timer, while setting +// s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. +func (t *Timer) Swap(s Setting) (Time, Setting) { + return t.SwapAnd(s, nil) +} + +// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, +// and returns the Timer's previous Setting and the time (according to the +// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true +// starts the timer, while setting s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. f cannot call any Timer methods +// since it is called with the Timer mutex locked. 
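Because After hands back both the Timer and the notification channel, a blocking one-shot wait is just a channel receive plus the mandatory Destroy. A usage sketch under that assumption is below (sleepOn is a hypothetical caller, not part of this package); SwapAnd, documented just above, follows.

```go
package example

import (
	"time"

	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
)

// sleepOn blocks until d has elapsed according to clk. It is only a usage
// sketch of the After helper shown above; sleepOn itself is hypothetical.
func sleepOn(clk ktime.Clock, d time.Duration) {
	t, _, ch := ktime.After(clk, d)
	// After documents that callers must call Timer.Destroy. Destroy also
	// closes ch (via the ChannelNotifier), but only after the expiration
	// has already been received below.
	defer t.Destroy()
	<-ch
}
```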
+func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) + } + oldS, oldExp := t.setting.advancedTo(now) + if oldExp > 0 { + t.listener.Notify(oldExp) + } + if f != nil { + f() + } + newS, newExp := s.advancedTo(now) + t.setting = newS + if newExp > 0 { + t.listener.Notify(newExp) + } + t.resetKickerLocked(now) + return now, oldS +} + +// Preconditions: t.mu must be locked. +func (t *Timer) resetKickerLocked(now Time) { + if t.setting.Enabled { + // Clock.WallTimeUntil may return a negative value. This is fine; + // time.when treats negative Durations as 0. + t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) + } + // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases + // resetKickerLocked will be called from the Timer goroutine itself, in + // which case t.kicker has already fired and t.kicker.Stop will be an + // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer + // => runtime.deltimer). +} + +// Clock returns the Clock used by t. +func (t *Timer) Clock() Clock { + return t.clock +} + +// ChannelNotifier is a TimerListener that sends a message on an empty struct +// channel. +// +// ChannelNotifier cannot be saved or loaded. +type ChannelNotifier struct { + // tchan must be a buffered channel. + tchan chan struct{} +} + +// NewChannelNotifier creates a new channel notifier. +// +// If the notifier is used with a timer, Timer.Destroy will close the channel +// returned here. +func NewChannelNotifier() (TimerListener, <-chan struct{}) { + tchan := make(chan struct{}, 1) + return &ChannelNotifier{tchan}, tchan +} + +// Notify implements ktime.TimerListener.Notify. +func (c *ChannelNotifier) Notify(uint64) { + select { + case c.tchan <- struct{}{}: + default: + } +} + +// Destroy implements ktime.TimerListener.Destroy and will close the channel. +func (c *ChannelNotifier) Destroy() { + close(c.tchan) +} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go new file mode 100644 index 000000000..3f16c1676 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper.go @@ -0,0 +1,270 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// Timekeeper manages all of the kernel clocks. +type Timekeeper struct { + // clocks are the clock sources. + // + // These are not saved directly, as the new machine's clock may behave + // differently. + // + // It is set only once, by SetClocks. + clocks sentrytime.Clocks `state:"nosave"` + + // bootTime is the realtime when the system "booted". i.e., when + // SetClocks was called in the initial (not restored) run. 
+ bootTime ktime.Time + + // monotonicOffset is the offset to apply to the monotonic clock output + // from clocks. + // + // It is set only once, by SetClocks. + monotonicOffset int64 `state:"nosave"` + + // restored indicates that this Timekeeper was restored from a state + // file. + restored bool `state:"nosave"` + + // saveMonotonic is the (offset) value of the monotonic clock at the + // time of save. + // + // It is only valid if restored is true. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveMonotonic int64 + + // saveRealtime is the value of the realtime clock at the time of save. + // + // It is only valid if restored is true. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveRealtime int64 + + // params manages the parameter page. + params *VDSOParamPage + + // mu protects destruction with stop and wg. + mu sync.Mutex `state:"nosave"` + + // stop is used to tell the update goroutine to exit. + stop chan struct{} `state:"nosave"` + + // wg is used to indicate that the update goroutine has exited. + wg sync.WaitGroup `state:"nosave"` +} + +// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. +// NewTimekeeper does not take ownership of paramPage. +// +// SetClocks must be called on the returned Timekeeper before it is usable. +func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*Timekeeper, error) { + return &Timekeeper{ + params: NewVDSOParamPage(platform, paramPage), + }, nil +} + +// SetClocks the backing clock source. +// +// SetClocks must be called before the Timekeeper is used, and it may not be +// called more than once, as changing the clock source without extra correction +// could cause time discontinuities. +// +// It must also be called after Load. +func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { + // Update the params, marking them "not ready", as we may need to + // restart calibration on this new machine. + if t.restored { + if err := t.params.Write(func() vdsoParams { + return vdsoParams{} + }); err != nil { + panic("unable to reset VDSO params: " + err.Error()) + } + } + + if t.clocks != nil { + panic("SetClocks called on previously-initialized Timekeeper") + } + + t.clocks = c + + // Compute the offset of the monotonic clock from the base Clocks. + // + // In a fresh (not restored) sentry, monotonic time starts at zero. + // + // In a restored sentry, monotonic time jumps forward by approximately + // the same amount as real time. There are no guarantees here, we are + // just making a best-effort attempt to to make it appear that the app + // was simply not scheduled for a long period, rather than that the + // real time clock was changed. + // + // If real time went backwards, it remains the same. + wantMonotonic := int64(0) + + nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Unable to get current monotonic time: " + err.Error()) + } + + nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) + if err != nil { + panic("Unable to get current realtime: " + err.Error()) + } + + if t.restored { + wantMonotonic = t.saveMonotonic + elapsed := nowRealtime - t.saveRealtime + if elapsed > 0 { + wantMonotonic += elapsed + } + } + + t.monotonicOffset = wantMonotonic - nowMonotonic + + if !t.restored { + // Hold on to the initial "boot" time. 
+ t.bootTime = ktime.FromNanoseconds(nowRealtime) + } + + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// startUpdater starts an update goroutine that keeps the clocks updated. +// +// mu must be held. +func (t *Timekeeper) startUpdater() { + if t.stop != nil { + // Timekeeper already started + return + } + t.stop = make(chan struct{}) + + // Keep the clocks up to date. + // + // Note that the Go runtime uses host CLOCK_MONOTONIC to service the + // timer, so it may run at a *slightly* different rate from the + // application CLOCK_MONOTONIC. That is fine, as we only need to update + // at approximately this rate. + timer := time.NewTicker(sentrytime.ApproxUpdateInterval) + t.wg.Add(1) + go func() { // S/R-SAFE: stopped during save. + for { + // Start with an update immediately, so the clocks are + // ready ASAP. + + // Call Update within a Write block to prevent the VDSO + // from using the old params between Update and + // Write. + if err := t.params.Write(func() vdsoParams { + monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() + + var p vdsoParams + if monotonicOk { + p.monotonicReady = 1 + p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) + p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset + p.monotonicFrequency = monotonicParams.Frequency + } + if realtimeOk { + p.realtimeReady = 1 + p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) + p.realtimeBaseRef = int64(realtimeParams.BaseRef) + p.realtimeFrequency = realtimeParams.Frequency + } + + log.Debugf("Updating VDSO parameters: %+v", p) + + return p + }); err != nil { + log.Warningf("Unable to update VDSO parameter page: %v", err) + } + + select { + case <-timer.C: + case <-t.stop: + t.wg.Done() + return + } + } + }() +} + +// stopUpdater stops the update goroutine, blocking until it exits. +// +// mu must be held. +func (t *Timekeeper) stopUpdater() { + if t.stop == nil { + // Updater not running. + return + } + + close(t.stop) + t.wg.Wait() + t.stop = nil +} + +// Destroy destroys the Timekeeper, freeing all associated resources. +func (t *Timekeeper) Destroy() { + t.mu.Lock() + defer t.mu.Unlock() + + t.stopUpdater() +} + +// PauseUpdates stops clock parameter updates. This should only be used when +// Tasks are not running and thus cannot access the clock. +func (t *Timekeeper) PauseUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.stopUpdater() +} + +// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. +func (t *Timekeeper) ResumeUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// GetTime returns the current time in nanoseconds. +func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { + if t.clocks == nil { + panic("Timekeeper used before initialized with SetClocks") + } + now, err := t.clocks.GetTime(c) + if err == nil && c == sentrytime.Monotonic { + now += t.monotonicOffset + } + return now, err +} + +// BootTime returns the system boot real time. +func (t *Timekeeper) BootTime() ktime.Time { + return t.bootTime +} diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go new file mode 100644 index 000000000..aee983ac7 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// beforeSave is invoked by stateify. +func (t *Timekeeper) beforeSave() { + if t.stop != nil { + panic("pauseUpdates must be called before Save") + } + + // N.B. we want the *offset* monotonic time. + var err error + if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { + panic("unable to get current monotonic time: " + err.Error()) + } + + if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { + panic("unable to get current realtime: " + err.Error()) + } +} + +// afterLoad is invoked by stateify. +func (t *Timekeeper) afterLoad() { + t.restored = true +} diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go new file mode 100644 index 000000000..08bacba4f --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -0,0 +1,156 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// mockClocks is a sentrytime.Clocks that simply returns the times in the +// struct. +type mockClocks struct { + monotonic int64 + realtime int64 +} + +// Update implements sentrytime.Clocks.Update. It does nothing. +func (*mockClocks) Update() (monotonicParams sentrytime.Parameters, monotonicOk bool, realtimeParam sentrytime.Parameters, realtimeOk bool) { + return +} + +// Update implements sentrytime.Clocks.GetTime. +func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { + switch id { + case sentrytime.Monotonic: + return c.monotonic, nil + case sentrytime.Realtime: + return c.realtime, nil + default: + return 0, syserror.EINVAL + } +} + +// stateTestClocklessTimekeeper returns a test Timekeeper which has not had +// SetClocks called. 
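Stepping back to startUpdater and stopUpdater in timekeeper.go above: they follow the common ticker plus stop-channel plus WaitGroup shape, where closing stop both wakes and terminates the goroutine and wg.Wait makes shutdown synchronous. A minimal, self-contained sketch of just that shape is below (updater, start, and stopAndWait are illustrative names); the test helper introduced by the comment just above follows it.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// updater mirrors the start/stop discipline of the Timekeeper's update
// goroutine: start is a no-op if already running, stop closes the channel
// and then waits for the goroutine to acknowledge via wg.
type updater struct {
	stop chan struct{}
	wg   sync.WaitGroup
}

func (u *updater) start(interval time.Duration, work func()) {
	if u.stop != nil {
		return // already running
	}
	u.stop = make(chan struct{})
	ticker := time.NewTicker(interval)
	u.wg.Add(1)
	go func() {
		defer u.wg.Done()
		defer ticker.Stop()
		for {
			work() // do an update immediately, then on every tick
			select {
			case <-ticker.C:
			case <-u.stop:
				return
			}
		}
	}()
}

func (u *updater) stopAndWait() {
	if u.stop == nil {
		return
	}
	close(u.stop)
	u.wg.Wait()
	u.stop = nil
}

func main() {
	var u updater
	u.start(10*time.Millisecond, func() { fmt.Println("update") })
	time.Sleep(25 * time.Millisecond)
	u.stopAndWait()
}
```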
+func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { + ctx := contexttest.Context(tb) + p := platform.FromContext(ctx) + fr, err := p.Memory().Allocate(usermem.PageSize, usage.Anonymous) + if err != nil { + tb.Fatalf("failed to allocate memory: %v", err) + } + return &Timekeeper{ + params: NewVDSOParamPage(p, fr), + } +} + +func stateTestTimekeeper(tb testing.TB) *Timekeeper { + t := stateTestClocklessTimekeeper(tb) + t.SetClocks(sentrytime.NewCalibratedClocks()) + return t +} + +// TestTimekeeperMonotonicZero tests that monotonic time starts at zero. +func TestTimekeeperMonotonicZero(t *testing.T) { + c := &mockClocks{ + monotonic: 100000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.SetClocks(c) + defer tk.Destroy() + + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 0 { + t.Errorf("GetTime got %d want 0", now) + } + + c.monotonic += 10 + + now, err = tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 10 { + t.Errorf("GetTime got %d want 10", now) + } +} + +// TestTimekeeperMonotonicJumpForward tests that monotonic time jumps forward +// after restore. +func TestTimekeeperMonotonicForward(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 600000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = true + tk.saveMonotonic = 100000 + tk.saveRealtime = 400000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should jump ahead by 200000 to 300000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 300000 { + t.Errorf("GetTime got %d want 300000", now) + } +} + +// TestTimekeeperMonotonicJumpBackwards tests that monotonic time does not jump +// backwards when realtime goes backwards. +func TestTimekeeperMonotonicJumpBackwards(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 400000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = true + tk.saveMonotonic = 100000 + tk.saveRealtime = 600000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should remain at 100000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees and we don't want to jump the monotonic clock backwards like + // realtime did. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 100000 { + t.Errorf("GetTime got %d want 100000", now) + } +} diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go new file mode 100644 index 000000000..03a3310be --- /dev/null +++ b/pkg/sentry/kernel/timer.go @@ -0,0 +1,282 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
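The three timekeeper tests above pin down the offset rule in SetClocks: monotonic time starts at zero on a fresh boot, jumps forward by the elapsed realtime across a restore, and never moves backwards. The sketch below compresses that rule into one illustrative helper (not part of the package) and re-derives the expected value from TestTimekeeperMonotonicForward; timer.go, whose license header ends just above, then begins.

```go
package example

// monotonicOffset mirrors the computation in SetClocks. nowMono and nowReal
// are readings from the new host's clocks; savedMono and savedReal are the
// values captured at save time (unused on a fresh boot).
func monotonicOffset(restored bool, savedMono, savedReal, nowMono, nowReal int64) int64 {
	want := int64(0) // fresh boots start monotonic time at zero
	if restored {
		want = savedMono
		if elapsed := nowReal - savedReal; elapsed > 0 {
			want += elapsed // realtime moved forward: monotonic jumps too
		}
		// If realtime moved backwards, want stays at savedMono.
	}
	return want - nowMono
}

// With the numbers from TestTimekeeperMonotonicForward: savedMono=100000,
// savedReal=400000, nowMono=900000, nowReal=600000 gives want=300000 and an
// offset of -600000, so GetTime(Monotonic) = 900000 + (-600000) = 300000.
```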
+ +package kernel + +import ( + "fmt" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. + ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. +func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return tg.tm.virtClock +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return tg.tm.profClock +} + +// Now implements ktime.Clock.Now. +func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // The assumption here is that the time spent in this process (not matter + // virtual or prof) should not exceed wall time * active tasks, since + // Task.exitThreadGroup stops accounting as it transitions to + // TaskExitInitiated. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.activeTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. 
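The rounding in tgClock.WallTimeUntil above deserves a concrete number: the remaining CPU time is divided across the runnable tasks, giving a lower bound on the wall time until expiry, and the result is rounded up to the statistics granularity so the timer goroutine is not woken for sub-tick slivers. A standalone sketch with a worked value follows (roundUp is an illustrative name, and the 10ms tick assumes the conventional USER_HZ of 100); taskClock, introduced by the comment just above, comes after it.

```go
package main

import (
	"fmt"
	"time"
)

// roundUp mirrors the rounding in tgClock.WallTimeUntil: divide the remaining
// CPU time across n runnable tasks, then round up to the clock-tick
// granularity so sub-tick wakeups are avoided.
func roundUp(remaining time.Duration, n int64, tick time.Duration) time.Duration {
	per := remaining / time.Duration(n)
	return ((per + tick - time.Nanosecond) / tick) * tick
}

func main() {
	// Assuming the conventional USER_HZ of 100, one clock tick is 10ms.
	const tick = 10 * time.Millisecond
	// 25ms of CPU time left before expiry, spread over 2 running tasks:
	// at least 12.5ms of wall time must pass, which rounds up to 20ms.
	fmt.Println(roundUp(25*time.Millisecond, 2, tick)) // 20ms
}
```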
+type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. +type signalNotifier struct { + tg *ThreadGroup + signal linux.Signal + realTimer bool + includeSys bool +} + +// Notify implements ktime.TimerListener.Notify. +func (s *signalNotifier) Notify(exp uint64) { + // Since all signals sent using a signalNotifier are standard (not + // real-time) signals, we can ignore the number of expirations and send + // only a single signal. + if s.realTimer { + // real timer signal sent to leader. See kernel/time/itimer.c:it_real_fn + s.tg.SendSignal(sigPriv(s.signal)) + } else { + s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys) + } +} + +// Destroy implements ktime.TimerListener.Destroy. +func (s *signalNotifier) Destroy() {} + +// TimerManager is a collection of supported process cpu timers. +type TimerManager struct { + // Clocks used to drive thread group execution time timers. + virtClock *tgClock + profClock *tgClock + + RealTimer *ktime.Timer + VirtualTimer *ktime.Timer + ProfTimer *ktime.Timer + SoftLimitTimer *ktime.Timer + HardLimitTimer *ktime.Timer +} + +// newTimerManager returns a new instance of TimerManager. 
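The two limit timers in TimerManager encode RLIMIT_CPU directly: the soft-limit timer delivers SIGXCPU when CPU time reaches the current limit and then once per CPU-second after it, while the hard-limit timer delivers a single SIGKILL at the maximum, as applyCPULimits further below shows. A compressed sketch of the settings a finite limit would produce is below (setting and cpuLimitSettings are illustrative stand-ins); newTimerManager, documented just above, follows.

```go
package example

import "time"

// setting is a local stand-in for ktime.Setting with only the fields the
// CPU-limit timers use.
type setting struct {
	enabled bool
	next    time.Duration // CPU time at which the timer first fires
	period  time.Duration // 0 means one-shot
}

// cpuLimitSettings mirrors applyCPULimits for a finite RLIMIT_CPU.
func cpuLimitSettings(cur, max time.Duration) (soft, hard setting) {
	soft = setting{enabled: true, next: cur, period: time.Second} // SIGXCPU, repeating
	hard = setting{enabled: true, next: max}                      // SIGKILL, one-shot
	return
}

// cpuLimitSettings(2*time.Second, 5*time.Second) yields SIGXCPU at 2s of CPU
// time and every CPU-second after that, and SIGKILL at 5s.
```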
+func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager { + virtClock := &tgClock{tg: tg, includeSys: false} + profClock := &tgClock{tg: tg, includeSys: true} + tm := TimerManager{ + virtClock: virtClock, + profClock: profClock, + RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{ + tg: tg, + signal: linux.SIGALRM, + realTimer: true, + includeSys: false, + }), + VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{ + tg: tg, + signal: linux.SIGVTALRM, + realTimer: false, + includeSys: false, + }), + ProfTimer: ktime.NewTimer(profClock, &signalNotifier{ + tg: tg, + signal: linux.SIGPROF, + realTimer: false, + includeSys: true, + }), + SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ + tg: tg, + signal: linux.SIGXCPU, + realTimer: false, + includeSys: true, + }), + HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ + tg: tg, + signal: linux.SIGKILL, + realTimer: false, + includeSys: true, + }), + } + tm.applyCPULimits(tg.Limits().Get(limits.CPU)) + return tm +} + +// Save saves this TimerManger. + +// destroy destroys all timers. +func (tm *TimerManager) destroy() { + tm.RealTimer.Destroy() + tm.VirtualTimer.Destroy() + tm.ProfTimer.Destroy() + tm.SoftLimitTimer.Destroy() + tm.HardLimitTimer.Destroy() +} + +func (tm *TimerManager) applyCPULimits(l limits.Limit) { + tm.SoftLimitTimer.Swap(ktime.Setting{ + Enabled: l.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + }) + tm.HardLimitTimer.Swap(ktime.Setting{ + Enabled: l.Max != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()), + }) +} + +// kick is called when the number of threads in the thread group associated +// with tm increases. +func (tm *TimerManager) kick() { + tm.virtClock.Notify(ktime.ClockEventRateIncrease) + tm.profClock.Notify(ktime.ClockEventRateIncrease) +} + +// pause is to pause the timers and stop timer signal delivery. +func (tm *TimerManager) pause() { + tm.RealTimer.Pause() + tm.VirtualTimer.Pause() + tm.ProfTimer.Pause() + tm.SoftLimitTimer.Pause() + tm.HardLimitTimer.Pause() +} + +// resume is to resume the timers and continue timer signal delivery. +func (tm *TimerManager) resume() { + tm.RealTimer.Resume() + tm.VirtualTimer.Resume() + tm.ProfTimer.Resume() + tm.SoftLimitTimer.Resume() + tm.HardLimitTimer.Resume() +} diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go new file mode 100644 index 000000000..58e9b4d1b --- /dev/null +++ b/pkg/sentry/kernel/uts_namespace.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// UTSNamespace represents a UTS namespace, a holder of two system identifiers: +// the hostname and domain name. +type UTSNamespace struct { + // mu protects all fields below. 
+ mu sync.Mutex `state:"nosave"` + hostName string + domainName string + + // userns is the user namespace associated with the UTSNamespace. + // Privileged operations on this UTSNamespace must have appropriate + // capabilities in userns. + // + // userns is immutable. + userns *auth.UserNamespace +} + +// NewUTSNamespace creates a new UTS namespace. +func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { + return &UTSNamespace{ + hostName: hostName, + domainName: domainName, + userns: userns, + } +} + +// UTSNamespace returns the task's UTS namespace. +func (t *Task) UTSNamespace() *UTSNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.utsns +} + +// HostName returns the host name of this UTS namespace. +func (u *UTSNamespace) HostName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.hostName +} + +// SetHostName sets the host name of this UTS namespace. +func (u *UTSNamespace) SetHostName(host string) { + u.mu.Lock() + defer u.mu.Unlock() + u.hostName = host +} + +// DomainName returns the domain name of this UTS namespace. +func (u *UTSNamespace) DomainName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.domainName +} + +// SetDomainName sets the domain name of this UTS namespace. +func (u *UTSNamespace) SetDomainName(domain string) { + u.mu.Lock() + defer u.mu.Unlock() + u.domainName = domain +} + +// UserNamespace returns the user namespace associated with this UTS namespace. +func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return u.userns +} + +// Clone makes a copy of this UTS namespace, associating the given user +// namespace. +func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return &UTSNamespace{ + hostName: u.hostName, + domainName: u.domainName, + userns: userns, + } +} diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go new file mode 100644 index 000000000..0bacbea49 --- /dev/null +++ b/pkg/sentry/kernel/vdso.go @@ -0,0 +1,145 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// vdsoParams are the parameters exposed to the VDSO. +// +// They are exposed to the VDSO via a parameter page managed by VDSOParamPage, +// which also includes a sequence counter. +type vdsoParams struct { + monotonicReady uint64 + monotonicBaseCycles int64 + monotonicBaseRef int64 + monotonicFrequency uint64 + + realtimeReady uint64 + realtimeBaseCycles int64 + realtimeBaseRef int64 + realtimeFrequency uint64 +} + +// VDSOParamPage manages a VDSO parameter page. +// +// Its memory layout looks like: +// +// type page struct { +// // seq is a sequence counter that protects the fields below. 
+// seq uint64 +// vdsoParams +// } +// +// Everything in the struct is 8 bytes for easy alignment. +// +// It must be kept in sync with params in vdso/vdso_time.cc. +type VDSOParamPage struct { + // The parameter page is fr, allocated from platform.Memory(). + platform platform.Platform + fr platform.FileRange + + // seq is the current sequence count written to the page. + // + // A write is in progress if bit 1 of the counter is set. + // + // Timekeeper's updater goroutine may call Write before equality is + // checked in state_test_util tests, causing this field to change across + // save / restore. + seq uint64 +} + +// NewVDSOParamPage returns a VDSOParamPage. +// +// Preconditions: +// +// * fr is a single page allocated from platform.Memory(). VDSOParamPage does +// not take ownership of fr; it must remain allocated for the lifetime of the +// VDSOParamPage. +// +// * VDSOParamPage must be the only writer to fr. +// +// * platform.Memory().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{platform: platform, fr: fr} +} + +// access returns a mapping of the param page. +func (v *VDSOParamPage) access() (safemem.Block, error) { + bs, err := v.platform.Memory().MapInternal(v.fr, usermem.ReadWrite) + if err != nil { + return safemem.Block{}, err + } + if bs.NumBlocks() != 1 { + panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) + } + return bs.Head(), nil +} + +// incrementSeq increments the sequence counter in the param page. +func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { + next := v.seq + 1 + old, err := safemem.SwapUint64(paramPage, next) + if err != nil { + return err + } + + if old != v.seq { + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + } + + v.seq = next + return nil +} + +// Write updates the VDSO parameters. +// +// Write starts a write block, calls f to get the new parameters, writes +// out the new parameters, then ends the write block. +func (v *VDSOParamPage) Write(f func() vdsoParams) error { + paramPage, err := v.access() + if err != nil { + return err + } + + // Write begin. + next := v.seq + 1 + if next%2 != 1 { + panic("Out-of-order sequence count") + } + + err = v.incrementSeq(paramPage) + if err != nil { + return err + } + + // Get the new params. + p := f() + buf := binary.Marshal(nil, usermem.ByteOrder, p) + + // Skip the sequence counter. + if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { + panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) + } + + // Write end. + return v.incrementSeq(paramPage) +} diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go new file mode 100644 index 000000000..a9e84673f --- /dev/null +++ b/pkg/sentry/kernel/version.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Version defines the application-visible system version.
+type Version struct {
+	// Operating system name (e.g. "Linux").
+	Sysname string
+
+	// Operating system release (e.g. "3.11.10-amd64").
+	Release string
+
+	// Operating system version. On Linux this takes the shape
+	// "#VERSION CONFIG_FLAGS TIMESTAMP"
+	// where:
+	// - VERSION is a sequence counter incremented on every successful build
+	// - CONFIG_FLAGS is a space-separated list of major enabled kernel features
+	//   (e.g. "SMP" and "PREEMPT")
+	// - TIMESTAMP is the build timestamp as returned by `date`
+	Version string
+}
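Returning to the VDSOParamPage writer protocol above: the odd/even sequence counter is one half of a classic seqlock, and the VDSO-side reader (in vdso/vdso_time.cc) pairs with it by rejecting odd counts and retrying if the count changes across its read of the parameters. The following self-contained Go sketch pairs both halves, with plain atomics standing in for the shared page; it is only an illustration of the protocol, not the actual reader.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// page stands in for the shared parameter page: a sequence counter plus the
// protected values (here just one).
type page struct {
	seq   atomic.Uint64
	value atomic.Int64
}

// write mirrors VDSOParamPage.Write: bump seq to odd, publish, bump to even.
func (p *page) write(v int64) {
	p.seq.Add(1) // now odd: a write is in progress
	p.value.Store(v)
	p.seq.Add(1) // now even: the write is complete
}

// read mirrors what a seqlock reader must do: snapshot seq, reject odd
// values, and retry if seq changed while the values were being read.
func (p *page) read() int64 {
	for {
		before := p.seq.Load()
		if before%2 != 0 {
			continue // writer in progress
		}
		v := p.value.Load()
		if p.seq.Load() == before {
			return v
		}
	}
}

func main() {
	var p page
	p.write(42)
	fmt.Println(p.read()) // 42
}
```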