author     gVisor bot <gvisor-bot@google.com>  2019-06-02 06:44:55 +0000
committer  gVisor bot <gvisor-bot@google.com>  2019-06-02 06:44:55 +0000
commit     ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree       83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/kernel
parent     deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent     216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)

Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--  pkg/sentry/kernel/abstract_socket_namespace.go  111
-rw-r--r--  pkg/sentry/kernel/auth/auth.go  22
-rwxr-xr-x  pkg/sentry/kernel/auth/auth_state_autogen.go  151
-rw-r--r--  pkg/sentry/kernel/auth/capability_set.go  61
-rw-r--r--  pkg/sentry/kernel/auth/context.go  36
-rw-r--r--  pkg/sentry/kernel/auth/credentials.go  234
-rw-r--r--  pkg/sentry/kernel/auth/id.go  121
-rw-r--r--  pkg/sentry/kernel/auth/id_map.go  285
-rw-r--r--  pkg/sentry/kernel/auth/id_map_functions.go  45
-rwxr-xr-x  pkg/sentry/kernel/auth/id_map_range.go  62
-rwxr-xr-x  pkg/sentry/kernel/auth/id_map_set.go  1270
-rw-r--r--  pkg/sentry/kernel/auth/user_namespace.go  129
-rw-r--r--  pkg/sentry/kernel/context.go  135
-rw-r--r--  pkg/sentry/kernel/epoll/epoll.go  473
-rwxr-xr-x  pkg/sentry/kernel/epoll/epoll_list.go  173
-rw-r--r--  pkg/sentry/kernel/epoll/epoll_state.go  49
-rwxr-xr-x  pkg/sentry/kernel/epoll/epoll_state_autogen.go  99
-rw-r--r--  pkg/sentry/kernel/eventfd/eventfd.go  283
-rwxr-xr-x  pkg/sentry/kernel/eventfd/eventfd_state_autogen.go  27
-rw-r--r--  pkg/sentry/kernel/fasync/fasync.go  148
-rwxr-xr-x  pkg/sentry/kernel/fasync/fasync_state_autogen.go  30
-rw-r--r--  pkg/sentry/kernel/fd_map.go  364
-rw-r--r--  pkg/sentry/kernel/fs_context.go  187
-rwxr-xr-x  pkg/sentry/kernel/futex/atomicptr_bucket.go  27
-rw-r--r--  pkg/sentry/kernel/futex/futex.go  783
-rwxr-xr-x  pkg/sentry/kernel/futex/futex_state_autogen.go  62
-rwxr-xr-x  pkg/sentry/kernel/futex/waiter_list.go  173
-rw-r--r--  pkg/sentry/kernel/ipc_namespace.go  58
-rw-r--r--  pkg/sentry/kernel/kdefs/kdefs.go  20
-rwxr-xr-x  pkg/sentry/kernel/kdefs/kdefs_state_autogen.go  4
-rw-r--r--  pkg/sentry/kernel/kernel.go  1241
-rw-r--r--  pkg/sentry/kernel/kernel_state.go  42
-rwxr-xr-x  pkg/sentry/kernel/kernel_state_autogen.go  1147
-rw-r--r--  pkg/sentry/kernel/pending_signals.go  142
-rwxr-xr-x  pkg/sentry/kernel/pending_signals_list.go  173
-rw-r--r--  pkg/sentry/kernel/pending_signals_state.go  46
-rw-r--r--  pkg/sentry/kernel/pipe/buffer.go  90
-rwxr-xr-x  pkg/sentry/kernel/pipe/buffer_list.go  173
-rw-r--r--  pkg/sentry/kernel/pipe/device.go  20
-rw-r--r--  pkg/sentry/kernel/pipe/node.go  196
-rw-r--r--  pkg/sentry/kernel/pipe/pipe.go  429
-rwxr-xr-x  pkg/sentry/kernel/pipe/pipe_state_autogen.go  134
-rw-r--r--  pkg/sentry/kernel/pipe/reader.go  42
-rw-r--r--  pkg/sentry/kernel/pipe/reader_writer.go  96
-rw-r--r--  pkg/sentry/kernel/pipe/writer.go  42
-rw-r--r--  pkg/sentry/kernel/posixtimer.go  306
-rwxr-xr-x  pkg/sentry/kernel/process_group_list.go  173
-rw-r--r--  pkg/sentry/kernel/ptrace.go  1105
-rw-r--r--  pkg/sentry/kernel/ptrace_amd64.go  89
-rw-r--r--  pkg/sentry/kernel/ptrace_arm64.go  28
-rw-r--r--  pkg/sentry/kernel/rseq.go  120
-rw-r--r--  pkg/sentry/kernel/sched/cpuset.go  105
-rw-r--r--  pkg/sentry/kernel/sched/sched.go  16
-rwxr-xr-x  pkg/sentry/kernel/sched/sched_state_autogen.go  4
-rw-r--r--  pkg/sentry/kernel/seccomp.go  217
-rw-r--r--  pkg/sentry/kernel/semaphore/semaphore.go  571
-rwxr-xr-x  pkg/sentry/kernel/semaphore/semaphore_state_autogen.go  115
-rwxr-xr-x  pkg/sentry/kernel/semaphore/waiter_list.go  173
-rwxr-xr-x  pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go  55
-rwxr-xr-x  pkg/sentry/kernel/session_list.go  173
-rw-r--r--  pkg/sentry/kernel/sessions.go  508
-rw-r--r--  pkg/sentry/kernel/shm/device.go  20
-rw-r--r--  pkg/sentry/kernel/shm/shm.go  671
-rwxr-xr-x  pkg/sentry/kernel/shm/shm_state_autogen.go  74
-rw-r--r--  pkg/sentry/kernel/signal.go  76
-rw-r--r--  pkg/sentry/kernel/signal_handlers.go  89
-rw-r--r--  pkg/sentry/kernel/syscalls.go  307
-rw-r--r--  pkg/sentry/kernel/syscalls_state.go  29
-rw-r--r--  pkg/sentry/kernel/syslog.go  106
-rw-r--r--  pkg/sentry/kernel/task.go  723
-rw-r--r--  pkg/sentry/kernel/task_acct.go  196
-rw-r--r--  pkg/sentry/kernel/task_block.go  212
-rw-r--r--  pkg/sentry/kernel/task_clone.go  516
-rw-r--r--  pkg/sentry/kernel/task_context.go  174
-rw-r--r--  pkg/sentry/kernel/task_exec.go  262
-rw-r--r--  pkg/sentry/kernel/task_exit.go  1159
-rw-r--r--  pkg/sentry/kernel/task_futex.go  54
-rw-r--r--  pkg/sentry/kernel/task_identity.go  568
-rwxr-xr-x  pkg/sentry/kernel/task_list.go  173
-rw-r--r--  pkg/sentry/kernel/task_log.go  137
-rw-r--r--  pkg/sentry/kernel/task_net.go  35
-rw-r--r--  pkg/sentry/kernel/task_run.go  340
-rw-r--r--  pkg/sentry/kernel/task_sched.go  637
-rw-r--r--  pkg/sentry/kernel/task_signals.go  1110
-rw-r--r--  pkg/sentry/kernel/task_start.go  287
-rw-r--r--  pkg/sentry/kernel/task_stop.go  226
-rw-r--r--  pkg/sentry/kernel/task_syscall.go  447
-rw-r--r--  pkg/sentry/kernel/task_usermem.go  301
-rw-r--r--  pkg/sentry/kernel/thread_group.go  330
-rw-r--r--  pkg/sentry/kernel/threads.go  465
-rw-r--r--  pkg/sentry/kernel/time/context.go  44
-rw-r--r--  pkg/sentry/kernel/time/time.go  691
-rwxr-xr-x  pkg/sentry/kernel/time/time_state_autogen.go  56
-rw-r--r--  pkg/sentry/kernel/timekeeper.go  306
-rw-r--r--  pkg/sentry/kernel/timekeeper_state.go  41
-rwxr-xr-x  pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go  119
-rw-r--r--  pkg/sentry/kernel/uts_namespace.go  102
-rw-r--r--  pkg/sentry/kernel/vdso.go  148
-rw-r--r--  pkg/sentry/kernel/version.go  33
99 files changed, 24657 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
new file mode 100644
index 000000000..5ce52e66c
--- /dev/null
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+// +stateify savable
+type abstractEndpoint struct {
+ ep transport.BoundEndpoint
+ wr *refs.WeakRef
+ name string
+ ns *AbstractSocketNamespace
+}
+
+// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
+func (e *abstractEndpoint) WeakRefGone() {
+ e.ns.mu.Lock()
+ if e.ns.endpoints[e.name].ep == e.ep {
+ delete(e.ns.endpoints, e.name)
+ }
+ e.ns.mu.Unlock()
+}
+
+// AbstractSocketNamespace is used to implement the Linux abstract socket functionality.
+//
+// +stateify savable
+type AbstractSocketNamespace struct {
+ mu sync.Mutex `state:"nosave"`
+
+ // endpoints maps abstract socket names to their bound endpoints.
+ endpoints map[string]abstractEndpoint
+}
+
+// NewAbstractSocketNamespace returns a new AbstractSocketNamespace.
+func NewAbstractSocketNamespace() *AbstractSocketNamespace {
+ return &AbstractSocketNamespace{
+ endpoints: make(map[string]abstractEndpoint),
+ }
+}
+
+// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on
+// its backing object.
+type boundEndpoint struct {
+ transport.BoundEndpoint
+ rc refs.RefCounter
+}
+
+// Release implements transport.BoundEndpoint.Release.
+func (e *boundEndpoint) Release() {
+ e.rc.DecRef()
+ e.BoundEndpoint.Release()
+}
+
+// BoundEndpoint retrieves the endpoint bound to the given name. The return
+// value is nil if no endpoint was bound.
+func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndpoint {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ ep, ok := a.endpoints[name]
+ if !ok {
+ return nil
+ }
+
+ rc := ep.wr.Get()
+ if rc == nil {
+ delete(a.endpoints, name)
+ return nil
+ }
+
+ return &boundEndpoint{ep.ep, rc}
+}
+
+// Bind binds the given socket.
+//
+// When the last reference managed by rc is dropped, ep may be removed from the
+// namespace.
+func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ if ep, ok := a.endpoints[name]; ok {
+ if rc := ep.wr.Get(); rc != nil {
+ rc.DecRef()
+ return syscall.EADDRINUSE
+ }
+ }
+
+ ae := abstractEndpoint{ep: ep, name: name, ns: a}
+ ae.wr = refs.NewWeakRef(rc, &ae)
+ a.endpoints[name] = ae
+ return nil
+}
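
The lifecycle implemented above (bind a name once, look it up later, evict the entry when the backing object's last reference drops) can be illustrated with a minimal, self-contained sketch. The toy types below are stand-ins for illustration only, not the gVisor transport.BoundEndpoint/refs.WeakRef machinery:

package main

import (
	"errors"
	"fmt"
	"sync"
)

// toyNamespace mirrors the Bind/BoundEndpoint contract with a plain map.
type endpoint struct{ id int }

type toyNamespace struct {
	mu        sync.Mutex
	endpoints map[string]*endpoint
}

// Bind fails if the name is already taken, just as Bind above returns
// EADDRINUSE while the existing endpoint is still referenced.
func (n *toyNamespace) Bind(name string, ep *endpoint) error {
	n.mu.Lock()
	defer n.mu.Unlock()
	if _, ok := n.endpoints[name]; ok {
		return errors.New("address already in use")
	}
	n.endpoints[name] = ep
	return nil
}

// BoundEndpoint returns nil if nothing is bound to name.
func (n *toyNamespace) BoundEndpoint(name string) *endpoint {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.endpoints[name]
}

func main() {
	ns := &toyNamespace{endpoints: make(map[string]*endpoint)}
	fmt.Println(ns.Bind("\x00demo", &endpoint{id: 1})) // <nil>
	fmt.Println(ns.Bind("\x00demo", &endpoint{id: 2})) // address already in use
	fmt.Println(ns.BoundEndpoint("\x00other"))         // <nil>
}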
diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go
new file mode 100644
index 000000000..847d121aa
--- /dev/null
+++ b/pkg/sentry/kernel/auth/auth.go
@@ -0,0 +1,22 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package auth implements an access control model that is a subset of Linux's.
+//
+// The auth package supports two kinds of access controls: user/group IDs and
+// capabilities. Each resource in the security model is associated with a user
+// namespace; "privileged" operations check that the operator's credentials
+// have the required user/group IDs or capabilities within the user namespace
+// of accessed resources.
+package auth
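
As a concrete illustration of the model described in the package comment, the sketch below contrasts root credentials, which hold every capability in their namespace, with anonymous credentials, which hold none. It assumes the gVisor module from this tree is importable at the paths used in these files:

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
)

func main() {
	// Root credentials in a fresh root user namespace carry all capabilities.
	root := auth.NewRootCredentials(auth.NewRootUserNamespace())
	fmt.Println(root.HasCapability(linux.CAP_SETUID)) // true

	// Anonymous credentials have no capabilities in any namespace.
	anon := auth.NewAnonymousCredentials()
	fmt.Println(anon.HasCapability(linux.CAP_SETUID)) // false
}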
diff --git a/pkg/sentry/kernel/auth/auth_state_autogen.go b/pkg/sentry/kernel/auth/auth_state_autogen.go
new file mode 100755
index 000000000..6f80381c6
--- /dev/null
+++ b/pkg/sentry/kernel/auth/auth_state_autogen.go
@@ -0,0 +1,151 @@
+// automatically generated by stateify.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *Credentials) beforeSave() {}
+func (x *Credentials) save(m state.Map) {
+ x.beforeSave()
+ m.Save("RealKUID", &x.RealKUID)
+ m.Save("EffectiveKUID", &x.EffectiveKUID)
+ m.Save("SavedKUID", &x.SavedKUID)
+ m.Save("RealKGID", &x.RealKGID)
+ m.Save("EffectiveKGID", &x.EffectiveKGID)
+ m.Save("SavedKGID", &x.SavedKGID)
+ m.Save("ExtraKGIDs", &x.ExtraKGIDs)
+ m.Save("PermittedCaps", &x.PermittedCaps)
+ m.Save("InheritableCaps", &x.InheritableCaps)
+ m.Save("EffectiveCaps", &x.EffectiveCaps)
+ m.Save("BoundingCaps", &x.BoundingCaps)
+ m.Save("KeepCaps", &x.KeepCaps)
+ m.Save("UserNamespace", &x.UserNamespace)
+}
+
+func (x *Credentials) afterLoad() {}
+func (x *Credentials) load(m state.Map) {
+ m.Load("RealKUID", &x.RealKUID)
+ m.Load("EffectiveKUID", &x.EffectiveKUID)
+ m.Load("SavedKUID", &x.SavedKUID)
+ m.Load("RealKGID", &x.RealKGID)
+ m.Load("EffectiveKGID", &x.EffectiveKGID)
+ m.Load("SavedKGID", &x.SavedKGID)
+ m.Load("ExtraKGIDs", &x.ExtraKGIDs)
+ m.Load("PermittedCaps", &x.PermittedCaps)
+ m.Load("InheritableCaps", &x.InheritableCaps)
+ m.Load("EffectiveCaps", &x.EffectiveCaps)
+ m.Load("BoundingCaps", &x.BoundingCaps)
+ m.Load("KeepCaps", &x.KeepCaps)
+ m.Load("UserNamespace", &x.UserNamespace)
+}
+
+func (x *IDMapEntry) beforeSave() {}
+func (x *IDMapEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("FirstID", &x.FirstID)
+ m.Save("FirstParentID", &x.FirstParentID)
+ m.Save("Length", &x.Length)
+}
+
+func (x *IDMapEntry) afterLoad() {}
+func (x *IDMapEntry) load(m state.Map) {
+ m.Load("FirstID", &x.FirstID)
+ m.Load("FirstParentID", &x.FirstParentID)
+ m.Load("Length", &x.Length)
+}
+
+func (x *idMapRange) beforeSave() {}
+func (x *idMapRange) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Start", &x.Start)
+ m.Save("End", &x.End)
+}
+
+func (x *idMapRange) afterLoad() {}
+func (x *idMapRange) load(m state.Map) {
+ m.Load("Start", &x.Start)
+ m.Load("End", &x.End)
+}
+
+func (x *idMapSet) beforeSave() {}
+func (x *idMapSet) save(m state.Map) {
+ x.beforeSave()
+ var root *idMapSegmentDataSlices = x.saveRoot()
+ m.SaveValue("root", root)
+}
+
+func (x *idMapSet) afterLoad() {}
+func (x *idMapSet) load(m state.Map) {
+ m.LoadValue("root", new(*idMapSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*idMapSegmentDataSlices)) })
+}
+
+func (x *idMapnode) beforeSave() {}
+func (x *idMapnode) save(m state.Map) {
+ x.beforeSave()
+ m.Save("nrSegments", &x.nrSegments)
+ m.Save("parent", &x.parent)
+ m.Save("parentIndex", &x.parentIndex)
+ m.Save("hasChildren", &x.hasChildren)
+ m.Save("keys", &x.keys)
+ m.Save("values", &x.values)
+ m.Save("children", &x.children)
+}
+
+func (x *idMapnode) afterLoad() {}
+func (x *idMapnode) load(m state.Map) {
+ m.Load("nrSegments", &x.nrSegments)
+ m.Load("parent", &x.parent)
+ m.Load("parentIndex", &x.parentIndex)
+ m.Load("hasChildren", &x.hasChildren)
+ m.Load("keys", &x.keys)
+ m.Load("values", &x.values)
+ m.Load("children", &x.children)
+}
+
+func (x *idMapSegmentDataSlices) beforeSave() {}
+func (x *idMapSegmentDataSlices) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Start", &x.Start)
+ m.Save("End", &x.End)
+ m.Save("Values", &x.Values)
+}
+
+func (x *idMapSegmentDataSlices) afterLoad() {}
+func (x *idMapSegmentDataSlices) load(m state.Map) {
+ m.Load("Start", &x.Start)
+ m.Load("End", &x.End)
+ m.Load("Values", &x.Values)
+}
+
+func (x *UserNamespace) beforeSave() {}
+func (x *UserNamespace) save(m state.Map) {
+ x.beforeSave()
+ m.Save("parent", &x.parent)
+ m.Save("owner", &x.owner)
+ m.Save("uidMapFromParent", &x.uidMapFromParent)
+ m.Save("uidMapToParent", &x.uidMapToParent)
+ m.Save("gidMapFromParent", &x.gidMapFromParent)
+ m.Save("gidMapToParent", &x.gidMapToParent)
+}
+
+func (x *UserNamespace) afterLoad() {}
+func (x *UserNamespace) load(m state.Map) {
+ m.Load("parent", &x.parent)
+ m.Load("owner", &x.owner)
+ m.Load("uidMapFromParent", &x.uidMapFromParent)
+ m.Load("uidMapToParent", &x.uidMapToParent)
+ m.Load("gidMapFromParent", &x.gidMapFromParent)
+ m.Load("gidMapToParent", &x.gidMapToParent)
+}
+
+func init() {
+ state.Register("auth.Credentials", (*Credentials)(nil), state.Fns{Save: (*Credentials).save, Load: (*Credentials).load})
+ state.Register("auth.IDMapEntry", (*IDMapEntry)(nil), state.Fns{Save: (*IDMapEntry).save, Load: (*IDMapEntry).load})
+ state.Register("auth.idMapRange", (*idMapRange)(nil), state.Fns{Save: (*idMapRange).save, Load: (*idMapRange).load})
+ state.Register("auth.idMapSet", (*idMapSet)(nil), state.Fns{Save: (*idMapSet).save, Load: (*idMapSet).load})
+ state.Register("auth.idMapnode", (*idMapnode)(nil), state.Fns{Save: (*idMapnode).save, Load: (*idMapnode).load})
+ state.Register("auth.idMapSegmentDataSlices", (*idMapSegmentDataSlices)(nil), state.Fns{Save: (*idMapSegmentDataSlices).save, Load: (*idMapSegmentDataSlices).load})
+ state.Register("auth.UserNamespace", (*UserNamespace)(nil), state.Fns{Save: (*UserNamespace).save, Load: (*UserNamespace).load})
+}
diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go
new file mode 100644
index 000000000..7a0c967cd
--- /dev/null
+++ b/pkg/sentry/kernel/auth/capability_set.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+)
+
+// A CapabilitySet is a set of capabilities implemented as a bitset. The zero
+// value of CapabilitySet is a set containing no capabilities.
+type CapabilitySet uint64
+
+// AllCapabilities is a CapabilitySet containing all valid capabilities.
+var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1
+
+// CapabilitySetOf returns a CapabilitySet containing only the given
+// capability.
+func CapabilitySetOf(cp linux.Capability) CapabilitySet {
+ return CapabilitySet(bits.MaskOf64(int(cp)))
+}
+
+// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities.
+func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet {
+ var cs uint64
+ for _, cp := range cps {
+ cs |= bits.MaskOf64(int(cp))
+ }
+ return CapabilitySet(cs)
+}
+
+// TaskCapabilities represents all the capability sets for a task. Each of these
+// sets is explained in greater detail in capabilities(7).
+type TaskCapabilities struct {
+ // Permitted is a limiting superset for the effective capabilities that
+ // the thread may assume.
+ PermittedCaps CapabilitySet
+ // Inheritable is a set of capabilities preserved across an execve(2).
+ InheritableCaps CapabilitySet
+ // Effective is the set of capabilities used by the kernel to perform
+ // permission checks for the thread.
+ EffectiveCaps CapabilitySet
+ // Bounding is a limiting superset for the capabilities that a thread
+ // can add to its inheritable set using capset(2).
+ BoundingCaps CapabilitySet
+ // Ambient is a set of capabilities that are preserved across an
+ // execve(2) of a program that is not privileged.
+ AmbientCaps CapabilitySet
+}
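
A short sketch of how the bitset is meant to be queried, mirroring the CapabilitySetOf(cp)&caps != 0 test used by the credential checks later in this change (same import-path assumption as above):

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
)

func main() {
	caps := auth.CapabilitySetOfMany([]linux.Capability{linux.CAP_SETUID, linux.CAP_SETGID})

	// Membership is a bitwise AND against the single-capability mask.
	fmt.Println(auth.CapabilitySetOf(linux.CAP_SETUID)&caps != 0) // true

	// The zero value is the empty set; AllCapabilities is the full mask.
	var none auth.CapabilitySet
	fmt.Println(auth.CapabilitySetOf(linux.CAP_SETUID)&none != 0)                 // false
	fmt.Println(auth.AllCapabilities&auth.CapabilitySetOf(linux.CAP_SETGID) != 0) // true
}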
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
new file mode 100644
index 000000000..16d110610
--- /dev/null
+++ b/pkg/sentry/kernel/auth/context.go
@@ -0,0 +1,36 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the auth package's type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxCredentials is a Context.Value key for Credentials.
+ CtxCredentials contextID = iota
+)
+
+// CredentialsFromContext returns a copy of the Credentials used by ctx, or a
+// set of Credentials with no capabilities if ctx does not have Credentials.
+func CredentialsFromContext(ctx context.Context) *Credentials {
+ if v := ctx.Value(CtxCredentials); v != nil {
+ return v.(*Credentials)
+ }
+ return NewAnonymousCredentials()
+}
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
new file mode 100644
index 000000000..1511a0324
--- /dev/null
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Credentials contains information required to authorize privileged operations
+// in a user namespace.
+//
+// +stateify savable
+type Credentials struct {
+ // Real/effective/saved user/group IDs in the root user namespace. None of
+ // these should ever be NoID.
+ RealKUID KUID
+ EffectiveKUID KUID
+ SavedKUID KUID
+ RealKGID KGID
+ EffectiveKGID KGID
+ SavedKGID KGID
+
+ // Filesystem user/group IDs are not implemented. "... setfsuid() is
+ // nowadays unneeded and should be avoided in new applications (likewise
+ // for setfsgid(2))." - setfsuid(2)
+
+ // Supplementary groups used by set/getgroups.
+ //
+ // ExtraKGIDs slices are immutable, allowing multiple Credentials with the
+ // same ExtraKGIDs to share the same slice.
+ ExtraKGIDs []KGID
+
+ // The capability sets applicable to this set of credentials.
+ PermittedCaps CapabilitySet
+ InheritableCaps CapabilitySet
+ EffectiveCaps CapabilitySet
+ BoundingCaps CapabilitySet
+ // Ambient capabilities were not introduced until Linux 4.3.
+
+ // KeepCaps is the flag for PR_SET_KEEPCAPS, which allows capabilities to be
+ // maintained after a switch from the root user to a non-root user via setuid().
+ KeepCaps bool
+
+ // The user namespace associated with the owner of the credentials.
+ UserNamespace *UserNamespace
+}
+
+// NewAnonymousCredentials returns a set of credentials with no capabilities in
+// any user namespace.
+func NewAnonymousCredentials() *Credentials {
+ // Create a new root user namespace. Since the new namespace's owner is
+ // KUID 0 and the returned credentials have non-zero KUID/KGID, the
+ // returned credentials do not have any capabilities in the new namespace.
+ // Since the new namespace is not part of any existing user namespace
+ // hierarchy, the returned credentials do not have any capabilities in any
+ // other namespace.
+ return &Credentials{
+ RealKUID: NobodyKUID,
+ EffectiveKUID: NobodyKUID,
+ SavedKUID: NobodyKUID,
+ RealKGID: NobodyKGID,
+ EffectiveKGID: NobodyKGID,
+ SavedKGID: NobodyKGID,
+ UserNamespace: NewRootUserNamespace(),
+ }
+}
+
+// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e.
+// global root) in user namespace ns.
+func NewRootCredentials(ns *UserNamespace) *Credentials {
+ // I can't find documentation for this anywhere, but it's correct for the
+ // inheritable capability set to be initially empty (the capabilities test
+ // checks for this property).
+ return &Credentials{
+ RealKUID: RootKUID,
+ EffectiveKUID: RootKUID,
+ SavedKUID: RootKUID,
+ RealKGID: RootKGID,
+ EffectiveKGID: RootKGID,
+ SavedKGID: RootKGID,
+ PermittedCaps: AllCapabilities,
+ EffectiveCaps: AllCapabilities,
+ BoundingCaps: AllCapabilities,
+ UserNamespace: ns,
+ }
+}
+
+// NewUserCredentials returns a set of credentials based on the given UID, GIDs,
+// and capabilities in a given namespace. If all arguments are their zero
+// values, this returns the same credentials as NewRootCredentials.
+func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials {
+ creds := NewRootCredentials(ns)
+
+ // Set the UID.
+ uid := kuid
+ creds.RealKUID = uid
+ creds.EffectiveKUID = uid
+ creds.SavedKUID = uid
+
+ // Set GID.
+ gid := kgid
+ creds.RealKGID = gid
+ creds.EffectiveKGID = gid
+ creds.SavedKGID = gid
+
+ // Set additional GIDs.
+ creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...)
+
+ // Set capabilities.
+ if capabilities != nil {
+ creds.PermittedCaps = capabilities.PermittedCaps
+ creds.EffectiveCaps = capabilities.EffectiveCaps
+ creds.BoundingCaps = capabilities.BoundingCaps
+ creds.InheritableCaps = capabilities.InheritableCaps
+ // TODO(nlacasse): Support ambient capabilities.
+ } else {
+ // If no capabilities are specified, grant capabilities consistent with
+ // setresuid + setresgid from NewRootCredentials to the given uid and
+ // gid.
+ if kuid == RootKUID {
+ creds.PermittedCaps = AllCapabilities
+ creds.EffectiveCaps = AllCapabilities
+ } else {
+ creds.PermittedCaps = 0
+ creds.EffectiveCaps = 0
+ }
+ creds.BoundingCaps = AllCapabilities
+ }
+
+ return creds
+}
+
+// Fork generates an identical copy of a set of credentials.
+func (c *Credentials) Fork() *Credentials {
+ nc := new(Credentials)
+ *nc = *c // Copy-by-value; this is legal for all fields.
+ return nc
+}
+
+// InGroup returns true if c is in group kgid. Compare Linux's
+// kernel/groups.c:in_group_p().
+func (c *Credentials) InGroup(kgid KGID) bool {
+ if c.EffectiveKGID == kgid {
+ return true
+ }
+ for _, extraKGID := range c.ExtraKGIDs {
+ if extraKGID == kgid {
+ return true
+ }
+ }
+ return false
+}
+
+// HasCapabilityIn returns true if c has capability cp in ns.
+func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool {
+ for {
+ // "1. A process has a capability inside a user namespace if it is a member
+ // of that namespace and it has the capability in its effective capability
+ // set." - user_namespaces(7)
+ if c.UserNamespace == ns {
+ return CapabilitySetOf(cp)&c.EffectiveCaps != 0
+ }
+ // "3. ... A process that resides in the parent of the user namespace and
+ // whose effective user ID matches the owner of the namespace has all
+ // capabilities in the namespace."
+ if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner {
+ return true
+ }
+ // "2. If a process has a capability in a user namespace, then it has that
+ // capability in all child (and further removed descendant) namespaces as
+ // well."
+ if ns.parent == nil {
+ return false
+ }
+ ns = ns.parent
+ }
+}
+
+// HasCapability returns true if c has capability cp in its user namespace.
+func (c *Credentials) HasCapability(cp linux.Capability) bool {
+ return c.HasCapabilityIn(cp, c.UserNamespace)
+}
+
+// UseUID checks that c can use uid in its user namespace, then translates it
+// to the root user namespace.
+//
+// The checks UseUID does are common, but you should verify that it's doing
+// exactly what you want.
+func (c *Credentials) UseUID(uid UID) (KUID, error) {
+ // uid must be mapped.
+ kuid := c.UserNamespace.MapToKUID(uid)
+ if !kuid.Ok() {
+ return NoID, syserror.EINVAL
+ }
+ // If c has CAP_SETUID, then it can use any UID in its user namespace.
+ if c.HasCapability(linux.CAP_SETUID) {
+ return kuid, nil
+ }
+ // Otherwise, c must already have the UID as its real, effective, or saved
+ // set-user-ID.
+ if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID {
+ return kuid, nil
+ }
+ return NoID, syserror.EPERM
+}
+
+// UseGID checks that c can use gid in its user namespace, then translates it
+// to the root user namespace.
+func (c *Credentials) UseGID(gid GID) (KGID, error) {
+ kgid := c.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return NoID, syserror.EINVAL
+ }
+ if c.HasCapability(linux.CAP_SETGID) {
+ return kgid, nil
+ }
+ if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID {
+ return kgid, nil
+ }
+ return NoID, syserror.EPERM
+}
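
A sketch of the UseUID rules above for an unprivileged credential set (same import-path assumption): without CAP_SETUID, only the credential's own real/effective/saved UID is usable.

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
)

func main() {
	ns := auth.NewRootUserNamespace()

	// UID/GID 1000 with no explicit capabilities: permitted/effective are empty.
	creds := auth.NewUserCredentials(auth.KUID(1000), auth.KGID(1000), nil, nil, ns)

	// Allowed: 1000 is already the real/effective/saved UID.
	fmt.Println(creds.UseUID(auth.UID(1000))) // 1000 <nil>

	// Denied with EPERM: no CAP_SETUID, and 0 is not one of creds' UIDs.
	fmt.Println(creds.UseUID(auth.UID(0)))
}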
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
new file mode 100644
index 000000000..0a58ba17c
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id.go
@@ -0,0 +1,121 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "math"
+)
+
+// UID is a user ID in an unspecified user namespace.
+type UID uint32
+
+// GID is a group ID in an unspecified user namespace.
+type GID uint32
+
+// In the root user namespace, user/group IDs have a 1-to-1 relationship with
+// the users/groups they represent. In other user namespaces, this is not the
+// case; for example, two different unmapped users may both "have" the overflow
+// UID. This means that it is generally only valid to compare user and group
+// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such
+// IDs to emphasize this distinction. ("k" is for "key", as in "unique key".
+// Linux also uses the prefix "k", but I think they mean "kernel".)
+
+// KUID is a user ID in the root user namespace.
+type KUID uint32
+
+// KGID is a group ID in the root user namespace.
+type KGID uint32
+
+const (
+ // NoID is uint32(-1). -1 is consistently used as a special value, in Linux
+ // and by extension in the auth package, to mean "no ID":
+ //
+ // - ID mapping returns -1 if the ID is not mapped.
+ //
+ // - Most set*id() syscalls accept -1 to mean "do not change this ID".
+ NoID = math.MaxUint32
+
+ // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The
+ // "overflow UID" is usually [1] used when translating a user ID between
+ // namespaces fails because the ID is not mapped. (We don't implement this
+ // file, so the overflow UID is constant.)
+ //
+ // [1] "There is one notable case where unmapped user and group IDs are not
+ // converted to the corresponding overflow ID value. When viewing a uid_map
+ // or gid_map file in which there is no mapping for the second field, that
+ // field is displayed as 4294967295 (-1 as an unsigned integer);" -
+ // user_namespaces(7)
+ OverflowUID = UID(65534)
+ OverflowGID = GID(65534)
+
+ // NobodyKUID is the user ID usually reserved for the least privileged user
+ // "nobody".
+ NobodyKUID = KUID(65534)
+ NobodyKGID = KGID(65534)
+
+ // RootKUID is the user ID usually used for the most privileged user "root".
+ RootKUID = KUID(0)
+ RootKGID = KGID(0)
+ RootUID = UID(0)
+ RootGID = GID(0)
+)
+
+// Ok returns true if uid is not -1.
+func (uid UID) Ok() bool {
+ return uid != NoID
+}
+
+// Ok returns true if gid is not -1.
+func (gid GID) Ok() bool {
+ return gid != NoID
+}
+
+// Ok returns true if kuid is not -1.
+func (kuid KUID) Ok() bool {
+ return kuid != NoID
+}
+
+// Ok returns true if kgid is not -1.
+func (kgid KGID) Ok() bool {
+ return kgid != NoID
+}
+
+// OrOverflow returns uid if it is valid and the overflow UID otherwise.
+func (uid UID) OrOverflow() UID {
+ if uid.Ok() {
+ return uid
+ }
+ return OverflowUID
+}
+
+// OrOverflow returns gid if it is valid and the overflow GID otherwise.
+func (gid GID) OrOverflow() GID {
+ if gid.Ok() {
+ return gid
+ }
+ return OverflowGID
+}
+
+// In translates kuid into user namespace ns. If kuid is not mapped in ns, In
+// returns NoID.
+func (kuid KUID) In(ns *UserNamespace) UID {
+ return ns.MapFromKUID(kuid)
+}
+
+// In translates kgid into user namespace ns. If kgid is not mapped in ns, In
+// returns NoID.
+func (kgid KGID) In(ns *UserNamespace) GID {
+ return ns.MapFromKGID(kgid)
+}
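
A small sketch of the ID types (same import-path assumption): in the root namespace the KUID-to-UID translation is the identity, and an unmapped ID collapses to the overflow UID when OrOverflow is applied.

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
)

func main() {
	ns := auth.NewRootUserNamespace()

	// In the root user namespace, KUIDs and UIDs map 1:1.
	fmt.Println(auth.KUID(1000).In(ns)) // 1000

	// An unmapped ID (NoID, i.e. uint32(-1)) is presented as the overflow UID.
	fmt.Println(auth.UID(auth.NoID).OrOverflow()) // 65534
}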
diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go
new file mode 100644
index 000000000..e5d6028d6
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map.go
@@ -0,0 +1,285 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns.
+func (ns *UserNamespace) MapFromKUID(kuid KUID) UID {
+ if ns.parent == nil {
+ return UID(kuid)
+ }
+ return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid))))
+}
+
+// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns.
+func (ns *UserNamespace) MapFromKGID(kgid KGID) GID {
+ if ns.parent == nil {
+ return GID(kgid)
+ }
+ return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid))))
+}
+
+// MapToKUID translates uid, a UID in ns, to a UID in the root namespace.
+func (ns *UserNamespace) MapToKUID(uid UID) KUID {
+ if ns.parent == nil {
+ return KUID(uid)
+ }
+ return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid))))
+}
+
+// MapToKGID translates gid, a GID in ns, to a GID in the root namespace.
+func (ns *UserNamespace) MapToKGID(gid GID) KGID {
+ if ns.parent == nil {
+ return KGID(gid)
+ }
+ return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid))))
+}
+
+func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 {
+ if id == NoID {
+ return NoID
+ }
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ if it := m.FindSegment(id); it.Ok() {
+ return it.Value() + (id - it.Start())
+ }
+ return NoID
+}
+
+// allIDsMapped returns true if all IDs in the range [start, end) are mapped in
+// m.
+//
+// Preconditions: end >= start.
+func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool {
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ return m.SpanRange(idMapRange{start, end}) == end-start
+}
+
+// An IDMapEntry represents a mapping from a range of contiguous IDs in a user
+// namespace to an equally-sized range of contiguous IDs in the namespace's
+// parent.
+//
+// +stateify savable
+type IDMapEntry struct {
+ // FirstID is the first ID in the range in the namespace.
+ FirstID uint32
+
+ // FirstParentID is the first ID in the range in the parent namespace.
+ FirstParentID uint32
+
+ // Length is the number of IDs in the range.
+ Length uint32
+}
+
+// SetUIDMap instructs ns to translate UIDs as specified by entries.
+//
+// Note: SetUIDMap does not place an upper bound on the number of entries, but
+// Linux does. This restriction is implemented in SetUIDMap's caller, the
+// implementation of /proc/[pid]/uid_map.
+func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error {
+ c := CredentialsFromContext(ctx)
+
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ // "After the creation of a new user namespace, the uid_map file of *one*
+ // of the processes in the namespace may be written to *once* to define the
+ // mapping of user IDs in the new user namespace. An attempt to write more
+ // than once to a uid_map file in a user namespace fails with the error
+ // EPERM. Similar rules apply for gid_map files." - user_namespaces(7)
+ if !ns.uidMapFromParent.IsEmpty() {
+ return syserror.EPERM
+ }
+ // "At least one line must be written to the file."
+ if len(entries) == 0 {
+ return syserror.EINVAL
+ }
+ // """
+ // In order for a process to write to the /proc/[pid]/uid_map
+ // (/proc/[pid]/gid_map) file, all of the following requirements must be
+ // met:
+ //
+ // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability
+ // in the user namespace of the process pid.
+ // """
+ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) {
+ return syserror.EPERM
+ }
+ // "2. The writing process must either be in the user namespace of the process
+ // pid or be in the parent user namespace of the process pid."
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent {
+ return syserror.EPERM
+ }
+ // """
+ // 3. (see trySetUIDMap)
+ //
+ // 4. One of the following two cases applies:
+ //
+ // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability
+ // in the parent user namespace.
+ // """
+ if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) {
+ // """
+ // * Or otherwise all of the following restrictions apply:
+ //
+ // + The data written to uid_map (gid_map) must consist of a single line
+ // that maps the writing process' effective user ID (group ID) in the
+ // parent user namespace to a user ID (group ID) in the user namespace.
+ // """
+ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 {
+ return syserror.EPERM
+ }
+ // """
+ // + The writing process must have the same effective user ID as the
+ // process that created the user namespace.
+ // """
+ if c.EffectiveKUID != ns.owner {
+ return syserror.EPERM
+ }
+ }
+ // trySetUIDMap leaves data in maps if it fails.
+ if err := ns.trySetUIDMap(entries); err != nil {
+ ns.uidMapFromParent.RemoveAll()
+ ns.uidMapToParent.RemoveAll()
+ return err
+ }
+ return nil
+}
+
+func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error {
+ for _, e := range entries {
+ // Determine upper bounds and check for overflow. This implicitly
+ // checks for NoID.
+ lastID := e.FirstID + e.Length
+ if lastID <= e.FirstID {
+ return syserror.EINVAL
+ }
+ lastParentID := e.FirstParentID + e.Length
+ if lastParentID <= e.FirstParentID {
+ return syserror.EINVAL
+ }
+ // "3. The mapped user IDs (group IDs) must in turn have a mapping in
+ // the parent user namespace."
+ // Only the root namespace has a nil parent, and root is assigned
+ // mappings when it's created, so SetUIDMap would have returned EPERM
+ // without reaching this point if ns is root.
+ if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) {
+ return syserror.EPERM
+ }
+ // If either of these Adds fail, we have an overlapping range.
+ if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) {
+ return syserror.EINVAL
+ }
+ if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) {
+ return syserror.EINVAL
+ }
+ }
+ return nil
+}
+
+// SetGIDMap instructs ns to translate GIDs as specified by entries.
+func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error {
+ c := CredentialsFromContext(ctx)
+
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ if !ns.gidMapFromParent.IsEmpty() {
+ return syserror.EPERM
+ }
+ if len(entries) == 0 {
+ return syserror.EINVAL
+ }
+ if !c.HasCapabilityIn(linux.CAP_SETGID, ns) {
+ return syserror.EPERM
+ }
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent {
+ return syserror.EPERM
+ }
+ if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) {
+ if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 {
+ return syserror.EPERM
+ }
+ // It's correct for this to still be UID.
+ if c.EffectiveKUID != ns.owner {
+ return syserror.EPERM
+ }
+ // "In the case of gid_map, use of the setgroups(2) system call must
+ // first be denied by writing "deny" to the /proc/[pid]/setgroups file
+ // (see below) before writing to gid_map." (This file isn't implemented
+ // in the version of Linux we're emulating; see comment in
+ // UserNamespace.)
+ }
+ if err := ns.trySetGIDMap(entries); err != nil {
+ ns.gidMapFromParent.RemoveAll()
+ ns.gidMapToParent.RemoveAll()
+ return err
+ }
+ return nil
+}
+
+func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error {
+ for _, e := range entries {
+ lastID := e.FirstID + e.Length
+ if lastID <= e.FirstID {
+ return syserror.EINVAL
+ }
+ lastParentID := e.FirstParentID + e.Length
+ if lastParentID <= e.FirstParentID {
+ return syserror.EINVAL
+ }
+ if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) {
+ return syserror.EPERM
+ }
+ if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) {
+ return syserror.EINVAL
+ }
+ if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) {
+ return syserror.EINVAL
+ }
+ }
+ return nil
+}
+
+// UIDMap returns the user ID mappings configured for ns. If no mappings
+// have been configured, UIDMap returns nil.
+func (ns *UserNamespace) UIDMap() []IDMapEntry {
+ return ns.getIDMap(&ns.uidMapToParent)
+}
+
+// GIDMap returns the group ID mappings configured for ns. If no mappings
+// have been configured, GIDMap returns nil.
+func (ns *UserNamespace) GIDMap() []IDMapEntry {
+ return ns.getIDMap(&ns.gidMapToParent)
+}
+
+func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry {
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ var entries []IDMapEntry
+ for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() {
+ entries = append(entries, IDMapEntry{
+ FirstID: it.Start(),
+ FirstParentID: it.Value(),
+ Length: it.Range().Length(),
+ })
+ }
+ return entries
+}
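
The translation that SetUIDMap/SetGIDMap establish is plain offset arithmetic within each entry, as in mapID above (value + (id - start)). A self-contained illustration with a typical container-style entry, deliberately not using the gVisor types:

package main

import "fmt"

// translate applies one IDMapEntry-style mapping: IDs in
// [firstID, firstID+length) map to [firstParentID, firstParentID+length).
func translate(firstID, firstParentID, length, id uint32) (uint32, bool) {
	if id >= firstID && id < firstID+length {
		return firstParentID + (id - firstID), true
	}
	return 0, false // unmapped; Linux surfaces this as the overflow ID or -1
}

func main() {
	// Namespace UIDs 0..65535 mapped to parent UIDs 100000..165535.
	fmt.Println(translate(0, 100000, 65536, 0))     // 100000 true
	fmt.Println(translate(0, 100000, 65536, 1000))  // 101000 true
	fmt.Println(translate(0, 100000, 65536, 70000)) // 0 false
}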
diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go
new file mode 100644
index 000000000..432dbfb6d
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_functions.go
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+// idMapFunctions "implements" generic interface segment.Functions for
+// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one
+// user namespace to non-overlapping ranges of contiguous IDs in another user
+// namespace. Each such ID mapping is implemented as a range-to-value mapping
+// in the set such that [range.Start(), range.End()) => [value, value +
+// range.Length()).
+type idMapFunctions struct{}
+
+func (idMapFunctions) MinKey() uint32 {
+ return 0
+}
+
+func (idMapFunctions) MaxKey() uint32 {
+ return NoID
+}
+
+func (idMapFunctions) ClearValue(*uint32) {}
+
+func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) {
+ // Mapped ranges have to be contiguous.
+ if val1+r1.Length() != val2 {
+ return 0, false
+ }
+ return val1, true
+}
+
+func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) {
+ return val, val + (split - r.Start)
+}
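
The Merge rule above allows two adjacent key ranges to collapse into one segment only when the mapping is continuous across the boundary, i.e. val1 + r1.Length() == val2. A standalone check of that condition:

package main

import "fmt"

// canMerge reports whether two adjacent mapped ranges [start1, end1) => val1
// and [end1, end2) => val2 form one continuous mapping, as
// idMapFunctions.Merge requires.
func canMerge(start1, end1, val1, val2 uint32) bool {
	return val1+(end1-start1) == val2
}

func main() {
	fmt.Println(canMerge(0, 10, 100, 110)) // true: values 100..109 then 110..119
	fmt.Println(canMerge(0, 10, 100, 200)) // false: the value range jumps
}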
diff --git a/pkg/sentry/kernel/auth/id_map_range.go b/pkg/sentry/kernel/auth/id_map_range.go
new file mode 100755
index 000000000..833fa3518
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_range.go
@@ -0,0 +1,62 @@
+package auth
+
+// An idMapRange represents a contiguous range of uint32 IDs.
+//
+// +stateify savable
+type idMapRange struct {
+ // Start is the inclusive start of the range.
+ Start uint32
+
+ // End is the exclusive end of the range.
+ End uint32
+}
+
+// WellFormed returns true if r.Start <= r.End. All other methods on a Range
+// require that the Range is well-formed.
+func (r idMapRange) WellFormed() bool {
+ return r.Start <= r.End
+}
+
+// Length returns the length of the range.
+func (r idMapRange) Length() uint32 {
+ return r.End - r.Start
+}
+
+// Contains returns true if r contains x.
+func (r idMapRange) Contains(x uint32) bool {
+ return r.Start <= x && x < r.End
+}
+
+// Overlaps returns true if r and r2 overlap.
+func (r idMapRange) Overlaps(r2 idMapRange) bool {
+ return r.Start < r2.End && r2.Start < r.End
+}
+
+// IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is
+// contained within r.
+func (r idMapRange) IsSupersetOf(r2 idMapRange) bool {
+ return r.Start <= r2.Start && r.End >= r2.End
+}
+
+// Intersect returns a range consisting of the intersection between r and r2.
+// If r and r2 do not overlap, Intersect returns a range with unspecified
+// bounds, but for which Length() == 0.
+func (r idMapRange) Intersect(r2 idMapRange) idMapRange {
+ if r.Start < r2.Start {
+ r.Start = r2.Start
+ }
+ if r.End > r2.End {
+ r.End = r2.End
+ }
+ if r.End < r.Start {
+ r.End = r.Start
+ }
+ return r
+}
+
+// CanSplitAt returns true if it is legal to split a segment spanning the range
+// r at x; that is, splitting at x would produce two ranges, both of which have
+// non-zero length.
+func (r idMapRange) CanSplitAt(x uint32) bool {
+ return r.Contains(x) && r.Start < x
+}
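
idMapRange is a half-open [Start, End) interval. A standalone sketch of the two properties that matter most for the set operations below (End is exclusive, and merely touching ranges do not overlap), again using stand-in types rather than the unexported gVisor ones:

package main

import "fmt"

// rng mirrors idMapRange's half-open [Start, End) semantics.
type rng struct{ Start, End uint32 }

func (r rng) Contains(x uint32) bool { return r.Start <= x && x < r.End }

func (r rng) Overlaps(o rng) bool { return r.Start < o.End && o.Start < r.End }

func main() {
	r := rng{Start: 10, End: 20}
	fmt.Println(r.Contains(10), r.Contains(20))      // true false: End is exclusive
	fmt.Println(r.Overlaps(rng{Start: 20, End: 30})) // false: touching is not overlapping
}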
diff --git a/pkg/sentry/kernel/auth/id_map_set.go b/pkg/sentry/kernel/auth/id_map_set.go
new file mode 100755
index 000000000..f72c839c7
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_set.go
@@ -0,0 +1,1270 @@
+package auth
+
+import (
+ "bytes"
+ "fmt"
+)
+
+const (
+ // minDegree is the minimum degree of an internal node in a Set B-tree.
+ //
+ // - Any non-root node has at least minDegree-1 segments.
+ //
+ // - Any non-root internal (non-leaf) node has at least minDegree children.
+ //
+ // - The root node may have fewer than minDegree-1 segments, but it may
+ // only have 0 segments if the tree is empty.
+ //
+ // Our implementation requires minDegree >= 3. Higher values of minDegree
+ // usually improve performance, but increase memory usage for small sets.
+ idMapminDegree = 3
+
+ idMapmaxDegree = 2 * idMapminDegree
+)
+
+// A Set is a mapping of segments with non-overlapping Range keys. The zero
+// value for a Set is an empty set. Set values are not safely movable nor
+// copyable. Set is thread-compatible.
+//
+// +stateify savable
+type idMapSet struct {
+ root idMapnode `state:".(*idMapSegmentDataSlices)"`
+}
+
+// IsEmpty returns true if the set contains no segments.
+func (s *idMapSet) IsEmpty() bool {
+ return s.root.nrSegments == 0
+}
+
+// IsEmptyRange returns true iff no segments in the set overlap the given
+// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be
+// more efficient.
+func (s *idMapSet) IsEmptyRange(r idMapRange) bool {
+ switch {
+ case r.Length() < 0:
+ panic(fmt.Sprintf("invalid range %v", r))
+ case r.Length() == 0:
+ return true
+ }
+ _, gap := s.Find(r.Start)
+ if !gap.Ok() {
+ return false
+ }
+ return r.End <= gap.End()
+}
+
+// Span returns the total size of all segments in the set.
+func (s *idMapSet) Span() uint32 {
+ var sz uint32
+ for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ sz += seg.Range().Length()
+ }
+ return sz
+}
+
+// SpanRange returns the total size of the intersection of segments in the set
+// with the given range.
+func (s *idMapSet) SpanRange(r idMapRange) uint32 {
+ switch {
+ case r.Length() < 0:
+ panic(fmt.Sprintf("invalid range %v", r))
+ case r.Length() == 0:
+ return 0
+ }
+ var sz uint32
+ for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
+ sz += seg.Range().Intersect(r).Length()
+ }
+ return sz
+}
+
+// FirstSegment returns the first segment in the set. If the set is empty,
+// FirstSegment returns a terminal iterator.
+func (s *idMapSet) FirstSegment() idMapIterator {
+ if s.root.nrSegments == 0 {
+ return idMapIterator{}
+ }
+ return s.root.firstSegment()
+}
+
+// LastSegment returns the last segment in the set. If the set is empty,
+// LastSegment returns a terminal iterator.
+func (s *idMapSet) LastSegment() idMapIterator {
+ if s.root.nrSegments == 0 {
+ return idMapIterator{}
+ }
+ return s.root.lastSegment()
+}
+
+// FirstGap returns the first gap in the set.
+func (s *idMapSet) FirstGap() idMapGapIterator {
+ n := &s.root
+ for n.hasChildren {
+ n = n.children[0]
+ }
+ return idMapGapIterator{n, 0}
+}
+
+// LastGap returns the last gap in the set.
+func (s *idMapSet) LastGap() idMapGapIterator {
+ n := &s.root
+ for n.hasChildren {
+ n = n.children[n.nrSegments]
+ }
+ return idMapGapIterator{n, n.nrSegments}
+}
+
+// Find returns the segment or gap whose range contains the given key. If a
+// segment is found, the returned Iterator is non-terminal and the
+// returned GapIterator is terminal. Otherwise, the returned Iterator is
+// terminal and the returned GapIterator is non-terminal.
+func (s *idMapSet) Find(key uint32) (idMapIterator, idMapGapIterator) {
+ n := &s.root
+ for {
+
+ lower := 0
+ upper := n.nrSegments
+ for lower < upper {
+ i := lower + (upper-lower)/2
+ if r := n.keys[i]; key < r.End {
+ if key >= r.Start {
+ return idMapIterator{n, i}, idMapGapIterator{}
+ }
+ upper = i
+ } else {
+ lower = i + 1
+ }
+ }
+ i := lower
+ if !n.hasChildren {
+ return idMapIterator{}, idMapGapIterator{n, i}
+ }
+ n = n.children[i]
+ }
+}
+
+// FindSegment returns the segment whose range contains the given key. If no
+// such segment exists, FindSegment returns a terminal iterator.
+func (s *idMapSet) FindSegment(key uint32) idMapIterator {
+ seg, _ := s.Find(key)
+ return seg
+}
+
+// LowerBoundSegment returns the segment with the lowest range that contains a
+// key greater than or equal to min. If no such segment exists,
+// LowerBoundSegment returns a terminal iterator.
+func (s *idMapSet) LowerBoundSegment(min uint32) idMapIterator {
+ seg, gap := s.Find(min)
+ if seg.Ok() {
+ return seg
+ }
+ return gap.NextSegment()
+}
+
+// UpperBoundSegment returns the segment with the highest range that contains a
+// key less than or equal to max. If no such segment exists, UpperBoundSegment
+// returns a terminal iterator.
+func (s *idMapSet) UpperBoundSegment(max uint32) idMapIterator {
+ seg, gap := s.Find(max)
+ if seg.Ok() {
+ return seg
+ }
+ return gap.PrevSegment()
+}
+
+// FindGap returns the gap containing the given key. If no such gap exists
+// (i.e. the set contains a segment containing that key), FindGap returns a
+// terminal iterator.
+func (s *idMapSet) FindGap(key uint32) idMapGapIterator {
+ _, gap := s.Find(key)
+ return gap
+}
+
+// LowerBoundGap returns the gap with the lowest range that is greater than or
+// equal to min.
+func (s *idMapSet) LowerBoundGap(min uint32) idMapGapIterator {
+ seg, gap := s.Find(min)
+ if gap.Ok() {
+ return gap
+ }
+ return seg.NextGap()
+}
+
+// UpperBoundGap returns the gap with the highest range that is less than or
+// equal to max.
+func (s *idMapSet) UpperBoundGap(max uint32) idMapGapIterator {
+ seg, gap := s.Find(max)
+ if gap.Ok() {
+ return gap
+ }
+ return seg.PrevGap()
+}
+
+// Add inserts the given segment into the set and returns true. If the new
+// segment can be merged with adjacent segments, Add will do so. If the new
+// segment would overlap an existing segment, Add returns false. If Add
+// succeeds, all existing iterators are invalidated.
+func (s *idMapSet) Add(r idMapRange, val uint32) bool {
+ if r.Length() <= 0 {
+ panic(fmt.Sprintf("invalid segment range %v", r))
+ }
+ gap := s.FindGap(r.Start)
+ if !gap.Ok() {
+ return false
+ }
+ if r.End > gap.End() {
+ return false
+ }
+ s.Insert(gap, r, val)
+ return true
+}
+
+// AddWithoutMerging inserts the given segment into the set and returns true.
+// If it would overlap an existing segment, AddWithoutMerging does nothing and
+// returns false. If AddWithoutMerging succeeds, all existing iterators are
+// invalidated.
+func (s *idMapSet) AddWithoutMerging(r idMapRange, val uint32) bool {
+ if r.Length() <= 0 {
+ panic(fmt.Sprintf("invalid segment range %v", r))
+ }
+ gap := s.FindGap(r.Start)
+ if !gap.Ok() {
+ return false
+ }
+ if r.End > gap.End() {
+ return false
+ }
+ s.InsertWithoutMergingUnchecked(gap, r, val)
+ return true
+}
+
+// Insert inserts the given segment into the given gap. If the new segment can
+// be merged with adjacent segments, Insert will do so. Insert returns an
+// iterator to the segment containing the inserted value (which may have been
+// merged with other values). All existing iterators (including gap, but not
+// including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid, Insert panics.
+//
+// Insert is semantically equivalent to a InsertWithoutMerging followed by a
+// Merge, but may be more efficient. Note that there is no unchecked variant of
+// Insert since Insert must retrieve and inspect gap's predecessor and
+// successor segments regardless.
+func (s *idMapSet) Insert(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator {
+ if r.Length() <= 0 {
+ panic(fmt.Sprintf("invalid segment range %v", r))
+ }
+ prev, next := gap.PrevSegment(), gap.NextSegment()
+ if prev.Ok() && prev.End() > r.Start {
+ panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
+ }
+ if next.Ok() && next.Start() < r.End {
+ panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
+ }
+ if prev.Ok() && prev.End() == r.Start {
+ if mval, ok := (idMapFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+ prev.SetEndUnchecked(r.End)
+ prev.SetValue(mval)
+ if next.Ok() && next.Start() == r.End {
+ val = mval
+ if mval, ok := (idMapFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
+ prev.SetEndUnchecked(next.End())
+ prev.SetValue(mval)
+ return s.Remove(next).PrevSegment()
+ }
+ }
+ return prev
+ }
+ }
+ if next.Ok() && next.Start() == r.End {
+ if mval, ok := (idMapFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
+ next.SetStartUnchecked(r.Start)
+ next.SetValue(mval)
+ return next
+ }
+ }
+ return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMerging inserts the given segment into the given gap and
+// returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid,
+// InsertWithoutMerging panics.
+func (s *idMapSet) InsertWithoutMerging(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator {
+ if r.Length() <= 0 {
+ panic(fmt.Sprintf("invalid segment range %v", r))
+ }
+ if gr := gap.Range(); !gr.IsSupersetOf(r) {
+ panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
+ }
+ return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMergingUnchecked inserts the given segment into the given gap
+// and returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+func (s *idMapSet) InsertWithoutMergingUnchecked(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator {
+ gap = gap.node.rebalanceBeforeInsert(gap)
+ copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
+ copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
+ gap.node.keys[gap.index] = r
+ gap.node.values[gap.index] = val
+ gap.node.nrSegments++
+ return idMapIterator{gap.node, gap.index}
+}
+
+// Remove removes the given segment and returns an iterator to the vacated gap.
+// All existing iterators (including seg, but not including the returned
+// iterator) are invalidated.
+func (s *idMapSet) Remove(seg idMapIterator) idMapGapIterator {
+
+ if seg.node.hasChildren {
+
+ victim := seg.PrevSegment()
+
+ seg.SetRangeUnchecked(victim.Range())
+ seg.SetValue(victim.Value())
+ return s.Remove(victim).NextGap()
+ }
+ copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
+ copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
+ idMapFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
+ seg.node.nrSegments--
+ return seg.node.rebalanceAfterRemove(idMapGapIterator{seg.node, seg.index})
+}
+
+// RemoveAll removes all segments from the set. All existing iterators are
+// invalidated.
+func (s *idMapSet) RemoveAll() {
+ s.root = idMapnode{}
+}
+
+// RemoveRange removes all segments in the given range. An iterator to the
+// newly formed gap is returned, and all existing iterators are invalidated.
+func (s *idMapSet) RemoveRange(r idMapRange) idMapGapIterator {
+ seg, gap := s.Find(r.Start)
+ if seg.Ok() {
+ seg = s.Isolate(seg, r)
+ gap = s.Remove(seg)
+ }
+ for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() {
+ seg = s.Isolate(seg, r)
+ gap = s.Remove(seg)
+ }
+ return gap
+}
+
+// Merge attempts to merge two neighboring segments. If successful, Merge
+// returns an iterator to the merged segment, and all existing iterators are
+// invalidated. Otherwise, Merge returns a terminal iterator.
+//
+// If first is not the predecessor of second, Merge panics.
+func (s *idMapSet) Merge(first, second idMapIterator) idMapIterator {
+ if first.NextSegment() != second {
+ panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
+ }
+ return s.MergeUnchecked(first, second)
+}
+
+// MergeUnchecked attempts to merge two neighboring segments. If successful,
+// MergeUnchecked returns an iterator to the merged segment, and all existing
+// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal
+// iterator.
+//
+// Precondition: first is the predecessor of second: first.NextSegment() ==
+// second, first == second.PrevSegment().
+func (s *idMapSet) MergeUnchecked(first, second idMapIterator) idMapIterator {
+ if first.End() == second.Start() {
+ if mval, ok := (idMapFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok {
+
+ first.SetEndUnchecked(second.End())
+ first.SetValue(mval)
+ return s.Remove(second).PrevSegment()
+ }
+ }
+ return idMapIterator{}
+}
+
+// MergeAll attempts to merge all adjacent segments in the set. All existing
+// iterators are invalidated.
+func (s *idMapSet) MergeAll() {
+ seg := s.FirstSegment()
+ if !seg.Ok() {
+ return
+ }
+ next := seg.NextSegment()
+ for next.Ok() {
+ if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
+ seg, next = mseg, mseg.NextSegment()
+ } else {
+ seg, next = next, next.NextSegment()
+ }
+ }
+}
+
+// MergeRange attempts to merge all adjacent segments that contain a key in the
+// specified range. All existing iterators are invalidated.
+func (s *idMapSet) MergeRange(r idMapRange) {
+ seg := s.LowerBoundSegment(r.Start)
+ if !seg.Ok() {
+ return
+ }
+ next := seg.NextSegment()
+ for next.Ok() && next.Range().Start < r.End {
+ if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
+ seg, next = mseg, mseg.NextSegment()
+ } else {
+ seg, next = next, next.NextSegment()
+ }
+ }
+}
+
+// MergeAdjacent attempts to merge the segment containing r.Start with its
+// predecessor, and the segment containing r.End-1 with its successor.
+func (s *idMapSet) MergeAdjacent(r idMapRange) {
+ first := s.FindSegment(r.Start)
+ if first.Ok() {
+ if prev := first.PrevSegment(); prev.Ok() {
+ s.Merge(prev, first)
+ }
+ }
+ last := s.FindSegment(r.End - 1)
+ if last.Ok() {
+ if next := last.NextSegment(); next.Ok() {
+ s.Merge(last, next)
+ }
+ }
+}
+
+// Split splits the given segment at the given key and returns iterators to the
+// two resulting segments. All existing iterators (including seg, but not
+// including the returned iterators) are invalidated.
+//
+// If the segment cannot be split at split (because split is at the start or
+// end of the segment's range, so splitting would produce a segment with zero
+// length, or because split falls outside the segment's range altogether),
+// Split panics.
+func (s *idMapSet) Split(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) {
+ if !seg.Range().CanSplitAt(split) {
+ panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split))
+ }
+ return s.SplitUnchecked(seg, split)
+}
+
+// SplitUnchecked splits the given segment at the given key and returns
+// iterators to the two resulting segments. All existing iterators (including
+// seg, but not including the returned iterators) are invalidated.
+//
+// Preconditions: seg.Start() < key < seg.End().
+func (s *idMapSet) SplitUnchecked(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) {
+ val1, val2 := (idMapFunctions{}).Split(seg.Range(), seg.Value(), split)
+ end2 := seg.End()
+ seg.SetEndUnchecked(split)
+ seg.SetValue(val1)
+ seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), idMapRange{split, end2}, val2)
+
+ return seg2.PrevSegment(), seg2
+}
+
+// SplitAt splits the segment straddling split, if one exists. SplitAt returns
+// true if a segment was split and false otherwise. If SplitAt splits a
+// segment, all existing iterators are invalidated.
+func (s *idMapSet) SplitAt(split uint32) bool {
+ if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) {
+ s.SplitUnchecked(seg, split)
+ return true
+ }
+ return false
+}
+
+// Isolate ensures that the given segment's range does not escape r by
+// splitting at r.Start and r.End if necessary, and returns an updated iterator
+// to the bounded segment. All existing iterators (including seg, but not
+// including the returned iterators) are invalidated.
+func (s *idMapSet) Isolate(seg idMapIterator, r idMapRange) idMapIterator {
+ if seg.Range().CanSplitAt(r.Start) {
+ _, seg = s.SplitUnchecked(seg, r.Start)
+ }
+ if seg.Range().CanSplitAt(r.End) {
+ seg, _ = s.SplitUnchecked(seg, r.End)
+ }
+ return seg
+}
+
+// ApplyContiguous applies a function to a contiguous range of segments,
+// splitting if necessary. The function is applied until the first gap is
+// encountered, at which point the gap is returned. If the function is applied
+// across the entire range, a terminal gap is returned. All existing iterators
+// are invalidated.
+//
+// N.B. The Iterator must not be invalidated by the function.
+func (s *idMapSet) ApplyContiguous(r idMapRange, fn func(seg idMapIterator)) idMapGapIterator {
+ seg, gap := s.Find(r.Start)
+ if !seg.Ok() {
+ return gap
+ }
+ for {
+ seg = s.Isolate(seg, r)
+ fn(seg)
+ if seg.End() >= r.End {
+ return idMapGapIterator{}
+ }
+ gap = seg.NextGap()
+ if !gap.IsEmpty() {
+ return gap
+ }
+ seg = gap.NextSegment()
+ if !seg.Ok() {
+
+ return idMapGapIterator{}
+ }
+ }
+}
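
A minimal usage sketch (not part of the diff) of how ApplyContiguous composes with Isolate: the callback sees each overlapping segment clipped to r, and a non-terminal gap is returned as soon as coverage breaks. Here s and r are assumed to be an existing idMapSet and idMapRange inside this package.

    // Bump the value of every segment overlapping r, stopping at the first hole.
    if gap := s.ApplyContiguous(r, func(seg idMapIterator) {
        seg.SetValue(seg.Value() + 1)
    }); gap.Ok() {
        // r was not fully covered by segments; gap.Range() is the first hole.
    }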
+
+// +stateify savable
+type idMapnode struct {
+ // An internal binary tree node looks like:
+ //
+ // K
+ // / \
+ // Cl Cr
+ //
+ // where all keys in the subtree rooted by Cl (the left subtree) are less
+ // than K (the key of the parent node), and all keys in the subtree rooted
+ // by Cr (the right subtree) are greater than K.
+ //
+ // An internal B-tree node's indexes work out to look like:
+ //
+ // K0 K1 K2 ... Kn-1
+ // / \/ \/ \ ... / \
+ // C0 C1 C2 C3 ... Cn-1 Cn
+ //
+ // where n is nrSegments.
+ nrSegments int
+
+ // parent is a pointer to this node's parent. If this node is root, parent
+ // is nil.
+ parent *idMapnode
+
+ // parentIndex is the index of this node in parent.children.
+ parentIndex int
+
+ // Flag for internal nodes that is technically redundant with "children[0]
+ // != nil", but is stored in the first cache line. "hasChildren" rather
+ // than "isLeaf" because false must be the correct value for an empty root.
+ hasChildren bool
+
+ // Nodes store keys and values in separate arrays to maximize locality in
+ // the common case (scanning keys for lookup).
+ keys [idMapmaxDegree - 1]idMapRange
+ values [idMapmaxDegree - 1]uint32
+ children [idMapmaxDegree]*idMapnode
+}
+
+// firstSegment returns the first segment in the subtree rooted by n.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *idMapnode) firstSegment() idMapIterator {
+ for n.hasChildren {
+ n = n.children[0]
+ }
+ return idMapIterator{n, 0}
+}
+
+// lastSegment returns the last segment in the subtree rooted by n.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *idMapnode) lastSegment() idMapIterator {
+ for n.hasChildren {
+ n = n.children[n.nrSegments]
+ }
+ return idMapIterator{n, n.nrSegments - 1}
+}
+
+func (n *idMapnode) prevSibling() *idMapnode {
+ if n.parent == nil || n.parentIndex == 0 {
+ return nil
+ }
+ return n.parent.children[n.parentIndex-1]
+}
+
+func (n *idMapnode) nextSibling() *idMapnode {
+ if n.parent == nil || n.parentIndex == n.parent.nrSegments {
+ return nil
+ }
+ return n.parent.children[n.parentIndex+1]
+}
+
+// rebalanceBeforeInsert splits n and its ancestors if they are full, as
+// required for insertion, and returns an updated iterator to the position
+// represented by gap.
+func (n *idMapnode) rebalanceBeforeInsert(gap idMapGapIterator) idMapGapIterator {
+ if n.parent != nil {
+ gap = n.parent.rebalanceBeforeInsert(gap)
+ }
+ if n.nrSegments < idMapmaxDegree-1 {
+ return gap
+ }
+ if n.parent == nil {
+
+ left := &idMapnode{
+ nrSegments: idMapminDegree - 1,
+ parent: n,
+ parentIndex: 0,
+ hasChildren: n.hasChildren,
+ }
+ right := &idMapnode{
+ nrSegments: idMapminDegree - 1,
+ parent: n,
+ parentIndex: 1,
+ hasChildren: n.hasChildren,
+ }
+ copy(left.keys[:idMapminDegree-1], n.keys[:idMapminDegree-1])
+ copy(left.values[:idMapminDegree-1], n.values[:idMapminDegree-1])
+ copy(right.keys[:idMapminDegree-1], n.keys[idMapminDegree:])
+ copy(right.values[:idMapminDegree-1], n.values[idMapminDegree:])
+ n.keys[0], n.values[0] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1]
+ idMapzeroValueSlice(n.values[1:])
+ if n.hasChildren {
+ copy(left.children[:idMapminDegree], n.children[:idMapminDegree])
+ copy(right.children[:idMapminDegree], n.children[idMapminDegree:])
+ idMapzeroNodeSlice(n.children[2:])
+ for i := 0; i < idMapminDegree; i++ {
+ left.children[i].parent = left
+ left.children[i].parentIndex = i
+ right.children[i].parent = right
+ right.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments = 1
+ n.hasChildren = true
+ n.children[0] = left
+ n.children[1] = right
+ if gap.node != n {
+ return gap
+ }
+ if gap.index < idMapminDegree {
+ return idMapGapIterator{left, gap.index}
+ }
+ return idMapGapIterator{right, gap.index - idMapminDegree}
+ }
+
+ copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
+ copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
+ n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1]
+ copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
+ for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
+ n.parent.children[i].parentIndex = i
+ }
+ sibling := &idMapnode{
+ nrSegments: idMapminDegree - 1,
+ parent: n.parent,
+ parentIndex: n.parentIndex + 1,
+ hasChildren: n.hasChildren,
+ }
+ n.parent.children[n.parentIndex+1] = sibling
+ n.parent.nrSegments++
+ copy(sibling.keys[:idMapminDegree-1], n.keys[idMapminDegree:])
+ copy(sibling.values[:idMapminDegree-1], n.values[idMapminDegree:])
+ idMapzeroValueSlice(n.values[idMapminDegree-1:])
+ if n.hasChildren {
+ copy(sibling.children[:idMapminDegree], n.children[idMapminDegree:])
+ idMapzeroNodeSlice(n.children[idMapminDegree:])
+ for i := 0; i < idMapminDegree; i++ {
+ sibling.children[i].parent = sibling
+ sibling.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments = idMapminDegree - 1
+
+ if gap.node != n {
+ return gap
+ }
+ if gap.index < idMapminDegree {
+ return gap
+ }
+ return idMapGapIterator{sibling, gap.index - idMapminDegree}
+}
+
+// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
+// (contain fewer segments than required by B-tree invariants), as required for
+// removal, and returns an updated iterator to the position represented by gap.
+//
+// Precondition: n is the only node in the tree that may currently violate a
+// B-tree invariant.
+func (n *idMapnode) rebalanceAfterRemove(gap idMapGapIterator) idMapGapIterator {
+ for {
+ if n.nrSegments >= idMapminDegree-1 {
+ return gap
+ }
+ if n.parent == nil {
+
+ return gap
+ }
+
+ if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree {
+ copy(n.keys[1:], n.keys[:n.nrSegments])
+ copy(n.values[1:], n.values[:n.nrSegments])
+ n.keys[0] = n.parent.keys[n.parentIndex-1]
+ n.values[0] = n.parent.values[n.parentIndex-1]
+ n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
+ n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
+ idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+ if n.hasChildren {
+ copy(n.children[1:], n.children[:n.nrSegments+1])
+ n.children[0] = sibling.children[sibling.nrSegments]
+ sibling.children[sibling.nrSegments] = nil
+ n.children[0].parent = n
+ n.children[0].parentIndex = 0
+ for i := 1; i < n.nrSegments+2; i++ {
+ n.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments++
+ sibling.nrSegments--
+ if gap.node == sibling && gap.index == sibling.nrSegments {
+ return idMapGapIterator{n, 0}
+ }
+ if gap.node == n {
+ return idMapGapIterator{n, gap.index + 1}
+ }
+ return gap
+ }
+ if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree {
+ n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
+ n.values[n.nrSegments] = n.parent.values[n.parentIndex]
+ n.parent.keys[n.parentIndex] = sibling.keys[0]
+ n.parent.values[n.parentIndex] = sibling.values[0]
+ copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
+ copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
+ idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+ if n.hasChildren {
+ n.children[n.nrSegments+1] = sibling.children[0]
+ copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
+ sibling.children[sibling.nrSegments] = nil
+ n.children[n.nrSegments+1].parent = n
+ n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
+ for i := 0; i < sibling.nrSegments; i++ {
+ sibling.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments++
+ sibling.nrSegments--
+ if gap.node == sibling {
+ if gap.index == 0 {
+ return idMapGapIterator{n, n.nrSegments}
+ }
+ return idMapGapIterator{sibling, gap.index - 1}
+ }
+ return gap
+ }
+
+ p := n.parent
+ if p.nrSegments == 1 {
+
+ left, right := p.children[0], p.children[1]
+ p.nrSegments = left.nrSegments + right.nrSegments + 1
+ p.hasChildren = left.hasChildren
+ p.keys[left.nrSegments] = p.keys[0]
+ p.values[left.nrSegments] = p.values[0]
+ copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
+ copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
+ copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+ copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
+ if left.hasChildren {
+ copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
+ copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+ for i := 0; i < p.nrSegments+1; i++ {
+ p.children[i].parent = p
+ p.children[i].parentIndex = i
+ }
+ } else {
+ p.children[0] = nil
+ p.children[1] = nil
+ }
+ if gap.node == left {
+ return idMapGapIterator{p, gap.index}
+ }
+ if gap.node == right {
+ return idMapGapIterator{p, gap.index + left.nrSegments + 1}
+ }
+ return gap
+ }
+ // Merge n and either sibling, along with the segment separating the
+ // two, into whichever of the two nodes comes first. This is the
+ // reverse of the non-root splitting case in
+ // node.rebalanceBeforeInsert.
+ var left, right *idMapnode
+ if n.parentIndex > 0 {
+ left = n.prevSibling()
+ right = n
+ } else {
+ left = n
+ right = n.nextSibling()
+ }
+
+ if gap.node == right {
+ gap = idMapGapIterator{left, gap.index + left.nrSegments + 1}
+ }
+ left.keys[left.nrSegments] = p.keys[left.parentIndex]
+ left.values[left.nrSegments] = p.values[left.parentIndex]
+ copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+ copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
+ if left.hasChildren {
+ copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+ for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
+ left.children[i].parent = left
+ left.children[i].parentIndex = i
+ }
+ }
+ left.nrSegments += right.nrSegments + 1
+ copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
+ copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
+ idMapFunctions{}.ClearValue(&p.values[p.nrSegments-1])
+ copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
+ for i := 0; i < p.nrSegments; i++ {
+ p.children[i].parentIndex = i
+ }
+ p.children[p.nrSegments] = nil
+ p.nrSegments--
+
+ n = p
+ }
+}
+
+// An Iterator is conceptually one of:
+//
+// - A pointer to a segment in a set; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Iterators are copyable values and are meaningfully equality-comparable. The
+// zero value of Iterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type idMapIterator struct {
+ // node is the node containing the iterated segment. If the iterator is
+ // terminal, node is nil.
+ node *idMapnode
+
+ // index is the index of the segment in node.keys/values.
+ index int
+}
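
A short sketch of the intended iteration pattern, assuming an already-populated idMapSet s within this package; FirstSegment and the iterator methods below are all defined in this file.

    for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
        fmt.Printf("%v => %v\n", seg.Range(), seg.Value())
    }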
+
+// Ok returns true if the iterator is not terminal. All other methods are only
+// valid for non-terminal iterators.
+func (seg idMapIterator) Ok() bool {
+ return seg.node != nil
+}
+
+// Range returns the iterated segment's range key.
+func (seg idMapIterator) Range() idMapRange {
+ return seg.node.keys[seg.index]
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+func (seg idMapIterator) Start() uint32 {
+ return seg.node.keys[seg.index].Start
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+func (seg idMapIterator) End() uint32 {
+ return seg.node.keys[seg.index].End
+}
+
+// SetRangeUnchecked mutates the iterated segment's range key. This operation
+// does not invalidate any iterators.
+//
+// Preconditions:
+//
+// - r.Length() > 0.
+//
+// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
+// then r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
+// r.Start >= seg.PrevSegment().End().
+func (seg idMapIterator) SetRangeUnchecked(r idMapRange) {
+ seg.node.keys[seg.index] = r
+}
+
+// SetRange mutates the iterated segment's range key. If the new range would
+// cause the iterated segment to overlap another segment, or if the new range
+// is invalid, SetRange panics. This operation does not invalidate any
+// iterators.
+func (seg idMapIterator) SetRange(r idMapRange) {
+ if r.Length() <= 0 {
+ panic(fmt.Sprintf("invalid segment range %v", r))
+ }
+ if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() {
+ panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range()))
+ }
+ if next := seg.NextSegment(); next.Ok() && r.End > next.Start() {
+ panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range()))
+ }
+ seg.SetRangeUnchecked(r)
+}
+
+// SetStartUnchecked mutates the iterated segment's start. This operation does
+// not invalidate any iterators.
+//
+// Preconditions: The new start must be valid: start < seg.End(); if
+// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
+func (seg idMapIterator) SetStartUnchecked(start uint32) {
+ seg.node.keys[seg.index].Start = start
+}
+
+// SetStart mutates the iterated segment's start. If the new start value would
+// cause the iterated segment to overlap another segment, or would result in an
+// invalid range, SetStart panics. This operation does not invalidate any
+// iterators.
+func (seg idMapIterator) SetStart(start uint32) {
+ if start >= seg.End() {
+ panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
+ }
+ if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() {
+ panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range()))
+ }
+ seg.SetStartUnchecked(start)
+}
+
+// SetEndUnchecked mutates the iterated segment's end. This operation does not
+// invalidate any iterators.
+//
+// Preconditions: The new end must be valid: end > seg.Start(); if
+// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
+func (seg idMapIterator) SetEndUnchecked(end uint32) {
+ seg.node.keys[seg.index].End = end
+}
+
+// SetEnd mutates the iterated segment's end. If the new end value would cause
+// the iterated segment to overlap another segment, or would result in an
+// invalid range, SetEnd panics. This operation does not invalidate any
+// iterators.
+func (seg idMapIterator) SetEnd(end uint32) {
+ if end <= seg.Start() {
+ panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range()))
+ }
+ if next := seg.NextSegment(); next.Ok() && end > next.Start() {
+ panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range()))
+ }
+ seg.SetEndUnchecked(end)
+}
+
+// Value returns a copy of the iterated segment's value.
+func (seg idMapIterator) Value() uint32 {
+ return seg.node.values[seg.index]
+}
+
+// ValuePtr returns a pointer to the iterated segment's value. The pointer is
+// invalidated if the iterator is invalidated. This operation does not
+// invalidate any iterators.
+func (seg idMapIterator) ValuePtr() *uint32 {
+ return &seg.node.values[seg.index]
+}
+
+// SetValue mutates the iterated segment's value. This operation does not
+// invalidate any iterators.
+func (seg idMapIterator) SetValue(val uint32) {
+ seg.node.values[seg.index] = val
+}
+
+// PrevSegment returns the iterated segment's predecessor. If there is no
+// preceding segment, PrevSegment returns a terminal iterator.
+func (seg idMapIterator) PrevSegment() idMapIterator {
+ if seg.node.hasChildren {
+ return seg.node.children[seg.index].lastSegment()
+ }
+ if seg.index > 0 {
+ return idMapIterator{seg.node, seg.index - 1}
+ }
+ if seg.node.parent == nil {
+ return idMapIterator{}
+ }
+ return idMapsegmentBeforePosition(seg.node.parent, seg.node.parentIndex)
+}
+
+// NextSegment returns the iterated segment's successor. If there is no
+// succeeding segment, NextSegment returns a terminal iterator.
+func (seg idMapIterator) NextSegment() idMapIterator {
+ if seg.node.hasChildren {
+ return seg.node.children[seg.index+1].firstSegment()
+ }
+ if seg.index < seg.node.nrSegments-1 {
+ return idMapIterator{seg.node, seg.index + 1}
+ }
+ if seg.node.parent == nil {
+ return idMapIterator{}
+ }
+ return idMapsegmentAfterPosition(seg.node.parent, seg.node.parentIndex)
+}
+
+// PrevGap returns the gap immediately before the iterated segment.
+func (seg idMapIterator) PrevGap() idMapGapIterator {
+ if seg.node.hasChildren {
+
+ return seg.node.children[seg.index].lastSegment().NextGap()
+ }
+ return idMapGapIterator{seg.node, seg.index}
+}
+
+// NextGap returns the gap immediately after the iterated segment.
+func (seg idMapIterator) NextGap() idMapGapIterator {
+ if seg.node.hasChildren {
+ return seg.node.children[seg.index+1].firstSegment().PrevGap()
+ }
+ return idMapGapIterator{seg.node, seg.index + 1}
+}
+
+// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent,
+// or the gap before the iterated segment otherwise. If seg.Start() ==
+// Functions.MinKey(), PrevNonEmpty will return two terminal iterators.
+// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be
+// non-terminal.
+func (seg idMapIterator) PrevNonEmpty() (idMapIterator, idMapGapIterator) {
+ gap := seg.PrevGap()
+ if gap.Range().Length() != 0 {
+ return idMapIterator{}, gap
+ }
+ return gap.PrevSegment(), idMapGapIterator{}
+}
+
+// NextNonEmpty returns the iterated segment's successor if it is adjacent, or
+// the gap after the iterated segment otherwise. If seg.End() ==
+// Functions.MaxKey(), NextNonEmpty will return two terminal iterators.
+// Otherwise, exactly one of the iterators returned by NextNonEmpty will be
+// non-terminal.
+func (seg idMapIterator) NextNonEmpty() (idMapIterator, idMapGapIterator) {
+ gap := seg.NextGap()
+ if gap.Range().Length() != 0 {
+ return idMapIterator{}, gap
+ }
+ return gap.NextSegment(), idMapGapIterator{}
+}
+
+// A GapIterator is conceptually one of:
+//
+// - A pointer to a position between two segments, before the first segment, or
+// after the last segment in a set, called a *gap*; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Note that the gap between two adjacent segments exists (iterators to it are
+// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true
+// for such gaps. An empty set contains a single gap, spanning the entire range
+// of the set's keys.
+//
+// GapIterators are copyable values and are meaningfully equality-comparable.
+// The zero value of GapIterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type idMapGapIterator struct {
+ // The representation of a GapIterator is identical to that of an Iterator,
+ // except that index corresponds to positions between segments in the same
+ // way as for node.children (see comment for node.nrSegments).
+ node *idMapnode
+ index int
+}
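
A hypothetical sketch of walking gaps to look for free ID space, assuming a set s and that idMapRange.Length returns a uint32 (as for ranges generated over uint32 keys); FirstGap, NextGap, and Range are defined in this file.

    need := uint32(1 << 16)
    for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
        if gap.Range().Length() >= need {
            // gap.Range().Start could anchor a new mapping of size need.
            break
        }
    }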
+
+// Ok returns true if the iterator is not terminal. All other methods are only
+// valid for non-terminal iterators.
+func (gap idMapGapIterator) Ok() bool {
+ return gap.node != nil
+}
+
+// Range returns the range spanned by the iterated gap.
+func (gap idMapGapIterator) Range() idMapRange {
+ return idMapRange{gap.Start(), gap.End()}
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+func (gap idMapGapIterator) Start() uint32 {
+ if ps := gap.PrevSegment(); ps.Ok() {
+ return ps.End()
+ }
+ return idMapFunctions{}.MinKey()
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+func (gap idMapGapIterator) End() uint32 {
+ if ns := gap.NextSegment(); ns.Ok() {
+ return ns.Start()
+ }
+ return idMapFunctions{}.MaxKey()
+}
+
+// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is
+// between two adjacent segments).
+func (gap idMapGapIterator) IsEmpty() bool {
+ return gap.Range().Length() == 0
+}
+
+// PrevSegment returns the segment immediately before the iterated gap. If no
+// such segment exists, PrevSegment returns a terminal iterator.
+func (gap idMapGapIterator) PrevSegment() idMapIterator {
+ return idMapsegmentBeforePosition(gap.node, gap.index)
+}
+
+// NextSegment returns the segment immediately after the iterated gap. If no
+// such segment exists, NextSegment returns a terminal iterator.
+func (gap idMapGapIterator) NextSegment() idMapIterator {
+ return idMapsegmentAfterPosition(gap.node, gap.index)
+}
+
+// PrevGap returns the iterated gap's predecessor. If no such gap exists,
+// PrevGap returns a terminal iterator.
+func (gap idMapGapIterator) PrevGap() idMapGapIterator {
+ seg := gap.PrevSegment()
+ if !seg.Ok() {
+ return idMapGapIterator{}
+ }
+ return seg.PrevGap()
+}
+
+// NextGap returns the iterated gap's successor. If no such gap exists, NextGap
+// returns a terminal iterator.
+func (gap idMapGapIterator) NextGap() idMapGapIterator {
+ seg := gap.NextSegment()
+ if !seg.Ok() {
+ return idMapGapIterator{}
+ }
+ return seg.NextGap()
+}
+
+// segmentBeforePosition returns the predecessor segment of the position given
+// by n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentBeforePosition returns a terminal iterator.
+func idMapsegmentBeforePosition(n *idMapnode, i int) idMapIterator {
+ for i == 0 {
+ if n.parent == nil {
+ return idMapIterator{}
+ }
+ n, i = n.parent, n.parentIndex
+ }
+ return idMapIterator{n, i - 1}
+}
+
+// segmentAfterPosition returns the successor segment of the position given by
+// n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentAfterPosition returns a terminal iterator.
+func idMapsegmentAfterPosition(n *idMapnode, i int) idMapIterator {
+ for i == n.nrSegments {
+ if n.parent == nil {
+ return idMapIterator{}
+ }
+ n, i = n.parent, n.parentIndex
+ }
+ return idMapIterator{n, i}
+}
+
+func idMapzeroValueSlice(slice []uint32) {
+
+ for i := range slice {
+ idMapFunctions{}.ClearValue(&slice[i])
+ }
+}
+
+func idMapzeroNodeSlice(slice []*idMapnode) {
+ for i := range slice {
+ slice[i] = nil
+ }
+}
+
+// String stringifies a Set for debugging.
+func (s *idMapSet) String() string {
+ return s.root.String()
+}
+
+// String stringifies a node (and all of its children) for debugging.
+func (n *idMapnode) String() string {
+ var buf bytes.Buffer
+ n.writeDebugString(&buf, "")
+ return buf.String()
+}
+
+func (n *idMapnode) writeDebugString(buf *bytes.Buffer, prefix string) {
+ if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+ buf.WriteString(prefix)
+ buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren))
+ }
+ for i := 0; i < n.nrSegments; i++ {
+ if child := n.children[i]; child != nil {
+ cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+ if child.parent != n || child.parentIndex != i {
+ buf.WriteString(cprefix)
+ buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i))
+ }
+ child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
+ }
+ buf.WriteString(prefix)
+ buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+ }
+ if child := n.children[n.nrSegments]; child != nil {
+ child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
+ }
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// +stateify savable
+type idMapSegmentDataSlices struct {
+ Start []uint32
+ End []uint32
+ Values []uint32
+}
+
+// ExportSortedSlices returns a copy of all segments in the given set, in
+// ascending key order.
+func (s *idMapSet) ExportSortedSlices() *idMapSegmentDataSlices {
+ var sds idMapSegmentDataSlices
+ for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ sds.Start = append(sds.Start, seg.Start())
+ sds.End = append(sds.End, seg.End())
+ sds.Values = append(sds.Values, seg.Value())
+ }
+ sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
+ sds.End = sds.End[:len(sds.End):len(sds.End)]
+ sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
+ return &sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices.
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
+func (s *idMapSet) ImportSortedSlices(sds *idMapSegmentDataSlices) error {
+ if !s.IsEmpty() {
+ return fmt.Errorf("cannot import into non-empty set %v", s)
+ }
+ gap := s.FirstGap()
+ for i := range sds.Start {
+ r := idMapRange{sds.Start[i], sds.End[i]}
+ if !gap.Range().IsSupersetOf(r) {
+ return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+ }
+ gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
+ }
+ return nil
+}
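
The saveRoot and loadRoot helpers below rely on this pair. A minimal sketch of the round trip, with s assumed to be an existing idMapSet:

    sds := s.ExportSortedSlices() // parallel Start/End/Values slices
    var restored idMapSet
    if err := restored.ImportSortedSlices(sds); err != nil {
        panic(err) // only possible if sds is malformed or restored is non-empty
    }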
+func (s *idMapSet) saveRoot() *idMapSegmentDataSlices {
+ return s.ExportSortedSlices()
+}
+
+func (s *idMapSet) loadRoot(sds *idMapSegmentDataSlices) {
+ if err := s.ImportSortedSlices(sds); err != nil {
+ panic(err)
+ }
+}
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
new file mode 100644
index 000000000..a40dd668f
--- /dev/null
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -0,0 +1,129 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "math"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// A UserNamespace represents a user namespace. See user_namespaces(7) for
+// details.
+//
+// +stateify savable
+type UserNamespace struct {
+ // parent is this namespace's parent. If this is the root namespace, parent
+ // is nil. The parent pointer is immutable.
+ parent *UserNamespace
+
+ // owner is the effective UID of the namespace's creator in the root
+ // namespace. owner is immutable.
+ owner KUID
+
+ // mu protects the following fields.
+ //
+ // If mu will be locked in multiple UserNamespaces, it must be locked in
+ // descendant namespaces before ancestors.
+ mu sync.Mutex `state:"nosave"`
+
+ // Mappings of user/group IDs between this namespace and its parent.
+ //
+ // All ID maps, once set, cannot be changed. This means that successful
+ // UID/GID translations cannot be racy.
+ uidMapFromParent idMapSet
+ uidMapToParent idMapSet
+ gidMapFromParent idMapSet
+ gidMapToParent idMapSet
+
+ // TODO(b/27454212): Support disabling setgroups(2).
+}
+
+// NewRootUserNamespace returns a UserNamespace that is appropriate for a
+// system's root user namespace.
+func NewRootUserNamespace() *UserNamespace {
+ var ns UserNamespace
+ // """
+ // The initial user namespace has no parent namespace, but, for
+ // consistency, the kernel provides dummy user and group ID mapping files
+ // for this namespace. Looking at the uid_map file (gid_map is the same)
+ // from a shell in the initial namespace shows:
+ //
+ // $ cat /proc/$$/uid_map
+ // 0 0 4294967295
+ // """ - user_namespaces(7)
+ for _, m := range []*idMapSet{
+ &ns.uidMapFromParent,
+ &ns.uidMapToParent,
+ &ns.gidMapFromParent,
+ &ns.gidMapToParent,
+ } {
+ if !m.Add(idMapRange{0, math.MaxUint32}, 0) {
+ panic("Failed to insert into empty ID map")
+ }
+ }
+ return &ns
+}
+
+// Root returns the root of the user namespace tree containing ns.
+func (ns *UserNamespace) Root() *UserNamespace {
+ for ns.parent != nil {
+ ns = ns.parent
+ }
+ return ns
+}
+
+// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
+// namespaces." - user_namespaces(7)
+const maxUserNamespaceDepth = 32
+
+func (ns *UserNamespace) depth() int {
+ var i int
+ for ns != nil {
+ i++
+ ns = ns.parent
+ }
+ return i
+}
+
+// NewChildUserNamespace returns a new user namespace created by a caller with
+// credentials c.
+func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) {
+ if c.UserNamespace.depth() >= maxUserNamespaceDepth {
+ // "... Calls to unshare(2) or clone(2) that would cause this limit to
+ // be exceeded fail with the error EUSERS." - user_namespaces(7)
+ return nil, syserror.EUSERS
+ }
+ // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective
+ // user ID or the effective group ID of the caller does not have a mapping
+ // in the parent namespace (see user_namespaces(7))." - clone(2)
+ // "CLONE_NEWUSER requires that the user ID and group ID of the calling
+ // process are mapped to user IDs and group IDs in the user namespace of
+ // the calling process at the time of the call." - unshare(2)
+ if !c.EffectiveKUID.In(c.UserNamespace).Ok() {
+ return nil, syserror.EPERM
+ }
+ if !c.EffectiveKGID.In(c.UserNamespace).Ok() {
+ return nil, syserror.EPERM
+ }
+ return &UserNamespace{
+ parent: c.UserNamespace,
+ owner: c.EffectiveKUID,
+ // "When a user namespace is created, it starts without a mapping of
+ // user IDs (group IDs) to the parent user namespace." -
+ // user_namespaces(7)
+ }, nil
+}
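
An illustrative sketch, not taken from this file's callers, of building the namespace hierarchy; creds is assumed to be a *Credentials whose effective UID and GID are mapped in creds.UserNamespace.

    root := NewRootUserNamespace() // identity-maps [0, MaxUint32)
    child, err := creds.NewChildUserNamespace()
    if err != nil {
        // syserror.EUSERS if nesting would exceed 32 levels, or
        // syserror.EPERM if creds' effective IDs are unmapped.
    }
    _, _ = root, child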
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
new file mode 100644
index 000000000..a1a084eab
--- /dev/null
+++ b/pkg/sentry/kernel/context.go
@@ -0,0 +1,135 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the kernel package's type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxCanTrace is a Context.Value key for a function with the same
+ // signature and semantics as kernel.Task.CanTrace.
+ CtxCanTrace contextID = iota
+
+ // CtxKernel is a Context.Value key for a Kernel.
+ CtxKernel
+
+ // CtxPIDNamespace is a Context.Value key for a PIDNamespace.
+ CtxPIDNamespace
+
+ // CtxTask is a Context.Value key for a Task.
+ CtxTask
+
+ // CtxUTSNamespace is a Context.Value key for a UTSNamespace.
+ CtxUTSNamespace
+
+	// CtxIPCNamespace is a Context.Value key for an IPCNamespace.
+ CtxIPCNamespace
+)
+
+// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense
+// as kernel.Task.CanTrace.
+func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool {
+ if v := ctx.Value(CtxCanTrace); v != nil {
+ return v.(func(*Task, bool) bool)(t, attach)
+ }
+ return false
+}
+
+// KernelFromContext returns the Kernel in which ctx is executing, or nil if
+// there is no such Kernel.
+func KernelFromContext(ctx context.Context) *Kernel {
+ if v := ctx.Value(CtxKernel); v != nil {
+ return v.(*Kernel)
+ }
+ return nil
+}
+
+// PIDNamespaceFromContext returns the PID namespace in which ctx is executing,
+// or nil if there is no such PID namespace.
+func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace {
+ if v := ctx.Value(CtxPIDNamespace); v != nil {
+ return v.(*PIDNamespace)
+ }
+ return nil
+}
+
+// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing,
+// or nil if there is no such UTS namespace.
+func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
+ if v := ctx.Value(CtxUTSNamespace); v != nil {
+ return v.(*UTSNamespace)
+ }
+ return nil
+}
+
+// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
+// or nil if there is no such IPC namespace.
+func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
+ if v := ctx.Value(CtxIPCNamespace); v != nil {
+ return v.(*IPCNamespace)
+ }
+ return nil
+}
+
+// TaskFromContext returns the Task associated with ctx, or nil if there is no
+// such Task.
+func TaskFromContext(ctx context.Context) *Task {
+ if v := ctx.Value(CtxTask); v != nil {
+ return v.(*Task)
+ }
+ return nil
+}
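
A brief sketch of how these accessors are meant to be used; ctx is assumed to be a Task or a Task.AsyncContext(), both of which supply these keys through their Value methods.

    if k := KernelFromContext(ctx); k != nil {
        // Kernel-wide state is reachable from plain context.Context code.
    }
    if TaskFromContext(ctx) == nil {
        // ctx is not associated with a task goroutine.
    }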
+
+// AsyncContext returns a context.Context that may be used by goroutines that
+// do work on behalf of t and therefore share its contextual values, but are
+// not t's task goroutine (e.g. asynchronous I/O).
+func (t *Task) AsyncContext() context.Context {
+ return taskAsyncContext{t: t}
+}
+
+type taskAsyncContext struct {
+ context.NoopSleeper
+ t *Task
+}
+
+// Debugf implements log.Logger.Debugf.
+func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+ ctx.t.Debugf(format, v...)
+}
+
+// Infof implements log.Logger.Infof.
+func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+ ctx.t.Infof(format, v...)
+}
+
+// Warningf implements log.Logger.Warningf.
+func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+ ctx.t.Warningf(format, v...)
+}
+
+// IsLogging implements log.Logger.IsLogging.
+func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+ return ctx.t.IsLogging(level)
+}
+
+// Value implements context.Context.Value.
+func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+ return ctx.t.Value(key)
+}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
new file mode 100644
index 000000000..bbacba1f4
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -0,0 +1,473 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package epoll provides an implementation of Linux's IO event notification
+// facility. See epoll(7) for more details.
+package epoll
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Event describes the event mask that was observed and the user data to be
+// returned when one of the events occurs. It uses this layout to match the
+// Linux format, avoiding extra copying/allocation when writing events to
+// userspace.
+type Event struct {
+ // Events is the event mask containing the set of events that have been
+ // observed on an entry.
+ Events uint32
+
+ // Data is an opaque 64-bit value provided by the caller when adding the
+ // entry, and returned to the caller when the entry reports an event.
+ Data [2]int32
+}
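
A hedged sketch, not taken from the syscall layer, of one plausible way to round-trip a 64-bit user cookie through Data's two int32 halves (low half first); the actual split used when copying to userspace may differ.

    cookie := uint64(0x1122334455667788)
    ev := Event{
        Events: 0x1, // EPOLLIN
        Data:   [2]int32{int32(uint32(cookie)), int32(uint32(cookie >> 32))},
    }
    back := uint64(uint32(ev.Data[0])) | uint64(uint32(ev.Data[1]))<<32
    _ = back == cookie // true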
+
+// EntryFlags is a bitmask that holds an entry's flags.
+type EntryFlags int
+
+// Valid entry flags.
+const (
+ OneShot EntryFlags = 1 << iota
+ EdgeTriggered
+)
+
+// FileIdentifier identifies a file. We cannot use just the FD because it could
+// potentially be reassigned. We also cannot use just the file pointer because
+// it is possible to have multiple entries for the same file object as long as
+// they are created with different FDs (i.e., the FDs point to the same file).
+//
+// +stateify savable
+type FileIdentifier struct {
+ File *fs.File `state:"wait"`
+ Fd kdefs.FD
+}
+
+// pollEntry holds all the state associated with an event poll entry, that is,
+// a file being observed by an event poll object.
+//
+// +stateify savable
+type pollEntry struct {
+ pollEntryEntry
+ file *refs.WeakRef `state:"manual"`
+ id FileIdentifier `state:"wait"`
+ userData [2]int32
+ waiter waiter.Entry `state:"manual"`
+ mask waiter.EventMask
+ flags EntryFlags
+
+ epoll *EventPoll
+
+	// We cannot save the current list pointer as it points into the EventPoll
+	// struct, and the state framework currently does not support such
+ // in-struct pointers. Instead, EventPoll will properly set this field
+ // in its loading logic.
+ curList *pollEntryList `state:"nosave"`
+}
+
+// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
+// It is called when the file in the weak reference is destroyed; the poll
+// entry is removed in response.
+func (p *pollEntry) WeakRefGone() {
+ p.epoll.RemoveEntry(p.id)
+}
+
+// EventPoll holds all the state associated with an event poll object, that is,
+// collection of files to observe and their current state.
+//
+// +stateify savable
+type EventPoll struct {
+ fsutil.FilePipeSeek `state:"zerovalue"`
+ fsutil.FileNotDirReaddir `state:"zerovalue"`
+ fsutil.FileNoFsync `state:"zerovalue"`
+ fsutil.FileNoopFlush `state:"zerovalue"`
+ fsutil.FileNoIoctl `state:"zerovalue"`
+ fsutil.FileNoMMap `state:"zerovalue"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+ // Wait queue is used to notify interested parties when the event poll
+ // object itself becomes readable or writable.
+ waiter.Queue `state:"zerovalue"`
+
+	// files is the map of all the files currently being observed; it is
+	// protected by mu.
+ mu sync.Mutex `state:"nosave"`
+ files map[FileIdentifier]*pollEntry
+
+ // listsMu protects manipulation of the lists below. It needs to be a
+ // different lock to avoid circular lock acquisition order involving
+ // the wait queue mutexes and mu. The full order is mu, observed file
+ // wait queue mutex, then listsMu; this allows listsMu to be acquired
+ // when readyCallback is called.
+ //
+ // An entry is always in one of the following lists:
+ // readyList -- when there's a chance that it's ready to have
+ // events delivered to epoll waiters. Given that being
+ // ready is a transient state, the Readiness() and
+ // readEvents() functions always call the entry's file
+ // Readiness() function to confirm it's ready.
+ // waitingList -- when there's no chance that the entry is ready,
+ // so it's waiting for the readyCallback to be called
+ // on it before it gets moved to the readyList.
+ // disabledList -- when the entry is disabled. This happens when
+ // a one-shot entry gets delivered via readEvents().
+ listsMu sync.Mutex `state:"nosave"`
+ readyList pollEntryList
+ waitingList pollEntryList
+ disabledList pollEntryList
+}
+
+// cycleMu is used to serialize all the cycle checks. This is only used when
+// an event poll file is added as an entry to another event poll. Such checks
+// are serialized to avoid lock acquisition order inversion: if a thread is
+// adding A to B, and another thread is adding B to A, each would acquire A's
+// and B's mutexes in reverse order, and could cause deadlocks. Having this
+// lock prevents this by allowing only one check at a time to happen.
+//
+// We do the cycle check to prevent callers from introducing potentially
+// infinite recursions. If a caller were to add A to B and then B to A, for
+// event poll A to know if it's readable, it would need to check event poll B,
+// which in turn would need event poll A and so on indefinitely.
+var cycleMu sync.Mutex
+
+// NewEventPoll allocates and initializes a new event poll object.
+func NewEventPoll(ctx context.Context) *fs.File {
+ // name matches fs/eventpoll.c:epoll_create1.
+ dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
+ return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
+ files: make(map[FileIdentifier]*pollEntry),
+ })
+}
+
+// Release implements fs.FileOperations.Release.
+func (e *EventPoll) Release() {
+ // We need to take the lock now because files may be attempting to
+ // remove entries in parallel if they get destroyed.
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Go through all entries and clean up.
+ for _, entry := range e.files {
+ entry.id.File.EventUnregister(&entry.waiter)
+ entry.file.Drop()
+ }
+}
+
+// Read implements fs.FileOperations.Read.
+func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syscall.ENOSYS
+}
+
+// Write implements fs.FileOperations.Write.
+func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syscall.ENOSYS
+}
+
+// eventsAvailable determines if 'e' has events available for delivery.
+func (e *EventPoll) eventsAvailable() bool {
+ e.listsMu.Lock()
+
+ for it := e.readyList.Front(); it != nil; {
+ entry := it
+ it = it.Next()
+
+ // If the entry is ready, we know 'e' has at least one entry
+ // ready for delivery.
+ ready := entry.id.File.Readiness(entry.mask)
+ if ready != 0 {
+ e.listsMu.Unlock()
+ return true
+ }
+
+ // Entry is not ready, so move it to waiting list.
+ e.readyList.Remove(entry)
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+ }
+
+ e.listsMu.Unlock()
+
+ return false
+}
+
+// Readiness determines if the event poll object is currently readable (i.e.,
+// if there are pending events for delivery).
+func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ if (mask&waiter.EventIn) != 0 && e.eventsAvailable() {
+ ready |= waiter.EventIn
+ }
+
+ return ready
+}
+
+// ReadEvents returns up to max available events.
+func (e *EventPoll) ReadEvents(max int) []Event {
+ var local pollEntryList
+ var ret []Event
+
+ e.listsMu.Lock()
+
+ // Go through all entries we believe may be ready.
+ for it := e.readyList.Front(); it != nil && len(ret) < max; {
+ entry := it
+ it = it.Next()
+
+		// Check the entry's readiness. If it's not really ready, we
+ // just put it back in the waiting list and move on to the next
+ // entry.
+ ready := entry.id.File.Readiness(entry.mask) & entry.mask
+ if ready == 0 {
+ e.readyList.Remove(entry)
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+
+ continue
+ }
+
+ // Add event to the array that will be returned to caller.
+ ret = append(ret, Event{
+ Events: uint32(ready),
+ Data: entry.userData,
+ })
+
+ // The entry is consumed, so we must move it to the disabled
+ // list in case it's one-shot, or back to the wait list if it's
+ // edge-triggered. If it's neither, we leave it in the ready
+ // list so that its readiness can be checked the next time
+ // around; however, we must move it to the end of the list so
+ // that other events can be delivered as well.
+ e.readyList.Remove(entry)
+ if entry.flags&OneShot != 0 {
+ e.disabledList.PushBack(entry)
+ entry.curList = &e.disabledList
+ } else if entry.flags&EdgeTriggered != 0 {
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+ } else {
+ local.PushBack(entry)
+ }
+ }
+
+ e.readyList.PushBackList(&local)
+
+ e.listsMu.Unlock()
+
+ return ret
+}
+
+// readyCallback is called when one of the files we're polling becomes ready. It
+// moves said file to the readyList if it's currently in the waiting list.
+type readyCallback struct{}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (*readyCallback) Callback(w *waiter.Entry) {
+ entry := w.Context.(*pollEntry)
+ e := entry.epoll
+
+ e.listsMu.Lock()
+
+ if entry.curList == &e.waitingList {
+ e.waitingList.Remove(entry)
+ e.readyList.PushBack(entry)
+ entry.curList = &e.readyList
+
+ e.Notify(waiter.EventIn)
+ }
+
+ e.listsMu.Unlock()
+}
+
+// initEntryReadiness initializes the entry's state with regards to its
+// readiness by placing it in the appropriate list and registering for
+// notifications.
+func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
+ // A new entry starts off in the waiting list.
+ e.listsMu.Lock()
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+ e.listsMu.Unlock()
+
+ // Register for event notifications.
+ f := entry.id.File
+ f.EventRegister(&entry.waiter, entry.mask)
+
+ // Check if the file happens to already be in a ready state.
+ ready := f.Readiness(entry.mask) & entry.mask
+ if ready != 0 {
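+		// Method-expression call: Callback never touches its *readyCallback
+		// receiver, so passing nil for it is safe.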
+ (*readyCallback).Callback(nil, &entry.waiter)
+ }
+}
+
+// observes checks if event poll object e is directly or indirectly observing
+// event poll object ep. It uses a bounded recursive depth-first search.
+func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool {
+ // If we reached the maximum depth, we'll consider that we found it
+ // because we don't want to allow chains that are too long.
+ if depthLeft <= 0 {
+ return true
+ }
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Go through each observed file and check if it is or observes ep.
+ for id := range e.files {
+ f, ok := id.File.FileOperations.(*EventPoll)
+ if !ok {
+ continue
+ }
+
+ if f == ep || f.observes(ep, depthLeft-1) {
+ return true
+ }
+ }
+
+ return false
+}
+
+// AddEntry adds a new file to the collection of files observed by e.
+func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
+ // Acquire cycle check lock if another event poll is being added.
+ ep, ok := id.File.FileOperations.(*EventPoll)
+ if ok {
+ cycleMu.Lock()
+ defer cycleMu.Unlock()
+ }
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Fail if the file already has an entry.
+ if _, ok := e.files[id]; ok {
+ return syscall.EEXIST
+ }
+
+ // Check if a cycle would be created. We use 4 as the limit because
+	// that's the value used by Linux and we want to emulate it.
+ if ep != nil {
+ if e == ep {
+ return syscall.EINVAL
+ }
+
+ if ep.observes(e, 4) {
+ return syscall.ELOOP
+ }
+ }
+
+ // Create new entry and add it to map.
+ //
+ // N.B. Even though we are creating a weak reference here, we know it
+ // won't trigger a callback because we hold a reference to the file
+ // throughout the execution of this function.
+ entry := &pollEntry{
+ id: id,
+ userData: data,
+ epoll: e,
+ flags: flags,
+ waiter: waiter.Entry{Callback: &readyCallback{}},
+ mask: mask,
+ }
+ entry.waiter.Context = entry
+ e.files[id] = entry
+ entry.file = refs.NewWeakRef(id.File, entry)
+
+ // Initialize the readiness state of the new entry.
+ e.initEntryReadiness(entry)
+
+ return nil
+}
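
A rough in-package usage sketch of the entry lifecycle; ctx, observedFile (an *fs.File), and fd (its kdefs.FD in the caller's descriptor table) are assumed to exist already.

    epFile := NewEventPoll(ctx)
    ep := epFile.FileOperations.(*EventPoll)
    id := FileIdentifier{File: observedFile, Fd: fd}
    if err := ep.AddEntry(id, OneShot, waiter.EventIn, [2]int32{0, 1}); err != nil {
        // EEXIST, EINVAL, or ELOOP, per the checks above.
    }
    events := ep.ReadEvents(64) // one-shot entries are disabled after delivery
    _ = events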
+
+// UpdateEntry updates the flags, mask and user data associated with a file that
+// is already part of the collection of observed files.
+func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Fail if the file doesn't have an entry.
+ entry, ok := e.files[id]
+ if !ok {
+ return syscall.ENOENT
+ }
+
+ // Unregister the old mask and remove entry from the list it's in, so
+ // readyCallback is guaranteed to not be called on this entry anymore.
+ entry.id.File.EventUnregister(&entry.waiter)
+
+	// Remove entry from whatever list it's in. This ensures that no other
+	// threads have access to this entry, as the only remaining way to find
+	// it is via e.files, and we hold e.mu, which prevents that.
+ e.listsMu.Lock()
+ entry.curList.Remove(entry)
+ e.listsMu.Unlock()
+
+ // Initialize new readiness state.
+ entry.flags = flags
+ entry.mask = mask
+ entry.userData = data
+ e.initEntryReadiness(entry)
+
+ return nil
+}
+
+// RemoveEntry removes a file from the collection of observed files.
+func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Fail if the file doesn't have an entry.
+ entry, ok := e.files[id]
+ if !ok {
+ return syscall.ENOENT
+ }
+
+ // Unregister from file first so that no concurrent attempts will be
+ // made to manipulate the file.
+ entry.id.File.EventUnregister(&entry.waiter)
+
+ // Remove from the current list.
+ e.listsMu.Lock()
+ entry.curList.Remove(entry)
+ entry.curList = nil
+ e.listsMu.Unlock()
+
+ // Remove file from map, and drop weak reference.
+ delete(e.files, id)
+ entry.file.Drop()
+
+ return nil
+}
+
+// UnregisterEpollWaiters removes the epoll waiter objects from the waiting
+// queues. This is different from Release() as the file is not dereferenced.
+func (e *EventPoll) UnregisterEpollWaiters() {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ for _, entry := range e.files {
+ entry.id.File.EventUnregister(&entry.waiter)
+ }
+}
diff --git a/pkg/sentry/kernel/epoll/epoll_list.go b/pkg/sentry/kernel/epoll/epoll_list.go
new file mode 100755
index 000000000..94d5c9e57
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll_list.go
@@ -0,0 +1,173 @@
+package epoll
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type pollEntryElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (pollEntryElementMapper) linkerFor(elem *pollEntry) *pollEntry { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type pollEntryList struct {
+ head *pollEntry
+ tail *pollEntry
+}
+
+// Reset resets list l to the empty state.
+func (l *pollEntryList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *pollEntryList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *pollEntryList) Front() *pollEntry {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *pollEntryList) Back() *pollEntry {
+ return l.tail
+}
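
A minimal sketch of the intrusive-list contract, usable as-is inside this package; no allocation occurs because pollEntry embeds pollEntryEntry.

    var l pollEntryList
    e := &pollEntry{}
    l.PushBack(e)
    if l.Front() == e && l.Back() == e {
        l.Remove(e) // l is empty again
    }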
+
+// PushFront inserts the element e at the front of list l.
+func (l *pollEntryList) PushFront(e *pollEntry) {
+ pollEntryElementMapper{}.linkerFor(e).SetNext(l.head)
+ pollEntryElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ pollEntryElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *pollEntryList) PushBack(e *pollEntry) {
+ pollEntryElementMapper{}.linkerFor(e).SetNext(nil)
+ pollEntryElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ pollEntryElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *pollEntryList) PushBackList(m *pollEntryList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ pollEntryElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ pollEntryElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *pollEntryList) InsertAfter(b, e *pollEntry) {
+ a := pollEntryElementMapper{}.linkerFor(b).Next()
+ pollEntryElementMapper{}.linkerFor(e).SetNext(a)
+ pollEntryElementMapper{}.linkerFor(e).SetPrev(b)
+ pollEntryElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ pollEntryElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *pollEntryList) InsertBefore(a, e *pollEntry) {
+ b := pollEntryElementMapper{}.linkerFor(a).Prev()
+ pollEntryElementMapper{}.linkerFor(e).SetNext(a)
+ pollEntryElementMapper{}.linkerFor(e).SetPrev(b)
+ pollEntryElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ pollEntryElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *pollEntryList) Remove(e *pollEntry) {
+ prev := pollEntryElementMapper{}.linkerFor(e).Prev()
+ next := pollEntryElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ pollEntryElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ pollEntryElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type pollEntryEntry struct {
+ next *pollEntry
+ prev *pollEntry
+}
+
+// Next returns the entry that follows e in the list.
+func (e *pollEntryEntry) Next() *pollEntry {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *pollEntryEntry) Prev() *pollEntry {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *pollEntryEntry) SetNext(elem *pollEntry) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *pollEntryEntry) SetPrev(elem *pollEntry) {
+ e.prev = elem
+}
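+
+// Example (sketch): package-internal use of the intrusive list above. Entries
+// embed pollEntryEntry, so linking them costs no extra allocation; the entry
+// variable is assumed to be an existing *pollEntry.
+//
+//	var ready pollEntryList
+//	ready.PushBack(entry) // O(1) append.
+//	for e := ready.Front(); e != nil; e = e.Next() {
+//		// Visit e.
+//	}
+//	ready.Remove(entry) // Also O(1).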
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
new file mode 100644
index 000000000..4c3c38f9e
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -0,0 +1,49 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package epoll
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// afterLoad is invoked by stateify.
+func (p *pollEntry) afterLoad() {
+ p.waiter = waiter.Entry{Callback: &readyCallback{}}
+ p.waiter.Context = p
+ p.file = refs.NewWeakRef(p.id.File, p)
+ p.id.File.EventRegister(&p.waiter, p.mask)
+}
+
+// afterLoad is invoked by stateify.
+func (e *EventPoll) afterLoad() {
+ e.listsMu.Lock()
+ defer e.listsMu.Unlock()
+
+ for _, ls := range []*pollEntryList{&e.waitingList, &e.readyList, &e.disabledList} {
+ for it := ls.Front(); it != nil; it = it.Next() {
+ it.curList = ls
+ }
+ }
+
+ for it := e.waitingList.Front(); it != nil; it = it.Next() {
+ if it.id.File.Readiness(it.mask) != 0 {
+ e.waitingList.Remove(it)
+ e.readyList.PushBack(it)
+ it.curList = &e.readyList
+ e.Notify(waiter.EventIn)
+ }
+ }
+}
diff --git a/pkg/sentry/kernel/epoll/epoll_state_autogen.go b/pkg/sentry/kernel/epoll/epoll_state_autogen.go
new file mode 100755
index 000000000..a361ff37b
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll_state_autogen.go
@@ -0,0 +1,99 @@
+// automatically generated by stateify.
+
+package epoll
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *FileIdentifier) beforeSave() {}
+func (x *FileIdentifier) save(m state.Map) {
+ x.beforeSave()
+ m.Save("File", &x.File)
+ m.Save("Fd", &x.Fd)
+}
+
+func (x *FileIdentifier) afterLoad() {}
+func (x *FileIdentifier) load(m state.Map) {
+ m.LoadWait("File", &x.File)
+ m.Load("Fd", &x.Fd)
+}
+
+func (x *pollEntry) beforeSave() {}
+func (x *pollEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("pollEntryEntry", &x.pollEntryEntry)
+ m.Save("id", &x.id)
+ m.Save("userData", &x.userData)
+ m.Save("mask", &x.mask)
+ m.Save("flags", &x.flags)
+ m.Save("epoll", &x.epoll)
+}
+
+func (x *pollEntry) load(m state.Map) {
+ m.Load("pollEntryEntry", &x.pollEntryEntry)
+ m.LoadWait("id", &x.id)
+ m.Load("userData", &x.userData)
+ m.Load("mask", &x.mask)
+ m.Load("flags", &x.flags)
+ m.Load("epoll", &x.epoll)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *EventPoll) beforeSave() {}
+func (x *EventPoll) save(m state.Map) {
+ x.beforeSave()
+ if !state.IsZeroValue(x.FilePipeSeek) { m.Failf("FilePipeSeek is %v, expected zero", x.FilePipeSeek) }
+ if !state.IsZeroValue(x.FileNotDirReaddir) { m.Failf("FileNotDirReaddir is %v, expected zero", x.FileNotDirReaddir) }
+ if !state.IsZeroValue(x.FileNoFsync) { m.Failf("FileNoFsync is %v, expected zero", x.FileNoFsync) }
+ if !state.IsZeroValue(x.FileNoopFlush) { m.Failf("FileNoopFlush is %v, expected zero", x.FileNoopFlush) }
+ if !state.IsZeroValue(x.FileNoIoctl) { m.Failf("FileNoIoctl is %v, expected zero", x.FileNoIoctl) }
+ if !state.IsZeroValue(x.FileNoMMap) { m.Failf("FileNoMMap is %v, expected zero", x.FileNoMMap) }
+ if !state.IsZeroValue(x.Queue) { m.Failf("Queue is %v, expected zero", x.Queue) }
+ m.Save("files", &x.files)
+ m.Save("readyList", &x.readyList)
+ m.Save("waitingList", &x.waitingList)
+ m.Save("disabledList", &x.disabledList)
+}
+
+func (x *EventPoll) load(m state.Map) {
+ m.Load("files", &x.files)
+ m.Load("readyList", &x.readyList)
+ m.Load("waitingList", &x.waitingList)
+ m.Load("disabledList", &x.disabledList)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *pollEntryList) beforeSave() {}
+func (x *pollEntryList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *pollEntryList) afterLoad() {}
+func (x *pollEntryList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *pollEntryEntry) beforeSave() {}
+func (x *pollEntryEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *pollEntryEntry) afterLoad() {}
+func (x *pollEntryEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func init() {
+ state.Register("epoll.FileIdentifier", (*FileIdentifier)(nil), state.Fns{Save: (*FileIdentifier).save, Load: (*FileIdentifier).load})
+ state.Register("epoll.pollEntry", (*pollEntry)(nil), state.Fns{Save: (*pollEntry).save, Load: (*pollEntry).load})
+ state.Register("epoll.EventPoll", (*EventPoll)(nil), state.Fns{Save: (*EventPoll).save, Load: (*EventPoll).load})
+ state.Register("epoll.pollEntryList", (*pollEntryList)(nil), state.Fns{Save: (*pollEntryList).save, Load: (*pollEntryList).load})
+ state.Register("epoll.pollEntryEntry", (*pollEntryEntry)(nil), state.Fns{Save: (*pollEntryEntry).save, Load: (*pollEntryEntry).load})
+}
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
new file mode 100644
index 000000000..2f900be38
--- /dev/null
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -0,0 +1,283 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package eventfd provides an implementation of Linux's file-based event
+// notification.
+package eventfd
+
+import (
+ "math"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// EventOperations represents an event with the semantics of Linux's file-based event
+// notification (eventfd). Eventfds are usually internal to the Sentry but in certain
+// situations they may be converted into a host-backed eventfd.
+//
+// +stateify savable
+type EventOperations struct {
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FilePipeSeek `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileNoFsync `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+ // Mutex that protects accesses to the fields of this event.
+ mu sync.Mutex `state:"nosave"`
+
+ // Queue is used to notify interested parties when the event object
+ // becomes readable or writable.
+ wq waiter.Queue `state:"zerovalue"`
+
+ // val is the current value of the event counter.
+ val uint64
+
+ // semMode specifies whether the event is in "semaphore" mode.
+ semMode bool
+
+ // hostfd is the host eventfd's file descriptor if this event has been
+ // passed through to the host, or -1 otherwise.
+ hostfd int
+}
+
+// New creates a new event object with the supplied initial value and mode.
+func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
+ // name matches fs/eventfd.c:eventfd_file_create.
+ dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]")
+ return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
+ val: initVal,
+ semMode: semMode,
+ hostfd: -1,
+ })
+}
+
+// HostFD returns the host eventfd associated with this event.
+func (e *EventOperations) HostFD() (int, error) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ if e.hostfd >= 0 {
+ return e.hostfd, nil
+ }
+
+ flags := linux.EFD_NONBLOCK
+ if e.semMode {
+ flags |= linux.EFD_SEMAPHORE
+ }
+
+ fd, _, err := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(e.val), uintptr(flags), 0)
+ if err != 0 {
+ return -1, err
+ }
+
+ if err := fdnotifier.AddFD(int32(fd), &e.wq); err != nil {
+ syscall.Close(int(fd))
+ return -1, err
+ }
+
+ e.hostfd = int(fd)
+ return e.hostfd, nil
+}
+
+// Release implements fs.FileOperations.Release.
+func (e *EventOperations) Release() {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ if e.hostfd >= 0 {
+ fdnotifier.RemoveFD(int32(e.hostfd))
+ syscall.Close(e.hostfd)
+ e.hostfd = -1
+ }
+}
+
+// Read implements fs.FileOperations.Read.
+func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+ if dst.NumBytes() < 8 {
+ return 0, syscall.EINVAL
+ }
+ if err := e.read(ctx, dst); err != nil {
+ return 0, err
+ }
+ return 8, nil
+}
+
+// Write implements fs.FileOperations.Write.
+func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+ if src.NumBytes() < 8 {
+ return 0, syscall.EINVAL
+ }
+ if err := e.write(ctx, src); err != nil {
+ return 0, err
+ }
+ return 8, nil
+}
+
+// Must be called with e.mu locked.
+func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) error {
+ var buf [8]byte
+
+ if _, err := syscall.Read(e.hostfd, buf[:]); err != nil {
+ if err == syscall.EWOULDBLOCK {
+ return syserror.ErrWouldBlock
+ }
+ return err
+ }
+
+ _, err := dst.CopyOut(ctx, buf[:])
+ return err
+}
+
+func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error {
+ e.mu.Lock()
+
+ if e.hostfd >= 0 {
+ defer e.mu.Unlock()
+ return e.hostRead(ctx, dst)
+ }
+
+ // We can't complete the read if the value is currently zero.
+ if e.val == 0 {
+ e.mu.Unlock()
+ return syserror.ErrWouldBlock
+ }
+
+ // Update the value based on the mode the event is operating in.
+ var val uint64
+ if e.semMode {
+ val = 1
+ // Consistent with Linux, this is done even if writing to memory fails.
+ e.val--
+ } else {
+ val = e.val
+ e.val = 0
+ }
+
+ e.mu.Unlock()
+
+ // Notify writers. We do this even if we were already writable because
+ // it is possible that a writer is waiting to write the maximum value
+ // to the event.
+ e.wq.Notify(waiter.EventOut)
+
+ var buf [8]byte
+ usermem.ByteOrder.PutUint64(buf[:], val)
+ _, err := dst.CopyOut(ctx, buf[:])
+ return err
+}
+
+// Must be called with e.mu locked.
+func (e *EventOperations) hostWrite(val uint64) error {
+ var buf [8]byte
+ usermem.ByteOrder.PutUint64(buf[:], val)
+ _, err := syscall.Write(e.hostfd, buf[:])
+ if err == syscall.EWOULDBLOCK {
+ return syserror.ErrWouldBlock
+ }
+ return err
+}
+
+func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error {
+ var buf [8]byte
+ if _, err := src.CopyIn(ctx, buf[:]); err != nil {
+ return err
+ }
+ val := usermem.ByteOrder.Uint64(buf[:])
+
+ return e.Signal(val)
+}
+
+// Signal is an internal function to signal the event fd.
+func (e *EventOperations) Signal(val uint64) error {
+ if val == math.MaxUint64 {
+ return syscall.EINVAL
+ }
+
+ e.mu.Lock()
+
+ if e.hostfd >= 0 {
+ defer e.mu.Unlock()
+ return e.hostWrite(val)
+ }
+
+ // We only allow writes that won't cause the value to go over the max
+ // uint64 minus 1.
+ if val > math.MaxUint64-1-e.val {
+ e.mu.Unlock()
+ return syserror.ErrWouldBlock
+ }
+
+ e.val += val
+ e.mu.Unlock()
+
+ // Always trigger a notification.
+ e.wq.Notify(waiter.EventIn)
+
+ return nil
+}
+
+// Readiness returns the ready events for the event fd.
+func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ e.mu.Lock()
+ if e.hostfd >= 0 {
+ defer e.mu.Unlock()
+ return fdnotifier.NonBlockingPoll(int32(e.hostfd), mask)
+ }
+
+ ready := waiter.EventMask(0)
+ if e.val > 0 {
+ ready |= waiter.EventIn
+ }
+
+ if e.val < math.MaxUint64-1 {
+ ready |= waiter.EventOut
+ }
+ e.mu.Unlock()
+
+ return mask & ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (e *EventOperations) EventRegister(entry *waiter.Entry, mask waiter.EventMask) {
+ e.wq.EventRegister(entry, mask)
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ if e.hostfd >= 0 {
+ fdnotifier.UpdateFD(int32(e.hostfd))
+ }
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (e *EventOperations) EventUnregister(entry *waiter.Entry) {
+ e.wq.EventUnregister(entry)
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ if e.hostfd >= 0 {
+ fdnotifier.UpdateFD(int32(e.hostfd))
+ }
+}
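+
+// Example (sketch): creating and signaling an event from inside the Sentry.
+// It assumes ctx is a valid context.Context, that the caller keeps the
+// returned file referenced, and that fs.File exposes the FileOperations that
+// New installed.
+//
+//	file := New(ctx, 0 /* initVal */, false /* semMode */)
+//	defer file.DecRef()
+//	ev := file.FileOperations.(*EventOperations)
+//	if err := ev.Signal(1); err != nil {
+//		return err // EINVAL for MaxUint64, ErrWouldBlock on overflow.
+//	}
+//	// An 8-byte read now observes the counter value and resets it to zero
+//	// (or decrements it by one in semaphore mode).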
diff --git a/pkg/sentry/kernel/eventfd/eventfd_state_autogen.go b/pkg/sentry/kernel/eventfd/eventfd_state_autogen.go
new file mode 100755
index 000000000..922ff1b73
--- /dev/null
+++ b/pkg/sentry/kernel/eventfd/eventfd_state_autogen.go
@@ -0,0 +1,27 @@
+// automatically generated by stateify.
+
+package eventfd
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *EventOperations) beforeSave() {}
+func (x *EventOperations) save(m state.Map) {
+ x.beforeSave()
+ if !state.IsZeroValue(x.wq) { m.Failf("wq is %v, expected zero", x.wq) }
+ m.Save("val", &x.val)
+ m.Save("semMode", &x.semMode)
+ m.Save("hostfd", &x.hostfd)
+}
+
+func (x *EventOperations) afterLoad() {}
+func (x *EventOperations) load(m state.Map) {
+ m.Load("val", &x.val)
+ m.Load("semMode", &x.semMode)
+ m.Load("hostfd", &x.hostfd)
+}
+
+func init() {
+ state.Register("eventfd.EventOperations", (*EventOperations)(nil), state.Fns{Save: (*EventOperations).save, Load: (*EventOperations).load})
+}
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
new file mode 100644
index 000000000..84cd08501
--- /dev/null
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -0,0 +1,148 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fasync provides FIOASYNC related functionality.
+package fasync
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// New creates a new FileAsync.
+func New() fs.FileAsync {
+ return &FileAsync{}
+}
+
+// FileAsync sends signals when the registered file is ready for IO.
+//
+// +stateify savable
+type FileAsync struct {
+ mu sync.Mutex `state:"nosave"`
+ e waiter.Entry
+ requester *auth.Credentials
+
+ // Only one of the following is allowed to be non-nil.
+ recipientPG *kernel.ProcessGroup
+ recipientTG *kernel.ThreadGroup
+ recipientT *kernel.Task
+}
+
+// Callback sends a signal.
+func (a *FileAsync) Callback(e *waiter.Entry) {
+ a.mu.Lock()
+ if a.e.Callback == nil {
+ a.mu.Unlock()
+ return
+ }
+ t := a.recipientT
+ tg := a.recipientTG
+ if a.recipientPG != nil {
+ tg = a.recipientPG.Originator()
+ }
+ if tg != nil {
+ t = tg.Leader()
+ }
+ if t == nil {
+ // No recipient has been registered.
+ a.mu.Unlock()
+ return
+ }
+ c := t.Credentials()
+ // Logic from sigio_perm in fs/fcntl.c.
+ if a.requester.EffectiveKUID == 0 ||
+ a.requester.EffectiveKUID == c.SavedKUID ||
+ a.requester.EffectiveKUID == c.RealKUID ||
+ a.requester.RealKUID == c.SavedKUID ||
+ a.requester.RealKUID == c.RealKUID {
+ t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO))
+ }
+ a.mu.Unlock()
+}
+
+// Register sets the file which will be monitored for IO events.
+//
+// The file must not be currently registered.
+func (a *FileAsync) Register(w waiter.Waitable) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ if a.e.Callback != nil {
+ panic("registering already registered file")
+ }
+
+ a.e.Callback = a
+ w.EventRegister(&a.e, waiter.EventIn|waiter.EventOut|waiter.EventErr|waiter.EventHUp)
+}
+
+// Unregister stops monitoring a file.
+//
+// The file must be currently registered.
+func (a *FileAsync) Unregister(w waiter.Waitable) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ if a.e.Callback == nil {
+ panic("unregistering unregistered file")
+ }
+
+ w.EventUnregister(&a.e)
+ a.e.Callback = nil
+}
+
+// Owner returns who is currently getting signals. All return values will be
+// nil if no one is set to receive signals.
+func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ return a.recipientT, a.recipientTG, a.recipientPG
+}
+
+// SetOwnerTask sets the owner (who will receive signals) to a specified task.
+// Only this owner will receive signals.
+func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ a.requester = requester.Credentials()
+ a.recipientT = recipient
+ a.recipientTG = nil
+ a.recipientPG = nil
+}
+
+// SetOwnerThreadGroup sets the owner (who will receive signals) to a specified
+// thread group. Only this owner will receive signals.
+func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ a.requester = requester.Credentials()
+ a.recipientT = nil
+ a.recipientTG = recipient
+ a.recipientPG = nil
+}
+
+// SetOwnerProcessGroup sets the owner (who will receive signals) to a
+// specified process group. Only this owner will receive signals.
+func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ a.requester = requester.Credentials()
+ a.recipientT = nil
+ a.recipientTG = nil
+ a.recipientPG = recipient
+}
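+
+// Example (sketch): enabling SIGIO delivery for a file, roughly what the
+// FIOASYNC/F_SETOWN paths arrange. It assumes t is the requesting
+// *kernel.Task and that file is a waiter.Waitable (as *fs.File is).
+//
+//	a := New().(*FileAsync)
+//	a.Register(file)     // Watch for IN/OUT/ERR/HUP events.
+//	a.SetOwnerTask(t, t) // Deliver SIGIO to t itself.
+//	...
+//	a.Unregister(file)   // Must still be registered; see the panics above.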
diff --git a/pkg/sentry/kernel/fasync/fasync_state_autogen.go b/pkg/sentry/kernel/fasync/fasync_state_autogen.go
new file mode 100755
index 000000000..e162e0033
--- /dev/null
+++ b/pkg/sentry/kernel/fasync/fasync_state_autogen.go
@@ -0,0 +1,30 @@
+// automatically generated by stateify.
+
+package fasync
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *FileAsync) beforeSave() {}
+func (x *FileAsync) save(m state.Map) {
+ x.beforeSave()
+ m.Save("e", &x.e)
+ m.Save("requester", &x.requester)
+ m.Save("recipientPG", &x.recipientPG)
+ m.Save("recipientTG", &x.recipientTG)
+ m.Save("recipientT", &x.recipientT)
+}
+
+func (x *FileAsync) afterLoad() {}
+func (x *FileAsync) load(m state.Map) {
+ m.Load("e", &x.e)
+ m.Load("requester", &x.requester)
+ m.Load("recipientPG", &x.recipientPG)
+ m.Load("recipientTG", &x.recipientTG)
+ m.Load("recipientT", &x.recipientT)
+}
+
+func init() {
+ state.Register("fasync.FileAsync", (*FileAsync)(nil), state.Fns{Save: (*FileAsync).save, Load: (*FileAsync).load})
+}
diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go
new file mode 100644
index 000000000..c5636d233
--- /dev/null
+++ b/pkg/sentry/kernel/fd_map.go
@@ -0,0 +1,364 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// FDs is an ordering of FDs that can be made stable.
+type FDs []kdefs.FD
+
+func (f FDs) Len() int {
+ return len(f)
+}
+
+func (f FDs) Swap(i, j int) {
+ f[i], f[j] = f[j], f[i]
+}
+
+func (f FDs) Less(i, j int) bool {
+ return f[i] < f[j]
+}
+
+// FDFlags define flags for an individual descriptor.
+//
+// +stateify savable
+type FDFlags struct {
+ // CloseOnExec indicates the descriptor should be closed on exec.
+ CloseOnExec bool
+}
+
+// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
+// representation.
+func (f FDFlags) ToLinuxFileFlags() (mask uint) {
+ if f.CloseOnExec {
+ mask |= linux.O_CLOEXEC
+ }
+ return
+}
+
+// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
+// representation.
+func (f FDFlags) ToLinuxFDFlags() (mask uint) {
+ if f.CloseOnExec {
+ mask |= linux.FD_CLOEXEC
+ }
+ return
+}
+
+// descriptor holds the details about a file descriptor, namely a pointer to the
+// file itself and the descriptor flags.
+//
+// +stateify savable
+type descriptor struct {
+ file *fs.File
+ flags FDFlags
+}
+
+// FDMap is used to manage File references and flags.
+//
+// +stateify savable
+type FDMap struct {
+ refs.AtomicRefCount
+ k *Kernel
+ files map[kdefs.FD]descriptor
+ mu sync.RWMutex `state:"nosave"`
+ uid uint64
+}
+
+// ID returns a unique identifier for this FDMap.
+func (f *FDMap) ID() uint64 {
+ return f.uid
+}
+
+// NewFDMap allocates a new FDMap that may be used by tasks in k.
+func (k *Kernel) NewFDMap() *FDMap {
+ return &FDMap{
+ k: k,
+ files: make(map[kdefs.FD]descriptor),
+ uid: atomic.AddUint64(&k.fdMapUids, 1),
+ }
+}
+
+// destroy removes all of the file descriptors from the map.
+func (f *FDMap) destroy() {
+ f.RemoveIf(func(*fs.File, FDFlags) bool {
+ return true
+ })
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FDMap) DecRef() {
+ f.DecRefWithDestructor(f.destroy)
+}
+
+// Size returns the number of file descriptor slots currently allocated.
+func (f *FDMap) Size() int {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ return len(f.files)
+}
+
+// String is a stringer for FDMap.
+func (f *FDMap) String() string {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ var b bytes.Buffer
+ for k, v := range f.files {
+ n, _ := v.file.Dirent.FullName(nil /* root */)
+ b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n))
+ }
+ return b.String()
+}
+
+// NewFDFrom allocates a new FD guaranteed to be the lowest number available
+// greater than or equal to from. This property is important as Unix programs
+// tend to count on this allocation order.
+func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return 0, syscall.EINVAL
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Find the lowest fd not already in use in f.files.
+ lim := limitSet.Get(limits.NumberOfFiles)
+ for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ {
+ if _, ok := f.files[i]; !ok {
+ file.IncRef()
+ f.files[i] = descriptor{file, flags}
+ return i, nil
+ }
+ }
+
+ return -1, syscall.EMFILE
+}
+
+// NewFDAt sets the file reference for the given FD. If there is an
+// active reference for that FD, the ref count for that existing reference
+// is decremented.
+func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ // In this one case we do not do a defer of the Unlock. The
+ // reason is that we must have done all the work needed for
+ // discarding any old open file before we return to the
+ // caller. In other words, the DecRef(), below, must have
+ // completed by the time we return to the caller to ensure
+ // side effects are, in fact, effected. A classic example is
+ // dup2(fd1, fd2); if fd2 was already open, it must be closed,
+ // and we don't want to resume the caller until it is; we have
+ // to block on the DecRef(). Hence we can not just do a 'go
+ // oldfile.DecRef()', since there would be no guarantee that
+ // it would be done before the caller resumed. Since we
+ // must wait for the DecRef() to finish, and that could take
+ // time, it's best to call f.mu.Unlock() first so we are
+ // not blocking other users of this FDMap on the DecRef() call.
+ f.mu.Lock()
+ oldDesc, oldExists := f.files[fd]
+ lim := limitSet.Get(limits.NumberOfFiles).Cur
+ // if we're closing one then the effective limit is one
+ // more than the actual limit.
+ if oldExists && lim != limits.Infinity {
+ lim++
+ }
+ if lim != limits.Infinity && fd >= kdefs.FD(lim) {
+ f.mu.Unlock()
+ return syscall.EMFILE
+ }
+
+ file.IncRef()
+ f.files[fd] = descriptor{file, flags}
+ f.mu.Unlock()
+
+ if oldExists {
+ oldDesc.file.DecRef()
+ }
+ return nil
+}
+
+// SetFlags sets the flags for the given file descriptor, if it is valid.
+func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ desc, ok := f.files[fd]
+ if !ok {
+ return
+ }
+
+ f.files[fd] = descriptor{desc.file, flags}
+}
+
+// GetDescriptor returns a reference to the file and the flags for the FD. It
+// bumps its reference count as well. It returns nil if there is no File
+// for the FD, i.e. if the FD is invalid. The caller must use DecRef
+// when they are done.
+func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ if desc, ok := f.files[fd]; ok {
+ desc.file.IncRef()
+ return desc.file, desc.flags
+ }
+ return nil, FDFlags{}
+}
+
+// GetFile returns a reference to the File for the FD and bumps
+// its reference count as well. It returns nil if there is no File
+// for the FD, i.e. if the FD is invalid. The caller must use DecRef
+// when they are done.
+func (f *FDMap) GetFile(fd kdefs.FD) *fs.File {
+ f.mu.RLock()
+ if desc, ok := f.files[fd]; ok {
+ desc.file.IncRef()
+ f.mu.RUnlock()
+ return desc.file
+ }
+ f.mu.RUnlock()
+ return nil
+}
+
+// fds returns an ordering of FDs.
+func (f *FDMap) fds() FDs {
+ fds := make(FDs, 0, len(f.files))
+ for fd := range f.files {
+ fds = append(fds, fd)
+ }
+ sort.Sort(fds)
+ return fds
+}
+
+// GetFDs returns a list of valid fds.
+func (f *FDMap) GetFDs() FDs {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+ return f.fds()
+}
+
+// GetRefs returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDMap) GetRefs() []*fs.File {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ fds := f.fds()
+ fs := make([]*fs.File, 0, len(fds))
+ for _, fd := range fds {
+ desc := f.files[fd]
+ desc.file.IncRef()
+ fs = append(fs, desc.file)
+ }
+ return fs
+}
+
+// Fork returns an independent FDMap pointing to the same descriptors.
+func (f *FDMap) Fork() *FDMap {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ clone := f.k.NewFDMap()
+
+ // Grab an extra reference for every file.
+ for fd, desc := range f.files {
+ desc.file.IncRef()
+ clone.files[fd] = desc
+ }
+
+ // That's it!
+ return clone
+}
+
+// unlock releases all file locks held by this FDMap's uid. Must only be
+// called on a non-nil *fs.File.
+func (f *FDMap) unlock(file *fs.File) {
+ id := lock.UniqueID(f.ID())
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF})
+}
+
+// inotifyFileClose generates the appropriate inotify events for f being closed.
+func inotifyFileClose(f *fs.File) {
+ var ev uint32
+ d := f.Dirent
+
+ if fs.IsDir(d.Inode.StableAttr) {
+ ev |= linux.IN_ISDIR
+ }
+
+ if f.Flags().Write {
+ ev |= linux.IN_CLOSE_WRITE
+ } else {
+ ev |= linux.IN_CLOSE_NOWRITE
+ }
+
+ d.InotifyEvent(ev, 0)
+}
+
+// Remove removes an FD from the FDMap, and returns (File, true) if a File
+// was found. Callers are expected to decrement the reference count on
+// the File. Otherwise returns (nil, false).
+func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) {
+ f.mu.Lock()
+ desc := f.files[fd]
+ delete(f.files, fd)
+ f.mu.Unlock()
+ if desc.file != nil {
+ f.unlock(desc.file)
+ inotifyFileClose(desc.file)
+ return desc.file, true
+ }
+ return nil, false
+}
+
+// RemoveIf removes all FDs where cond is true.
+func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+ var removed []*fs.File
+ f.mu.Lock()
+ for fd, desc := range f.files {
+ if desc.file != nil && cond(desc.file, desc.flags) {
+ delete(f.files, fd)
+ removed = append(removed, desc.file)
+ }
+ }
+ f.mu.Unlock()
+
+ for _, file := range removed {
+ f.unlock(file)
+ inotifyFileClose(file)
+ file.DecRef()
+ }
+}
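+
+// Example (sketch): installing and removing a descriptor, assuming k is the
+// owning *Kernel, file is a referenced *fs.File and limitSet is the task's
+// *limits.LimitSet.
+//
+//	fdm := k.NewFDMap()
+//	fd, err := fdm.NewFDFrom(0, file, FDFlags{CloseOnExec: true}, limitSet)
+//	if err != nil {
+//		return err // EMFILE once the NumberOfFiles limit is reached.
+//	}
+//	if f := fdm.GetFile(fd); f != nil {
+//		// ... use f ...
+//		f.DecRef() // GetFile took an extra reference.
+//	}
+//	if old, ok := fdm.Remove(fd); ok {
+//		old.DecRef() // The map's reference is transferred to the caller.
+//	}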
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
new file mode 100644
index 000000000..d8115f59a
--- /dev/null
+++ b/pkg/sentry/kernel/fs_context.go
@@ -0,0 +1,187 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// FSContext contains filesystem context.
+//
+// This includes umask and working directory.
+//
+// +stateify savable
+type FSContext struct {
+ refs.AtomicRefCount
+
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // root is the filesystem root. Will be nil iff the FSContext has been
+ // destroyed.
+ root *fs.Dirent
+
+ // cwd is the current working directory. Will be nil iff the FSContext
+ // has been destroyed.
+ cwd *fs.Dirent
+
+ // umask is the current file mode creation mask. When a thread using this
+ // context invokes a syscall that creates a file, bits set in umask are
+ // removed from the permissions that the file is created with.
+ umask uint
+}
+
+// newFSContext returns a new filesystem context.
+func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
+ root.IncRef()
+ cwd.IncRef()
+ return &FSContext{
+ root: root,
+ cwd: cwd,
+ umask: umask,
+ }
+}
+
+// destroy is the destructor for an FSContext.
+//
+// This will call DecRef on both root and cwd Dirents. If either call to
+// DecRef returns an error, then it will be propagated. If both calls to
+// DecRef return an error, then the one from root.DecRef will be propagated.
+//
+// Note that there may still be calls to WorkingDirectory() or RootDirectory()
+// (that return nil). This is because valid references may still be held via
+// proc files or other mechanisms.
+func (f *FSContext) destroy() {
+ // Hold f.mu so that we don't race with RootDirectory() and
+ // WorkingDirectory().
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ f.root.DecRef()
+ f.root = nil
+
+ f.cwd.DecRef()
+ f.cwd = nil
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FSContext) DecRef() {
+ f.DecRefWithDestructor(f.destroy)
+}
+
+// Fork forks this FSContext.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) Fork() *FSContext {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ f.cwd.IncRef()
+ f.root.IncRef()
+ return &FSContext{
+ cwd: f.cwd,
+ root: f.root,
+ umask: f.umask,
+ }
+}
+
+// WorkingDirectory returns the current working directory.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) WorkingDirectory() *fs.Dirent {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ if f.cwd != nil {
+ f.cwd.IncRef()
+ }
+ return f.cwd
+}
+
+// SetWorkingDirectory sets the current working directory.
+// This will take an extra reference on the Dirent.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
+ if d == nil {
+ panic("FSContext.SetWorkingDirectory called with nil dirent")
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ if f.cwd == nil {
+ panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d))
+ }
+
+ old := f.cwd
+ f.cwd = d
+ d.IncRef()
+ old.DecRef()
+}
+
+// RootDirectory returns the current filesystem root.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) RootDirectory() *fs.Dirent {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ if f.root != nil {
+ f.root.IncRef()
+ }
+ return f.root
+}
+
+// SetRootDirectory sets the root directory.
+// This will take an extra reference on the Dirent.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
+ if d == nil {
+ panic("FSContext.SetRootDirectory called with nil dirent")
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ if f.root == nil {
+ panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d))
+ }
+
+ old := f.root
+ f.root = d
+ d.IncRef()
+ old.DecRef()
+}
+
+// Umask returns the current umask.
+func (f *FSContext) Umask() uint {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ return f.umask
+}
+
+// SwapUmask atomically sets the current umask and returns the old umask.
+func (f *FSContext) SwapUmask(mask uint) uint {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ old := f.umask
+ f.umask = mask
+ return old
+}
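+
+// Example (sketch): how creation paths typically consume the umask. The
+// t.FSContext() accessor is assumed; only the FSContext methods defined above
+// are otherwise used.
+//
+//	fsc := t.FSContext()
+//	requested := uint(0666)
+//	effective := requested &^ fsc.Umask() // 0666 &^ 0022 == 0644.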
diff --git a/pkg/sentry/kernel/futex/atomicptr_bucket.go b/pkg/sentry/kernel/futex/atomicptr_bucket.go
new file mode 100755
index 000000000..2251a6e72
--- /dev/null
+++ b/pkg/sentry/kernel/futex/atomicptr_bucket.go
@@ -0,0 +1,27 @@
+package futex
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+// An AtomicPtr is a pointer to a value of type Value that can be atomically
+// loaded and stored. The zero value of an AtomicPtr represents nil.
+//
+// Note that copying AtomicPtr by value performs a non-atomic read of the
+// stored pointer, which is unsafe if Store() can be called concurrently; in
+// this case, do `dst.Store(src.Load())` instead.
+type AtomicPtrBucket struct {
+ ptr unsafe.Pointer
+}
+
+// Load returns the value set by the most recent Store. It returns nil if there
+// has been no previous call to Store.
+func (p *AtomicPtrBucket) Load() *bucket {
+ return (*bucket)(atomic.LoadPointer(&p.ptr))
+}
+
+// Store sets the value returned by Load to x.
+func (p *AtomicPtrBucket) Store(x *bucket) {
+ atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
+}
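+
+// Example (sketch): transferring the stored pointer between two
+// AtomicPtrBuckets without the non-atomic copy warned about above.
+//
+//	var src, dst AtomicPtrBucket
+//	src.Store(&bucket{})
+//	dst.Store(src.Load()) // Safe even if src.Store runs concurrently.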
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
new file mode 100644
index 000000000..bb38eb81e
--- /dev/null
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -0,0 +1,783 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package futex provides an implementation of the futex interface as found in
+// the Linux kernel. It allows one to easily transform Wait() calls into waits
+// on a channel, which is useful in a Go-based kernel, for example.
+package futex
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// KeyKind indicates the type of a Key.
+type KeyKind int
+
+const (
+ // KindPrivate indicates a private futex (a futex syscall with the
+ // FUTEX_PRIVATE_FLAG set).
+ KindPrivate KeyKind = iota
+
+ // KindSharedPrivate indicates a shared futex on a private memory mapping.
+ // Although KindPrivate and KindSharedPrivate futexes both use memory
+ // addresses to identify futexes, they do not interoperate (in Linux, the
+ // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
+ // comparison).
+ KindSharedPrivate
+
+ // KindSharedMappable indicates a shared futex on a memory mapping other
+ // than a private anonymous memory mapping.
+ KindSharedMappable
+)
+
+// Key represents something that a futex waiter may wait on.
+type Key struct {
+ // Kind is the type of the Key.
+ Kind KeyKind
+
+ // Mappable is the memory-mapped object that is represented by the Key.
+ // Mappable is always nil if Kind is not KindSharedMappable, and may be nil
+ // even if it is.
+ Mappable memmap.Mappable
+
+ // MappingIdentity is the MappingIdentity associated with Mappable.
+ // MappingIdentity is always nil if Mappable is nil, and may be nil even if
+ // it isn't.
+ MappingIdentity memmap.MappingIdentity
+
+ // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
+ // memory address. Otherwise, Offset is the represented offset into
+ // Mappable.
+ Offset uint64
+}
+
+func (k *Key) release() {
+ if k.MappingIdentity != nil {
+ k.MappingIdentity.DecRef()
+ }
+ k.Mappable = nil
+ k.MappingIdentity = nil
+}
+
+func (k *Key) clone() Key {
+ if k.MappingIdentity != nil {
+ k.MappingIdentity.IncRef()
+ }
+ return *k
+}
+
+// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
+func (k *Key) addr() usermem.Addr {
+ return usermem.Addr(k.Offset)
+}
+
+// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
+func (k *Key) matches(k2 *Key) bool {
+ // k.MappingIdentity is ignored; it's only used for reference counting.
+ return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
+}
+
+// Target abstracts memory accesses and keys.
+type Target interface {
+ // SwapUint32 gives access to usermem.IO.SwapUint32.
+ SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
+
+ // CompareAndSwapUint32 gives access to usermem.IO.CompareAndSwapUint32.
+ CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
+
+ // LoadUint32 gives access to usermem.IO.LoadUint32.
+ LoadUint32(addr usermem.Addr) (uint32, error)
+
+ // GetSharedKey returns a Key with kind KindSharedPrivate or
+ // KindSharedMappable corresponding to the memory mapped at address addr.
+ //
+ // If GetSharedKey returns a Key with a non-nil MappingIdentity, a
+ // reference is held on the MappingIdentity, which must be dropped by the
+ // caller when the Key is no longer in use.
+ GetSharedKey(addr usermem.Addr) (Key, error)
+}
+
+// check performs a basic equality check on the given address.
+func check(t Target, addr usermem.Addr, val uint32) error {
+ cur, err := t.LoadUint32(addr)
+ if err != nil {
+ return err
+ }
+ if cur != val {
+ return syserror.EAGAIN
+ }
+ return nil
+}
+
+// atomicOp performs a complex operation on the given address.
+func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
+ opType := (opIn >> 28) & 0xf
+ cmp := (opIn >> 24) & 0xf
+ opArg := (opIn >> 12) & 0xfff
+ cmpArg := opIn & 0xfff
+
+ if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
+ opArg = 1 << opArg
+ opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
+ }
+
+ var (
+ oldVal uint32
+ err error
+ )
+ if opType == linux.FUTEX_OP_SET {
+ oldVal, err = t.SwapUint32(addr, opArg)
+ if err != nil {
+ return false, err
+ }
+ } else {
+ for {
+ oldVal, err = t.LoadUint32(addr)
+ if err != nil {
+ return false, err
+ }
+ var newVal uint32
+ switch opType {
+ case linux.FUTEX_OP_ADD:
+ newVal = oldVal + opArg
+ case linux.FUTEX_OP_OR:
+ newVal = oldVal | opArg
+ case linux.FUTEX_OP_ANDN:
+ newVal = oldVal &^ opArg
+ case linux.FUTEX_OP_XOR:
+ newVal = oldVal ^ opArg
+ default:
+ return false, syserror.ENOSYS
+ }
+ prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
+ if err != nil {
+ return false, err
+ }
+ if prev == oldVal {
+ break // Success.
+ }
+ }
+ }
+
+ switch cmp {
+ case linux.FUTEX_OP_CMP_EQ:
+ return oldVal == cmpArg, nil
+ case linux.FUTEX_OP_CMP_NE:
+ return oldVal != cmpArg, nil
+ case linux.FUTEX_OP_CMP_LT:
+ return oldVal < cmpArg, nil
+ case linux.FUTEX_OP_CMP_LE:
+ return oldVal <= cmpArg, nil
+ case linux.FUTEX_OP_CMP_GT:
+ return oldVal > cmpArg, nil
+ case linux.FUTEX_OP_CMP_GE:
+ return oldVal >= cmpArg, nil
+ default:
+ return false, syserror.ENOSYS
+ }
+}
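+
+// Worked example (sketch) of the FUTEX_WAKE_OP encoding decoded above: for
+// opIn == 0x14001000,
+//
+//	opType = (opIn >> 28) & 0xf   = 1 (FUTEX_OP_ADD)
+//	cmp    = (opIn >> 24) & 0xf   = 4 (FUTEX_OP_CMP_GT)
+//	opArg  = (opIn >> 12) & 0xfff = 1
+//	cmpArg = opIn & 0xfff         = 0
+//
+// so atomicOp atomically adds 1 to *addr and reports whether the old value
+// was greater than 0 (the "post and tell me whether anyone was waiting"
+// pattern used by glibc-style semaphores).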
+
+// Waiter is the struct which gets enqueued into buckets for wake up routines
+// and requeue routines to scan and notify. Once a Waiter has been enqueued by
+// WaitPrepare(), callers may listen on C for wake up events.
+type Waiter struct {
+ // Synchronization:
+ //
+ // - A Waiter that is not enqueued in a bucket is exclusively owned (no
+ // synchronization applies).
+ //
+ // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
+ // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
+ // lock") of the containing bucket, and bitmask is immutable. Note that
+ // since bucket is mutated using atomic memory operations, bucket.Load()
+ // may be called without holding the bucket lock, although it may change
+ // racily. See WaitComplete().
+ //
+ // - A Waiter is only guaranteed to be no longer queued after calling
+ // WaitComplete().
+
+ // waiterEntry links Waiter into bucket.waiters.
+ waiterEntry
+
+ // bucket is the bucket this waiter is queued in. If bucket is nil, the
+ // waiter is not waiting and is not in any bucket.
+ bucket AtomicPtrBucket
+
+ // C is sent to when the Waiter is woken.
+ C chan struct{}
+
+ // key is what this waiter is waiting on.
+ key Key
+
+ // The bitmask we're waiting on.
+ // This is used in the case of a FUTEX_WAKE_BITSET.
+ bitmask uint32
+
+ // tid is the thread ID for the waiter in case this is a PI mutex.
+ tid uint32
+}
+
+// NewWaiter returns a new unqueued Waiter.
+func NewWaiter() *Waiter {
+ return &Waiter{
+ C: make(chan struct{}, 1),
+ }
+}
+
+// woken returns true if w has been woken since the last call to WaitPrepare.
+func (w *Waiter) woken() bool {
+ return len(w.C) != 0
+}
+
+// bucket holds a list of waiters for a given address hash.
+//
+// +stateify savable
+type bucket struct {
+ // mu protects waiters and contained Waiter state. See comment in Waiter.
+ mu sync.Mutex `state:"nosave"`
+
+ waiters waiterList `state:"zerovalue"`
+}
+
+// wakeLocked wakes up to n waiters in this bucket that match the given key
+// and bitmask, and returns the number of waiters woken.
+//
+// Preconditions: b.mu must be locked.
+func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
+ done := 0
+ for w := b.waiters.Front(); done < n && w != nil; {
+ if !w.key.matches(key) || w.bitmask&bitmask == 0 {
+ // Not matching.
+ w = w.Next()
+ continue
+ }
+
+ // Remove from the bucket and wake the waiter.
+ woke := w
+ w = w.Next() // Next iteration.
+ b.wakeWaiterLocked(woke)
+ done++
+ }
+ return done
+}
+
+func (b *bucket) wakeWaiterLocked(w *Waiter) {
+ // Remove from the bucket and wake the waiter.
+ b.waiters.Remove(w)
+ w.C <- struct{}{}
+
+ // NOTE: The above channel write establishes a write barrier according
+ // to the memory model, so nothing may be ordered around it. Since
+ // we've dequeued w and will never touch it again, we can safely
+ // store nil to w.bucket here and allow the WaitComplete() to
+ // short-circuit grabbing the bucket lock. If they somehow miss the
+ // store, we are still holding the lock, so we know that they won't
+ // dequeue w and assume it's free before the store below has taken
+ // effect.
+ w.bucket.Store(nil)
+}
+
+// requeueLocked takes up to n waiters matching key from this bucket, rekeys
+// them to nkey, and moves them to the bucket "to".
+//
+// Preconditions: b and to must be locked.
+func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
+ done := 0
+ for w := b.waiters.Front(); done < n && w != nil; {
+ if !w.key.matches(key) {
+ // Not matching.
+ w = w.Next()
+ continue
+ }
+
+ requeued := w
+ w = w.Next() // Next iteration.
+ b.waiters.Remove(requeued)
+ requeued.key.release()
+ requeued.key = nkey.clone()
+ to.waiters.PushBack(requeued)
+ requeued.bucket.Store(to)
+ done++
+ }
+ return done
+}
+
+const (
+ // bucketCount is the number of buckets per Manager. By having many of
+ // these we reduce contention when concurrent yet unrelated calls are made.
+ bucketCount = 1 << bucketCountBits
+ bucketCountBits = 10
+)
+
+// getKey returns a Key representing address addr in t.
+func getKey(t Target, addr usermem.Addr, private bool) (Key, error) {
+ // Ensure the address is aligned.
+ // It must be a DWORD boundary.
+ if addr&0x3 != 0 {
+ return Key{}, syserror.EINVAL
+ }
+ if private {
+ return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
+ }
+ return t.GetSharedKey(addr)
+}
+
+// bucketIndexForAddr returns the index into Manager.buckets for addr.
+func bucketIndexForAddr(addr usermem.Addr) uintptr {
+ // - The bottom 2 bits of addr must be 0, per getKey.
+ //
+ // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
+ // for a canonical address, and (on all existing platforms) bit 47 must be
+ // 0 for an application address.
+ //
+ // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
+ // bits. We choose one of the simplest possible hash functions that at
+ // least uses all 45 useful bits in the output, given that bucketCountBits
+ // == 10. This hash function also has the property that it will usually map
+ // adjacent addresses to adjacent buckets, slightly improving memory
+ // locality when an application synchronization structure uses multiple
+ // nearby futexes.
+ //
+ // Note that despite the large number of arithmetic operations in the
+ // function, many components can be computed in parallel, such that the
+ // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
+ // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
+ // (addr >> 42)" without any additional grouping, the compiler puts all 4
+ // additions in the critical path.
+ h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
+ h2 := uintptr(addr>>32) + uintptr(addr>>42)
+ return (h1 + h2) % bucketCount
+}
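+
+// Worked example (sketch): for addr == 0x1000,
+//
+//	h1 = 0x400 + 0x1 + 0 = 1025, h2 = 0, index = 1025 % 1024 = 1
+//
+// while the adjacent futex word at 0x1004 hashes to index 2, illustrating the
+// locality property described above.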
+
+// Manager holds futex state for a single virtual address space.
+//
+// +stateify savable
+type Manager struct {
+ // privateBuckets holds buckets for KindPrivate and KindSharedPrivate
+ // futexes.
+ privateBuckets [bucketCount]bucket `state:"zerovalue"`
+
+ // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
+ // may be shared by multiple Managers. The sharedBucket pointer is
+ // immutable.
+ sharedBucket *bucket
+}
+
+// NewManager returns an initialized futex manager.
+func NewManager() *Manager {
+ return &Manager{
+ sharedBucket: &bucket{},
+ }
+}
+
+// Fork returns a new Manager. Shared futex clients using the returned Manager
+// may interoperate with those using m.
+func (m *Manager) Fork() *Manager {
+ return &Manager{
+ sharedBucket: m.sharedBucket,
+ }
+}
+
+// lockBucket returns a locked bucket for the given key.
+func (m *Manager) lockBucket(k *Key) *bucket {
+ var b *bucket
+ if k.Kind == KindSharedMappable {
+ b = m.sharedBucket
+ } else {
+ b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
+ }
+ b.mu.Lock()
+ return b
+}
+
+// lockBuckets returns locked buckets for the given keys.
+func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
+ // Buckets must be consistently ordered to avoid circular lock
+ // dependencies. We order buckets in m.privateBuckets by index (lowest
+ // index first), and all buckets in m.privateBuckets precede
+ // m.sharedBucket.
+
+ // Handle the common case first:
+ if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
+ i1 := bucketIndexForAddr(k1.addr())
+ i2 := bucketIndexForAddr(k2.addr())
+ b1 := &m.privateBuckets[i1]
+ b2 := &m.privateBuckets[i2]
+ switch {
+ case i1 < i2:
+ b1.mu.Lock()
+ b2.mu.Lock()
+ case i2 < i1:
+ b2.mu.Lock()
+ b1.mu.Lock()
+ default:
+ b1.mu.Lock()
+ }
+ return b1, b2
+ }
+
+ // At least one of b1 or b2 should be m.sharedBucket.
+ b1 := m.sharedBucket
+ b2 := m.sharedBucket
+ if k1.Kind != KindSharedMappable {
+ b1 = m.lockBucket(k1)
+ } else if k2.Kind != KindSharedMappable {
+ b2 = m.lockBucket(k2)
+ }
+ m.sharedBucket.mu.Lock()
+ return b1, b2
+}
+
+// Wake wakes up to n waiters matching the bitmask on the given addr.
+// The number of waiters woken is returned.
+func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) {
+ // This function is very hot; avoid defer.
+ k, err := getKey(t, addr, private)
+ if err != nil {
+ return 0, err
+ }
+
+ b := m.lockBucket(&k)
+ r := b.wakeLocked(&k, bitmask, n)
+
+ b.mu.Unlock()
+ k.release()
+ return r, nil
+}
+
+func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
+ k1, err := getKey(t, addr, private)
+ if err != nil {
+ return 0, err
+ }
+ defer k1.release()
+ k2, err := getKey(t, naddr, private)
+ if err != nil {
+ return 0, err
+ }
+ defer k2.release()
+
+ b1, b2 := m.lockBuckets(&k1, &k2)
+ defer b1.mu.Unlock()
+ if b2 != b1 {
+ defer b2.mu.Unlock()
+ }
+
+ if checkval {
+ if err := check(t, addr, val); err != nil {
+ return 0, err
+ }
+ }
+
+ // Wake the number required.
+ done := b1.wakeLocked(&k1, ^uint32(0), nwake)
+
+ // Requeue the number required.
+ b1.requeueLocked(b2, &k1, &k2, nreq)
+
+ return done, nil
+}
+
+// Requeue wakes up to nwake waiters on the given addr, and unconditionally
+// requeues up to nreq waiters on naddr.
+func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) {
+ return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
+}
+
+// RequeueCmp atomically checks that the addr contains val (via the Target),
+// wakes up to nwake waiters on addr and then unconditionally requeues nreq
+// waiters on naddr.
+func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
+ return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
+}
+
+// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
+// waiters unconditionally from addr1, and, based on the original value at addr2
+// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
+// It returns the total number of waiters woken.
+func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
+ k1, err := getKey(t, addr1, private)
+ if err != nil {
+ return 0, err
+ }
+ defer k1.release()
+ k2, err := getKey(t, addr2, private)
+ if err != nil {
+ return 0, err
+ }
+ defer k2.release()
+
+ b1, b2 := m.lockBuckets(&k1, &k2)
+ defer b1.mu.Unlock()
+ if b2 != b1 {
+ defer b2.mu.Unlock()
+ }
+
+ done := 0
+ cond, err := atomicOp(t, addr2, op)
+ if err != nil {
+ return 0, err
+ }
+
+ // Wake up up to nwake1 entries from the first bucket.
+ done = b1.wakeLocked(&k1, ^uint32(0), nwake1)
+
+ // Wake up up to nwake2 entries from the second bucket if the
+ // operation yielded true.
+ if cond {
+ done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
+ }
+
+ return done, nil
+}
+
+// WaitPrepare atomically checks that addr contains val (via the Target), then
+// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
+// Waiter must be subsequently removed by calling WaitComplete, whether or not
+// a wakeup is received on w.C.
+func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error {
+ k, err := getKey(t, addr, private)
+ if err != nil {
+ return err
+ }
+ // Ownership of k is transferred to w below.
+
+ // Prepare the Waiter before taking the bucket lock.
+ select {
+ case <-w.C:
+ default:
+ }
+ w.key = k
+ w.bitmask = bitmask
+
+ b := m.lockBucket(&k)
+ // This function is very hot; avoid defer.
+
+ // Perform our atomic check.
+ if err := check(t, addr, val); err != nil {
+ b.mu.Unlock()
+ w.key.release()
+ return err
+ }
+
+ // Add the waiter to the bucket.
+ b.waiters.PushBack(w)
+ w.bucket.Store(b)
+
+ b.mu.Unlock()
+ return nil
+}
+
+// WaitComplete must be called when a Waiter previously added by WaitPrepare is
+// no longer eligible to be woken.
+func (m *Manager) WaitComplete(w *Waiter) {
+ // Remove w from the bucket it's in.
+ for {
+ b := w.bucket.Load()
+
+ // If b is nil, the waiter isn't in any bucket anymore. This can't be
+ // racy because the waiter can't be concurrently re-queued in another
+ // bucket.
+ if b == nil {
+ break
+ }
+
+ // Take the bucket lock. Note that without holding the bucket lock, the
+ // waiter is not guaranteed to stay in that bucket, so after we take
+ // the bucket lock, we must ensure that the bucket hasn't changed: if
+ // it happens to have changed, we release the old bucket lock and try
+ // again with the new bucket; if it hasn't changed, we know it won't
+ // change now because we hold the lock.
+ b.mu.Lock()
+ if b != w.bucket.Load() {
+ b.mu.Unlock()
+ continue
+ }
+
+ // Remove waiter from bucket.
+ b.waiters.Remove(w)
+ w.bucket.Store(nil)
+ b.mu.Unlock()
+ break
+ }
+
+ // Release references held by the waiter.
+ w.key.release()
+}
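+
+// Example (sketch): the canonical wait pattern built from the primitives
+// above. Here t is any futex.Target, and cancel stands in for whatever
+// timeout or interrupt channel the caller also selects on.
+//
+//	w := NewWaiter()
+//	if err := m.WaitPrepare(w, t, addr, private, val, ^uint32(0)); err != nil {
+//		return err // e.g. EAGAIN if *addr no longer contains val.
+//	}
+//	select {
+//	case <-w.C:
+//		// Woken by Wake, Requeue or WakeOp.
+//	case <-cancel:
+//		// Timed out or interrupted.
+//	}
+//	m.WaitComplete(w) // Required after a successful WaitPrepare, woken or not.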
+
+// LockPI attempts to lock the futex following the Priority-inheritance futex
+// rules. The lock is acquired only when 'addr' points to 0. The TID of the
+// calling task is written to 'addr' to indicate the futex is owned. It returns true
+// if the futex was successfully acquired.
+//
+// FUTEX_OWNER_DIED is only set by Linux when robust lists are in use (see
+// exit_robust_list()). Since we don't support robust lists, it is never set
+// here, although it is still handled below.
+func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
+ k, err := getKey(t, addr, private)
+ if err != nil {
+ return false, err
+ }
+ // Ownership of k is transferred to w below.
+
+ // Prepare the Waiter before taking the bucket lock.
+ select {
+ case <-w.C:
+ default:
+ }
+ w.key = k
+ w.tid = tid
+
+ b := m.lockBucket(&k)
+ // Hot function: avoid defers.
+
+ success, err := m.lockPILocked(w, t, addr, tid, b, try)
+ if err != nil {
+ w.key.release()
+ b.mu.Unlock()
+ return false, err
+ }
+ if success || try {
+ // Release waiter if it's not going to be a wait.
+ w.key.release()
+ }
+ b.mu.Unlock()
+ return success, nil
+}
+
+func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
+ for {
+ cur, err := t.LoadUint32(addr)
+ if err != nil {
+ return false, err
+ }
+ if (cur & linux.FUTEX_TID_MASK) == tid {
+ return false, syserror.EDEADLK
+ }
+
+ if (cur & linux.FUTEX_TID_MASK) == 0 {
+ // No owner and no waiters, try to acquire the futex.
+
+ // Set TID and preserve owner died status.
+ val := tid
+ val |= cur & linux.FUTEX_OWNER_DIED
+ prev, err := t.CompareAndSwapUint32(addr, cur, val)
+ if err != nil {
+ return false, err
+ }
+ if prev != cur {
+ // CAS failed, retry...
+ // Linux reacquires the bucket lock on retries, which will re-lookup the
+ // mapping at the futex address. However, retrying while holding the
+ // lock is more efficient and reduces the chance of another conflict.
+ continue
+ }
+ // Futex acquired.
+ return true, nil
+ }
+
+ // Futex is already owned, prepare to wait.
+
+ if try {
+ // Caller doesn't want to wait.
+ return false, nil
+ }
+
+ // Set waiters bit if not set yet.
+ if cur&linux.FUTEX_WAITERS == 0 {
+ prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
+ if err != nil {
+ return false, err
+ }
+ if prev != cur {
+ // CAS failed, retry...
+ continue
+ }
+ }
+
+ // Add the waiter to the bucket.
+ b.waiters.PushBack(w)
+ w.bucket.Store(b)
+ return false, nil
+ }
+}
+
+// UnlockPI unlocks the futex following the priority-inheritance futex rules.
+// The address provided must contain the caller's TID. If there are waiters,
+// the TID of the next waiter (FIFO order) is written to the address and that
+// waiter is woken up. If there are no waiters, 0 is written to the address.
+func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
+ k, err := getKey(t, addr, private)
+ if err != nil {
+ return err
+ }
+ b := m.lockBucket(&k)
+
+ err = m.unlockPILocked(t, addr, tid, b)
+
+ k.release()
+ b.mu.Unlock()
+ return err
+}
+
+func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
+ cur, err := t.LoadUint32(addr)
+ if err != nil {
+ return err
+ }
+
+ if (cur & linux.FUTEX_TID_MASK) != tid {
+ return syserror.EPERM
+ }
+
+ if b.waiters.Empty() {
+ // It's safe to set 0 because there are no waiters, no new owner, and the
+ // executing task is the current owner (no owner died bit).
+ prev, err := t.CompareAndSwapUint32(addr, cur, 0)
+ if err != nil {
+ return err
+ }
+ if prev != cur {
+ // Let user mode handle CAS races. This is different from lock, which
+ // retries when the CAS fails.
+ return syserror.EAGAIN
+ }
+ return nil
+ }
+
+ next := b.waiters.Front()
+
+ // Set the next owner's TID and the waiters bit if more waiters remain.
+ // The owner-died bit, if set, is cleared because the woken waiter takes
+ // over as the owner.
+ val := next.tid
+ if next.Next() != nil {
+ val |= linux.FUTEX_WAITERS
+ }
+
+ prev, err := t.CompareAndSwapUint32(addr, cur, val)
+ if err != nil {
+ return err
+ }
+ if prev != cur {
+ return syserror.EINVAL
+ }
+
+ b.wakeWaiterLocked(next)
+ return nil
+}
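
The PI paths above all operate on the same 32-bit futex word layout: the owner's TID in the low bits (FUTEX_TID_MASK), plus the FUTEX_WAITERS and FUTEX_OWNER_DIED flag bits. A small illustrative decoding helper (not part of the diff), written as if it lived in this package, makes the convention explicit.

// decodePI unpacks the fields that LockPI/UnlockPI manipulate in the futex
// word. Illustrative only.
func decodePI(word uint32) (ownerTID uint32, hasWaiters, ownerDied bool) {
	ownerTID = word & linux.FUTEX_TID_MASK // 0 means the futex is unowned
	hasWaiters = word&linux.FUTEX_WAITERS != 0
	ownerDied = word&linux.FUTEX_OWNER_DIED != 0
	return ownerTID, hasWaiters, ownerDied
}
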
diff --git a/pkg/sentry/kernel/futex/futex_state_autogen.go b/pkg/sentry/kernel/futex/futex_state_autogen.go
new file mode 100755
index 000000000..b58e22b78
--- /dev/null
+++ b/pkg/sentry/kernel/futex/futex_state_autogen.go
@@ -0,0 +1,62 @@
+// automatically generated by stateify.
+
+package futex
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *bucket) beforeSave() {}
+func (x *bucket) save(m state.Map) {
+ x.beforeSave()
+ if !state.IsZeroValue(x.waiters) { m.Failf("waiters is %v, expected zero", x.waiters) }
+}
+
+func (x *bucket) afterLoad() {}
+func (x *bucket) load(m state.Map) {
+}
+
+func (x *Manager) beforeSave() {}
+func (x *Manager) save(m state.Map) {
+ x.beforeSave()
+ if !state.IsZeroValue(x.privateBuckets) { m.Failf("privateBuckets is %v, expected zero", x.privateBuckets) }
+ m.Save("sharedBucket", &x.sharedBucket)
+}
+
+func (x *Manager) afterLoad() {}
+func (x *Manager) load(m state.Map) {
+ m.Load("sharedBucket", &x.sharedBucket)
+}
+
+func (x *waiterList) beforeSave() {}
+func (x *waiterList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *waiterList) afterLoad() {}
+func (x *waiterList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *waiterEntry) beforeSave() {}
+func (x *waiterEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *waiterEntry) afterLoad() {}
+func (x *waiterEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func init() {
+ state.Register("futex.bucket", (*bucket)(nil), state.Fns{Save: (*bucket).save, Load: (*bucket).load})
+ state.Register("futex.Manager", (*Manager)(nil), state.Fns{Save: (*Manager).save, Load: (*Manager).load})
+ state.Register("futex.waiterList", (*waiterList)(nil), state.Fns{Save: (*waiterList).save, Load: (*waiterList).load})
+ state.Register("futex.waiterEntry", (*waiterEntry)(nil), state.Fns{Save: (*waiterEntry).save, Load: (*waiterEntry).load})
+}
diff --git a/pkg/sentry/kernel/futex/waiter_list.go b/pkg/sentry/kernel/futex/waiter_list.go
new file mode 100755
index 000000000..cca5c4721
--- /dev/null
+++ b/pkg/sentry/kernel/futex/waiter_list.go
@@ -0,0 +1,173 @@
+package futex
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type waiterElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (waiterElementMapper) linkerFor(elem *Waiter) *Waiter { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type waiterList struct {
+ head *Waiter
+ tail *Waiter
+}
+
+// Reset resets list l to the empty state.
+func (l *waiterList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *waiterList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *waiterList) Front() *Waiter {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *waiterList) Back() *Waiter {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *waiterList) PushFront(e *Waiter) {
+ waiterElementMapper{}.linkerFor(e).SetNext(l.head)
+ waiterElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ waiterElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *waiterList) PushBack(e *Waiter) {
+ waiterElementMapper{}.linkerFor(e).SetNext(nil)
+ waiterElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ waiterElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *waiterList) PushBackList(m *waiterList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *waiterList) InsertAfter(b, e *Waiter) {
+ a := waiterElementMapper{}.linkerFor(b).Next()
+ waiterElementMapper{}.linkerFor(e).SetNext(a)
+ waiterElementMapper{}.linkerFor(e).SetPrev(b)
+ waiterElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ waiterElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *waiterList) InsertBefore(a, e *Waiter) {
+ b := waiterElementMapper{}.linkerFor(a).Prev()
+ waiterElementMapper{}.linkerFor(e).SetNext(a)
+ waiterElementMapper{}.linkerFor(e).SetPrev(b)
+ waiterElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ waiterElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *waiterList) Remove(e *Waiter) {
+ prev := waiterElementMapper{}.linkerFor(e).Prev()
+ next := waiterElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ waiterElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ waiterElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type waiterEntry struct {
+ next *Waiter
+ prev *Waiter
+}
+
+// Next returns the entry that follows e in the list.
+func (e *waiterEntry) Next() *Waiter {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *waiterEntry) Prev() *Waiter {
+ return e.prev
+}
+
+// SetNext assigns 'elem' as the entry that follows e in the list.
+func (e *waiterEntry) SetNext(elem *Waiter) {
+ e.next = elem
+}
+
+// SetPrev assigns 'elem' as the entry that precedes e in the list.
+func (e *waiterEntry) SetPrev(elem *Waiter) {
+ e.prev = elem
+}
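
Putting the generated list methods together, typical usage follows the iteration pattern from the doc comment above. The sketch below is illustrative only (not part of the diff) and assumes, as the futex code above does, that Waiter embeds waiterEntry and therefore satisfies the Linker methods.

// listSketch demonstrates the intrusive list operations. Illustrative only.
func listSketch(a, b *Waiter) {
	var l waiterList // the zero value is an empty, ready-to-use list
	l.PushBack(a)
	l.PushBack(b)
	// Iterate front to back, as in the doc comment for waiterList.
	for e := l.Front(); e != nil; e = e.Next() {
		_ = e // do something with e
	}
	l.Remove(a)
	l.Reset() // drop all remaining entries in O(1)
}
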
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
new file mode 100644
index 000000000..ebe12812c
--- /dev/null
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -0,0 +1,58 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm"
+)
+
+// IPCNamespace represents an IPC namespace.
+//
+// +stateify savable
+type IPCNamespace struct {
+ // User namespace which owns this IPC namespace. Immutable.
+ userNS *auth.UserNamespace
+
+ semaphores *semaphore.Registry
+ shms *shm.Registry
+}
+
+// NewIPCNamespace creates a new IPC namespace.
+func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
+ return &IPCNamespace{
+ userNS: userNS,
+ semaphores: semaphore.NewRegistry(userNS),
+ shms: shm.NewRegistry(userNS),
+ }
+}
+
+// SemaphoreRegistry returns the semaphore set registry for this namespace.
+func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry {
+ return i.semaphores
+}
+
+// ShmRegistry returns the shm segment registry for this namespace.
+func (i *IPCNamespace) ShmRegistry() *shm.Registry {
+ return i.shms
+}
+
+// IPCNamespace returns the task's IPC namespace.
+func (t *Task) IPCNamespace() *IPCNamespace {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.ipcns
+}
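
The accessors above are how other kernel code reaches the per-namespace registries. A short illustrative sketch (not part of the diff), with t a hypothetical task:

// ipcSketch shows how a task's IPC registries are reached. Illustrative only.
func ipcSketch(t *Task) {
	ns := t.IPCNamespace()         // protected by t.mu internally
	sems := ns.SemaphoreRegistry() // semaphore sets for this namespace
	shms := ns.ShmRegistry()       // shared memory segments for this namespace
	_, _ = sems, shms
}
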
diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go
new file mode 100644
index 000000000..304da2032
--- /dev/null
+++ b/pkg/sentry/kernel/kdefs/kdefs.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kdefs defines common kernel definitions.
+//
+package kdefs
+
+// FD is a File Descriptor.
+type FD int32
diff --git a/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go b/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go
new file mode 100755
index 000000000..cef77125b
--- /dev/null
+++ b/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package kdefs
+
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
new file mode 100644
index 000000000..85d73ace2
--- /dev/null
+++ b/pkg/sentry/kernel/kernel.go
@@ -0,0 +1,1241 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernel provides an emulation of the Linux kernel.
+//
+// See README.md for a detailed overview.
+//
+// Lock order (outermost locks must be taken first):
+//
+// Kernel.extMu
+// ThreadGroup.timerMu
+// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer)
+// TaskSet.mu
+// SignalHandlers.mu
+// Task.mu
+//
+// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
+// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
+// time requires locking all of their signal mutexes first.
+package kernel
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "path/filepath"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
+ uspb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/state"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+)
+
+// Kernel represents an emulated Linux kernel. It must be initialized by calling
+// Init() or LoadFrom().
+//
+// +stateify savable
+type Kernel struct {
+ // extMu serializes external changes to the Kernel with calls to
+ // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
+ // remains frozen for the duration of the call; it requires that the Kernel
+ // is paused as a precondition, which ensures that none of the tasks
+ // running within the Kernel can affect its state, but extMu is required to
+ // ensure that concurrent users of the Kernel *outside* the Kernel's
+ // control cannot affect its state by calling e.g.
+ // Kernel.SendExternalSignal.)
+ extMu sync.Mutex `state:"nosave"`
+
+ // started is true if Start has been called. Unless otherwise specified,
+ // all Kernel fields become immutable once started becomes true.
+ started bool `state:"nosave"`
+
+ // All of the following fields are immutable unless otherwise specified.
+
+ // Platform is the platform that is used to execute tasks in the created
+ // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
+ // embedded anonymously (the same issue applies).
+ platform.Platform `state:"nosave"`
+
+ // mf provides application memory.
+ mf *pgalloc.MemoryFile `state:"nosave"`
+
+ // See InitKernelArgs for the meaning of these fields.
+ featureSet *cpuid.FeatureSet
+ timekeeper *Timekeeper
+ tasks *TaskSet
+ rootUserNamespace *auth.UserNamespace
+ networkStack inet.Stack `state:"nosave"`
+ applicationCores uint
+ useHostCores bool
+ extraAuxv []arch.AuxEntry
+ vdso *loader.VDSO
+ rootUTSNamespace *UTSNamespace
+ rootIPCNamespace *IPCNamespace
+ rootAbstractSocketNamespace *AbstractSocketNamespace
+
+ // mounts holds the state of the virtual filesystem. mounts is initially
+ // nil, and must be set by calling Kernel.SetRootMountNamespace before
+ // Kernel.CreateProcess can succeed.
+ mounts *fs.MountNamespace
+
+ // futexes is the "root" futex.Manager, from which all others are forked.
+ // This is necessary to ensure that shared futexes are coherent across all
+ // tasks, including those created by CreateProcess.
+ futexes *futex.Manager
+
+ // globalInit is the thread group whose leader has ID 1 in the root PID
+ // namespace. globalInit is stored separately so that it is accessible even
+ // after all tasks in the thread group have exited, such that ID 1 is no
+ // longer mapped.
+ //
+ // globalInit is mutable until it is assigned by the first successful call
+ // to CreateProcess, and is protected by extMu.
+ globalInit *ThreadGroup
+
+ // realtimeClock is a ktime.Clock based on timekeeper's Realtime.
+ realtimeClock *timekeeperClock
+
+ // monotonicClock is a ktime.Clock based on timekeeper's Monotonic.
+ monotonicClock *timekeeperClock
+
+ // syslog is the kernel log.
+ syslog syslog
+
+ // cpuClock is incremented every linux.ClockTick. cpuClock is used to
+ // measure task CPU usage, since sampling monotonicClock twice on every
+ // syscall turns out to be unreasonably expensive. This is similar to how
+ // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
+ // although Linux also uses scheduler timing information to improve
+ // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
+ // since "preemptive" scheduling is managed by the Go runtime, which
+ // doesn't provide this information.
+ //
+ // cpuClock is mutable, and is accessed using atomic memory operations.
+ cpuClock uint64
+
+ // cpuClockTicker increments cpuClock.
+ cpuClockTicker *ktime.Timer `state:"nosave"`
+
+ // fdMapUids is an ever-increasing counter for generating FDMap uids.
+ //
+ // fdMapUids is mutable, and is accessed using atomic memory operations.
+ fdMapUids uint64
+
+ // uniqueID is used to generate unique identifiers.
+ //
+ // uniqueID is mutable, and is accessed using atomic memory operations.
+ uniqueID uint64
+
+ // nextInotifyCookie is a monotonically increasing counter used for
+ // generating unique inotify event cookies.
+ //
+ // nextInotifyCookie is mutable, and is accessed using atomic memory
+ // operations.
+ nextInotifyCookie uint32
+
+ // netlinkPorts manages allocation of netlink socket port IDs.
+ netlinkPorts *port.Manager
+
+ // saveErr is the error causing the sandbox to exit during save, if
+ // any. It is protected by extMu.
+ saveErr error `state:"nosave"`
+
+ // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
+ danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
+
+ // socketTable is used to track all sockets on the system. Protected by
+ // extMu.
+ socketTable map[int]map[*refs.WeakRef]struct{}
+
+ // deviceRegistry is used to save/restore device.SimpleDevices.
+ deviceRegistry struct{} `state:".(*device.Registry)"`
+
+ // DirentCacheLimiter controls the total number of dirent entries that can
+ // be held in caches. Not all caches use it; only the caches that use host
+ // resources use the limiter. It may be nil if disabled.
+ DirentCacheLimiter *fs.DirentCacheLimiter
+}
+
+// InitKernelArgs holds arguments to Init.
+type InitKernelArgs struct {
+ // FeatureSet is the emulated CPU feature set.
+ FeatureSet *cpuid.FeatureSet
+
+ // Timekeeper manages time for all tasks in the system.
+ Timekeeper *Timekeeper
+
+ // RootUserNamespace is the root user namespace.
+ RootUserNamespace *auth.UserNamespace
+
+ // NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
+ NetworkStack inet.Stack
+
+ // ApplicationCores is the number of logical CPUs visible to sandboxed
+ // applications. The set of logical CPU IDs is [0, ApplicationCores); thus
+ // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
+ // most significant bit in cpu_possible_mask + 1.
+ ApplicationCores uint
+
+ // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
+ // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
+ // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
+ // will be overridden.
+ UseHostCores bool
+
+ // ExtraAuxv contains additional auxiliary vector entries that are added to
+ // each process by the ELF loader.
+ ExtraAuxv []arch.AuxEntry
+
+ // Vdso holds the VDSO and its parameter page.
+ Vdso *loader.VDSO
+
+ // RootUTSNamespace is the root UTS namespace.
+ RootUTSNamespace *UTSNamespace
+
+ // RootIPCNamespace is the root IPC namespace.
+ RootIPCNamespace *IPCNamespace
+
+ // RootAbstractSocketNamespace is the root Abstract Socket namespace.
+ RootAbstractSocketNamespace *AbstractSocketNamespace
+}
+
+// Init initializes the Kernel with no tasks.
+//
+// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
+// before calling Init.
+func (k *Kernel) Init(args InitKernelArgs) error {
+ if args.FeatureSet == nil {
+ return fmt.Errorf("FeatureSet is nil")
+ }
+ if args.Timekeeper == nil {
+ return fmt.Errorf("Timekeeper is nil")
+ }
+ if args.RootUserNamespace == nil {
+ return fmt.Errorf("RootUserNamespace is nil")
+ }
+ if args.ApplicationCores == 0 {
+ return fmt.Errorf("ApplicationCores is 0")
+ }
+
+ k.featureSet = args.FeatureSet
+ k.timekeeper = args.Timekeeper
+ k.tasks = newTaskSet()
+ k.rootUserNamespace = args.RootUserNamespace
+ k.rootUTSNamespace = args.RootUTSNamespace
+ k.rootIPCNamespace = args.RootIPCNamespace
+ k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
+ k.networkStack = args.NetworkStack
+ k.applicationCores = args.ApplicationCores
+ if args.UseHostCores {
+ k.useHostCores = true
+ maxCPU, err := hostcpu.MaxPossibleCPU()
+ if err != nil {
+ return fmt.Errorf("Failed to get maximum CPU number: %v", err)
+ }
+ minAppCores := uint(maxCPU) + 1
+ if k.applicationCores < minAppCores {
+ log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
+ k.applicationCores = minAppCores
+ }
+ }
+ k.extraAuxv = args.ExtraAuxv
+ k.vdso = args.Vdso
+ k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime}
+ k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
+ k.futexes = futex.NewManager()
+ k.netlinkPorts = port.New()
+ k.socketTable = make(map[int]map[*refs.WeakRef]struct{})
+
+ return nil
+}
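
Taken together with the preconditions in the doc comment, a minimal bring-up order looks roughly like the sketch below (illustrative only; the platform, memory file, and argument values are caller-supplied placeholders).

// initSketch outlines the documented call order for constructing a Kernel.
// Illustrative only.
func initSketch(p platform.Platform, mf *pgalloc.MemoryFile, args InitKernelArgs) (*Kernel, error) {
	k := &Kernel{}
	k.Platform = p      // must be set manually before Init
	k.SetMemoryFile(mf) // must be called before Init or LoadFrom
	if err := k.Init(args); err != nil {
		return nil, err
	}
	return k, nil
}
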
+
+// SaveTo saves the state of k to w.
+//
+// Preconditions: The kernel must be paused throughout the call to SaveTo.
+func (k *Kernel) SaveTo(w io.Writer) error {
+ saveStart := time.Now()
+ ctx := k.SupervisorContext()
+
+ // Do not allow other Kernel methods to affect it while it's being saved.
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+
+ // Stop time.
+ k.pauseTimeLocked()
+ defer k.resumeTimeLocked()
+
+ // Evict all evictable MemoryFile allocations.
+ k.mf.StartEvictions()
+ k.mf.WaitForEvictions()
+
+ // Flush write operations on open files so data reaches backing storage.
+ // This must come after MemoryFile eviction since eviction may cause file
+ // writes.
+ if err := k.tasks.flushWritesToFiles(ctx); err != nil {
+ return err
+ }
+
+ // Remove all epoll waiter objects from underlying wait queues.
+ // NOTE: for programs to resume execution in future snapshot scenarios,
+ // we will need to re-establish these waiter objects after saving.
+ k.tasks.unregisterEpollWaiters()
+
+ // Clear the dirent cache before saving because Dirents must be Loaded in a
+ // particular order (parents before children), and Loading dirents from a cache
+ // breaks that order.
+ if err := k.flushMountSourceRefs(); err != nil {
+ return err
+ }
+
+ // Ensure that all pending asynchronous work is complete:
+ // - inode and mount release
+ // - asynchronous IO
+ fs.AsyncBarrier()
+
+ // Once all fs work has completed (flushed references have all been released),
+ // reset mount mappings. This allows individual mounts to save how inodes map
+ // to filesystem resources. Without this, fs.Inodes cannot be restored.
+ fs.SaveInodeMappings()
+
+ // Discard unsavable mappings, such as those for host file descriptors.
+ // This must be done after waiting for "asynchronous fs work", which
+ // includes async I/O that may touch application memory.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
+
+ // Save the CPUID FeatureSet before the rest of the kernel so we can
+ // verify its compatibility on restore before attempting to restore the
+ // entire kernel, which may fail on an incompatible machine.
+ //
+ // N.B. This will also be saved along with the full kernel save below.
+ cpuidStart := time.Now()
+ if err := state.Save(w, k.FeatureSet(), nil); err != nil {
+ return err
+ }
+ log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
+
+ // Save the kernel state.
+ kernelStart := time.Now()
+ var stats state.Stats
+ if err := state.Save(w, k, &stats); err != nil {
+ return err
+ }
+ log.Infof("Kernel save stats: %s", &stats)
+ log.Infof("Kernel save took [%s].", time.Since(kernelStart))
+
+ // Save the memory file's state.
+ memoryStart := time.Now()
+ if err := k.mf.SaveTo(w); err != nil {
+ return err
+ }
+ log.Infof("Memory save took [%s].", time.Since(memoryStart))
+
+ log.Infof("Overall save took [%s].", time.Since(saveStart))
+
+ return nil
+}
+
+// flushMountSourceRefs flushes the MountSources for all mounted filesystems
+// and open FDs.
+func (k *Kernel) flushMountSourceRefs() error {
+ // Flush all mount sources for currently mounted filesystems.
+ k.mounts.FlushMountSourceRefs()
+
+ // There may be some open FDs whose filesystems have been unmounted. We
+ // must flush those as well.
+ return k.tasks.forEachFDPaused(func(desc descriptor) error {
+ desc.file.Dirent.Inode.MountSource.FlushDirentRefs()
+ return nil
+ })
+}
+
+// forEachFDPaused applies the given function to each open file descriptor in each
+// task.
+//
+// Precondition: Must be called with the kernel paused.
+func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error {
+ ts.mu.RLock()
+ defer ts.mu.RUnlock()
+ for t := range ts.Root.tids {
+ // We can skip locking Task.mu here since the kernel is paused.
+ if t.fds == nil {
+ continue
+ }
+ for _, desc := range t.fds.files {
+ if err := f(desc); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
+ return ts.forEachFDPaused(func(desc descriptor) error {
+ if flags := desc.file.Flags(); !flags.Write {
+ return nil
+ }
+ if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
+ return nil
+ }
+ // Here we need all metadata synced.
+ syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ if err := fs.SaveFileFsyncError(syncErr); err != nil {
+ name, _ := desc.file.Dirent.FullName(nil /* root */)
+ // Wrap this error in ErrSaveRejection
+ // so that it will trigger a save
+ // error, rather than a panic. This
+ // also allows us to distinguish Fsync
+ // errors from state file errors in
+ // state.Save.
+ return fs.ErrSaveRejection{
+ Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err),
+ }
+ }
+ return nil
+ })
+}
+
+// Preconditions: The kernel must be paused.
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
+ invalidated := make(map[*mm.MemoryManager]struct{})
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+ for t := range k.tasks.Root.tids {
+ // We can skip locking Task.mu here since the kernel is paused.
+ if mm := t.tc.MemoryManager; mm != nil {
+ if _, ok := invalidated[mm]; !ok {
+ if err := mm.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ invalidated[mm] = struct{}{}
+ }
+ }
+ // I really wish we just had a sync.Map of all MMs...
+ if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
+ if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+func (ts *TaskSet) unregisterEpollWaiters() {
+ ts.mu.RLock()
+ defer ts.mu.RUnlock()
+ for t := range ts.Root.tids {
+ // We can skip locking Task.mu here since the kernel is paused.
+ if fdmap := t.fds; fdmap != nil {
+ for _, desc := range fdmap.files {
+ if desc.file != nil {
+ if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
+ }
+ }
+ }
+ }
+ }
+}
+
+// LoadFrom restores the state of k from r, which must contain a state image
+// previously written by SaveTo.
+func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
+ loadStart := time.Now()
+
+ k.networkStack = net
+
+ initAppCores := k.applicationCores
+
+ // Load the pre-saved CPUID FeatureSet.
+ //
+ // N.B. This was also saved along with the full kernel below, so we
+ // don't need to explicitly install it in the Kernel.
+ cpuidStart := time.Now()
+ var features cpuid.FeatureSet
+ if err := state.Load(r, &features, nil); err != nil {
+ return err
+ }
+ log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
+
+ // Verify that the FeatureSet is usable on this host. We do this before
+ // Kernel load so that the explicit CPUID mismatch error has priority
+ // over floating point state restore errors that may occur on load on
+ // an incompatible machine.
+ if err := features.CheckHostCompatible(); err != nil {
+ return err
+ }
+
+ // Load the kernel state.
+ kernelStart := time.Now()
+ var stats state.Stats
+ if err := state.Load(r, k, &stats); err != nil {
+ return err
+ }
+ log.Infof("Kernel load stats: %s", &stats)
+ log.Infof("Kernel load took [%s].", time.Since(kernelStart))
+
+ // Load the memory file's state.
+ memoryStart := time.Now()
+ if err := k.mf.LoadFrom(r); err != nil {
+ return err
+ }
+ log.Infof("Memory load took [%s].", time.Since(memoryStart))
+
+ // Ensure that all pending asynchronous work is complete:
+ // - namedpipe opening
+ // - inode file opening
+ if err := fs.AsyncErrorBarrier(); err != nil {
+ return err
+ }
+
+ tcpip.AsyncLoading.Wait()
+
+ log.Infof("Overall load took [%s]", time.Since(loadStart))
+
+ // Applications may size per-cpu structures based on k.applicationCores, so
+ // it can't change across save/restore. When we are virtualizing CPU
+ // numbers, this isn't a problem. However, when we are exposing host CPU
+ // assignments, we can't tolerate an increase in the number of host CPUs,
+ // which could result in getcpu(2) returning CPUs that applications expect
+ // not to exist.
+ if k.useHostCores && initAppCores > k.applicationCores {
+ return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
+ }
+
+ return nil
+}
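
SaveTo and LoadFrom are intended to be used as a pair across a sandbox checkpoint and restore. The sketch below is illustrative only: it assumes the restored Kernel has already had its Platform and MemoryFile set (see SetMemoryFile), and the writer, reader, and network stack are placeholders supplied by the caller.

// saveRestoreSketch pairs SaveTo with LoadFrom. Illustrative only.
func saveRestoreSketch(k, restored *Kernel, w io.Writer, r io.Reader, net inet.Stack) error {
	// Save side: SaveTo requires the kernel to be paused for the whole call.
	k.Pause()
	defer k.Unpause()
	if err := k.SaveTo(w); err != nil {
		return err
	}

	// Restore side: load the saved state into the new Kernel, then resume
	// task execution.
	if err := restored.LoadFrom(r, net); err != nil {
		return err
	}
	return restored.Start()
}
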
+
+// Destroy releases resources owned by k.
+//
+// Preconditions: There must be no task goroutines running in k.
+func (k *Kernel) Destroy() {
+ if k.mounts != nil {
+ k.mounts.DecRef()
+ k.mounts = nil
+ }
+}
+
+// UniqueID returns a unique identifier.
+func (k *Kernel) UniqueID() uint64 {
+ id := atomic.AddUint64(&k.uniqueID, 1)
+ if id == 0 {
+ panic("unique identifier generator wrapped around")
+ }
+ return id
+}
+
+// CreateProcessArgs holds arguments to kernel.CreateProcess.
+type CreateProcessArgs struct {
+ // Filename is the filename to load.
+ //
+ // If this is provided as "", then the file will be guessed via Argv[0].
+ Filename string
+
+ // Argv is a list of arguments.
+ Argv []string
+
+ // Envv is a list of environment variables.
+ Envv []string
+
+ // WorkingDirectory is the initial working directory.
+ //
+ // This defaults to the root if empty.
+ WorkingDirectory string
+
+ // Credentials is the initial credentials.
+ Credentials *auth.Credentials
+
+ // FDMap is the initial set of file descriptors. If CreateProcess succeeds,
+ // it takes a reference on FDMap.
+ FDMap *FDMap
+
+ // Umask is the initial umask.
+ Umask uint
+
+ // Limits is the initial resource limits.
+ Limits *limits.LimitSet
+
+ // MaxSymlinkTraversals is the maximum number of symlinks to follow
+ // during resolution.
+ MaxSymlinkTraversals uint
+
+ // UTSNamespace is the initial UTS namespace.
+ UTSNamespace *UTSNamespace
+
+ // IPCNamespace is the initial IPC namespace.
+ IPCNamespace *IPCNamespace
+
+ // AbstractSocketNamespace is the initial Abstract Socket namespace.
+ AbstractSocketNamespace *AbstractSocketNamespace
+
+ // Root optionally contains the dirent that serves as the root for the
+ // process. If nil, the mount namespace's root is used as the process'
+ // root.
+ //
+ // Anyone setting Root must donate a reference (i.e. increment it) to
+ // keep it alive until it is decremented by CreateProcess.
+ Root *fs.Dirent
+
+ // ContainerID is the container that the process belongs to.
+ ContainerID string
+}
+
+// NewContext returns a context.Context that represents the task that will be
+// created from args by Kernel.CreateProcess.
+func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
+ return &createProcessContext{
+ Logger: log.Log(),
+ k: k,
+ args: args,
+ }
+}
+
+// createProcessContext is a context.Context that represents the context
+// associated with a task that is being created.
+type createProcessContext struct {
+ context.NoopSleeper
+ log.Logger
+ k *Kernel
+ args *CreateProcessArgs
+}
+
+// Value implements context.Context.Value.
+func (ctx *createProcessContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxKernel:
+ return ctx.k
+ case CtxPIDNamespace:
+ // "The new task ... is in the root PID namespace." -
+ // Kernel.CreateProcess
+ return ctx.k.tasks.Root
+ case CtxUTSNamespace:
+ return ctx.args.UTSNamespace
+ case CtxIPCNamespace:
+ return ctx.args.IPCNamespace
+ case auth.CtxCredentials:
+ return ctx.args.Credentials
+ case fs.CtxRoot:
+ if ctx.args.Root != nil {
+ // Take a reference on the root dirent that will be
+ // given to the caller.
+ ctx.args.Root.IncRef()
+ return ctx.args.Root
+ }
+ if ctx.k.mounts != nil {
+ // MountNamespace.Root() will take a reference on the
+ // root dirent for us.
+ return ctx.k.mounts.Root()
+ }
+ return nil
+ case fs.CtxDirentCacheLimiter:
+ return ctx.k.DirentCacheLimiter
+ case ktime.CtxRealtimeClock:
+ return ctx.k.RealtimeClock()
+ case limits.CtxLimits:
+ return ctx.args.Limits
+ case pgalloc.CtxMemoryFile:
+ return ctx.k.mf
+ case pgalloc.CtxMemoryFileProvider:
+ return ctx.k
+ case platform.CtxPlatform:
+ return ctx.k
+ case uniqueid.CtxGlobalUniqueID:
+ return ctx.k.UniqueID()
+ case uniqueid.CtxGlobalUniqueIDProvider:
+ return ctx.k
+ case uniqueid.CtxInotifyCookie:
+ return ctx.k.GenerateInotifyCookie()
+ case unimpl.CtxEvents:
+ return ctx.k
+ default:
+ return nil
+ }
+}
+
+// CreateProcess creates a new task in a new thread group with the given
+// options. The new task has no parent and is in the root PID namespace.
+//
+// If k.Start() has already been called, then the created process must be
+// started by calling kernel.StartProcess(tg).
+//
+// If k.Start() has not yet been called, then the created task will begin
+// running when k.Start() is called.
+//
+// CreateProcess has no analogue in Linux; it is used to create the initial
+// application task, as well as processes started by the control server.
+func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ log.Infof("EXEC: %v", args.Argv)
+
+ if k.mounts == nil {
+ return nil, 0, fmt.Errorf("no kernel MountNamespace")
+ }
+
+ tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ ctx := args.NewContext(k)
+
+ // Grab the root directory.
+ root := args.Root
+ if root == nil {
+ root = fs.RootFromContext(ctx)
+ // Is the root STILL nil?
+ if root == nil {
+ return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context")
+ }
+ }
+ defer root.DecRef()
+ args.Root = nil
+
+ // Grab the working directory.
+ remainingTraversals := uint(args.MaxSymlinkTraversals)
+ wd := root // Default.
+ if args.WorkingDirectory != "" {
+ var err error
+ wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+ if err != nil {
+ return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+ }
+ defer wd.DecRef()
+ }
+
+ if args.Filename == "" {
+ // Was anything provided?
+ if len(args.Argv) == 0 {
+ return nil, 0, fmt.Errorf("no filename or command provided")
+ }
+ if !filepath.IsAbs(args.Argv[0]) {
+ return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
+ }
+ args.Filename = args.Argv[0]
+ }
+
+ // Create a fresh task context.
+ remainingTraversals = uint(args.MaxSymlinkTraversals)
+ tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
+ if se != nil {
+ return nil, 0, errors.New(se.String())
+ }
+
+ // Take a reference on the FDMap, which will be transferred to
+ // TaskSet.NewTask().
+ args.FDMap.IncRef()
+
+ // Create the task.
+ config := &TaskConfig{
+ Kernel: k,
+ ThreadGroup: tg,
+ TaskContext: tc,
+ FSContext: newFSContext(root, wd, args.Umask),
+ FDMap: args.FDMap,
+ Credentials: args.Credentials,
+ AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
+ UTSNamespace: args.UTSNamespace,
+ IPCNamespace: args.IPCNamespace,
+ AbstractSocketNamespace: args.AbstractSocketNamespace,
+ ContainerID: args.ContainerID,
+ }
+ if _, err := k.tasks.NewTask(config); err != nil {
+ return nil, 0, err
+ }
+
+ // Success.
+ tgid := k.tasks.Root.IDOfThreadGroup(tg)
+ if k.globalInit == nil {
+ k.globalInit = tg
+ }
+ return tg, tgid, nil
+}
+
+// StartProcess starts running a process that was created with CreateProcess.
+func (k *Kernel) StartProcess(tg *ThreadGroup) {
+ t := tg.Leader()
+ tid := k.tasks.Root.IDOfTask(t)
+ t.Start(tid)
+}
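
Per the doc comment on CreateProcess, a process created after the kernel has been started must be started explicitly. An illustrative sketch (not part of the diff), with args a placeholder built by the caller:

// spawnSketch shows the documented CreateProcess/StartProcess pairing for a
// kernel that has already been started. Illustrative only.
func spawnSketch(k *Kernel, args CreateProcessArgs) (ThreadID, error) {
	tg, tid, err := k.CreateProcess(args)
	if err != nil {
		return 0, err
	}
	// Because k.Start() has already been called, the new thread group must
	// be started explicitly.
	k.StartProcess(tg)
	return tid, nil
}
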
+
+// Start starts execution of all tasks in k.
+//
+// Preconditions: Start may be called exactly once.
+func (k *Kernel) Start() error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+
+ if k.globalInit == nil {
+ return fmt.Errorf("kernel contains no tasks")
+ }
+ if k.started {
+ return fmt.Errorf("kernel already started")
+ }
+
+ k.started = true
+ k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k))
+ k.cpuClockTicker.Swap(ktime.Setting{
+ Enabled: true,
+ Period: linux.ClockTick,
+ })
+ // If k was restored via Kernel.LoadFrom, timers were stopped during
+ // Kernel.SaveTo and need to be resumed. If k was freshly created via
+ // Kernel.Init, this is a no-op.
+ k.resumeTimeLocked()
+ // Start task goroutines.
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+ for t, tid := range k.tasks.Root.tids {
+ t.Start(tid)
+ }
+ return nil
+}
+
+// pauseTimeLocked pauses all Timers and Timekeeper updates.
+//
+// Preconditions: Any task goroutines running in k must be stopped. k.extMu
+// must be locked.
+func (k *Kernel) pauseTimeLocked() {
+ // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
+ // Kernel.Start().
+ if k.cpuClockTicker != nil {
+ k.cpuClockTicker.Pause()
+ }
+
+ // By precondition, nothing else can be interacting with PIDNamespace.tids
+ // or FDMap.files, so we can iterate them without synchronization. (We
+ // can't hold the TaskSet mutex when pausing thread group timers because
+ // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
+ // mutex, while holding the Timer mutex.)
+ for t := range k.tasks.Root.tids {
+ if t == t.tg.leader {
+ t.tg.itimerRealTimer.Pause()
+ for _, it := range t.tg.timers {
+ it.PauseTimer()
+ }
+ }
+ // This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+ // but ktime.Timer.Pause is idempotent so this is harmless.
+ if fdm := t.fds; fdm != nil {
+ for _, desc := range fdm.files {
+ if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ tfd.PauseTimer()
+ }
+ }
+ }
+ }
+ k.timekeeper.PauseUpdates()
+}
+
+// resumeTimeLocked resumes all Timers and Timekeeper updates. If
+// pauseTimeLocked has not been previously called, resumeTimeLocked has no
+// effect.
+//
+// Preconditions: Any task goroutines running in k must be stopped. k.extMu
+// must be locked.
+func (k *Kernel) resumeTimeLocked() {
+ if k.cpuClockTicker != nil {
+ k.cpuClockTicker.Resume()
+ }
+
+ k.timekeeper.ResumeUpdates()
+ for t := range k.tasks.Root.tids {
+ if t == t.tg.leader {
+ t.tg.itimerRealTimer.Resume()
+ for _, it := range t.tg.timers {
+ it.ResumeTimer()
+ }
+ }
+ if fdm := t.fds; fdm != nil {
+ for _, desc := range fdm.files {
+ if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ tfd.ResumeTimer()
+ }
+ }
+ }
+ }
+}
+
+// WaitExited blocks until all tasks in k have exited.
+func (k *Kernel) WaitExited() {
+ k.tasks.liveGoroutines.Wait()
+}
+
+// Kill requests that all tasks in k immediately exit as if group exiting with
+// status es. Kill does not wait for tasks to exit.
+func (k *Kernel) Kill(es ExitStatus) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.Kill(es)
+}
+
+// Pause requests that all tasks in k temporarily stop executing, and blocks
+// until all tasks in k have stopped. Multiple calls to Pause nest and require
+// an equal number of calls to Unpause to resume execution.
+func (k *Kernel) Pause() {
+ k.extMu.Lock()
+ k.tasks.BeginExternalStop()
+ k.extMu.Unlock()
+ k.tasks.runningGoroutines.Wait()
+}
+
+// Unpause ends the effect of a previous call to Pause. If Unpause is called
+// without a matching preceding call to Pause, Unpause may panic.
+func (k *Kernel) Unpause() {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.EndExternalStop()
+}
+
+// SendExternalSignal injects a signal into the kernel.
+//
+// context is used only for debugging to describe how the signal was received.
+//
+// Preconditions: Kernel must have an init process.
+func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.sendExternalSignal(info, context)
+}
+
+// SendContainerSignal sends the given signal to all processes inside the
+// namespace that match the given container ID.
+func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+
+ var lastErr error
+ for tg := range k.tasks.Root.tgids {
+ if tg.leader.ContainerID() == cid {
+ tg.signalHandlers.mu.Lock()
+ infoCopy := *info
+ if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
+ lastErr = err
+ }
+ tg.signalHandlers.mu.Unlock()
+ }
+ }
+ return lastErr
+}
+
+// FeatureSet returns the FeatureSet.
+func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
+ return k.featureSet
+}
+
+// Timekeeper returns the Timekeeper.
+func (k *Kernel) Timekeeper() *Timekeeper {
+ return k.timekeeper
+}
+
+// TaskSet returns the TaskSet.
+func (k *Kernel) TaskSet() *TaskSet {
+ return k.tasks
+}
+
+// RootUserNamespace returns the root UserNamespace.
+func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
+ return k.rootUserNamespace
+}
+
+// RootUTSNamespace returns the root UTSNamespace.
+func (k *Kernel) RootUTSNamespace() *UTSNamespace {
+ return k.rootUTSNamespace
+}
+
+// RootIPCNamespace returns the root IPCNamespace.
+func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ return k.rootIPCNamespace
+}
+
+// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
+func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
+ return k.rootAbstractSocketNamespace
+}
+
+// RootMountNamespace returns the MountNamespace.
+func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.mounts
+}
+
+// SetRootMountNamespace sets the MountNamespace.
+func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.mounts = mounts
+}
+
+// NetworkStack returns the network stack. NetworkStack may return nil if no
+// network stack is available.
+func (k *Kernel) NetworkStack() inet.Stack {
+ return k.networkStack
+}
+
+// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
+// nil if no such thread group exists. GlobalInit may return a thread group
+// containing no tasks if the thread group has already exited.
+func (k *Kernel) GlobalInit() *ThreadGroup {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.globalInit
+}
+
+// ApplicationCores returns the number of CPUs visible to sandboxed
+// applications.
+func (k *Kernel) ApplicationCores() uint {
+ return k.applicationCores
+}
+
+// RealtimeClock returns the application CLOCK_REALTIME clock.
+func (k *Kernel) RealtimeClock() ktime.Clock {
+ return k.realtimeClock
+}
+
+// MonotonicClock returns the application CLOCK_MONOTONIC clock.
+func (k *Kernel) MonotonicClock() ktime.Clock {
+ return k.monotonicClock
+}
+
+// CPUClockNow returns the current value of k.cpuClock.
+func (k *Kernel) CPUClockNow() uint64 {
+ return atomic.LoadUint64(&k.cpuClock)
+}
+
+// Syslog returns the syslog.
+func (k *Kernel) Syslog() *syslog {
+ return &k.syslog
+}
+
+// GenerateInotifyCookie generates a unique inotify event cookie.
+//
+// Returned values may overlap with previously returned values if the value
+// space is exhausted. 0 is not a valid cookie value; all other values
+// representable in a uint32 are allowed.
+func (k *Kernel) GenerateInotifyCookie() uint32 {
+ id := atomic.AddUint32(&k.nextInotifyCookie, 1)
+ // Wrap-around is explicitly allowed for inotify event cookies.
+ if id == 0 {
+ id = atomic.AddUint32(&k.nextInotifyCookie, 1)
+ }
+ return id
+}
+
+// NetlinkPorts returns the netlink port manager.
+func (k *Kernel) NetlinkPorts() *port.Manager {
+ return k.netlinkPorts
+}
+
+// SaveError returns the sandbox error that caused the kernel to exit during
+// save.
+func (k *Kernel) SaveError() error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.saveErr
+}
+
+// SetSaveError sets the sandbox error that caused the kernel to exit during
+// save, if one is not already set.
+func (k *Kernel) SetSaveError(err error) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ if k.saveErr == nil {
+ k.saveErr = err
+ }
+}
+
+var _ tcpip.Clock = (*Kernel)(nil)
+
+// NowNanoseconds implements tcpip.Clock.NowNanoseconds.
+func (k *Kernel) NowNanoseconds() int64 {
+ now, err := k.timekeeper.GetTime(sentrytime.Realtime)
+ if err != nil {
+ panic("Kernel.NowNanoseconds: " + err.Error())
+ }
+ return now
+}
+
+// NowMonotonic implements tcpip.Clock.NowMonotonic.
+func (k *Kernel) NowMonotonic() int64 {
+ now, err := k.timekeeper.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ panic("Kernel.NowMonotonic: " + err.Error())
+ }
+ return now
+}
+
+// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
+// LoadFrom.
+func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
+ k.mf = mf
+}
+
+// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
+func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
+ return k.mf
+}
+
+// SupervisorContext returns a Context with maximum privileges in k. It should
+// only be used by goroutines outside the control of the emulated kernel
+// defined by k.
+//
+// Callers are responsible for ensuring that the returned Context is not used
+// concurrently with changes to the Kernel.
+func (k *Kernel) SupervisorContext() context.Context {
+ return supervisorContext{
+ Logger: log.Log(),
+ k: k,
+ }
+}
+
+// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
+// channel.
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
+ t := TaskFromContext(ctx)
+ eventchannel.Emit(&uspb.UnimplementedSyscall{
+ Tid: int32(t.ThreadID()),
+ Registers: t.Arch().StateData().Proto(),
+ })
+}
+
+// socketEntry represents a socket recorded in Kernel.socketTable. It implements
+// refs.WeakRefUser for sockets stored in the socket table.
+//
+// +stateify savable
+type socketEntry struct {
+ k *Kernel
+ sock *refs.WeakRef
+ family int
+}
+
+// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
+func (s *socketEntry) WeakRefGone() {
+ s.k.extMu.Lock()
+ // k.socketTable is guaranteed to point to a valid socket table for s.family
+ // at this point, since we made sure of the fact when we created this
+ // socketEntry, and we never delete socket tables.
+ delete(s.k.socketTable[s.family], s.sock)
+ s.k.extMu.Unlock()
+}
+
+// RecordSocket adds a socket to the system-wide socket table for tracking.
+//
+// Precondition: Caller must hold a reference to sock.
+func (k *Kernel) RecordSocket(sock *fs.File, family int) {
+ k.extMu.Lock()
+ table, ok := k.socketTable[family]
+ if !ok {
+ table = make(map[*refs.WeakRef]struct{})
+ k.socketTable[family] = table
+ }
+ se := socketEntry{k: k, family: family}
+ se.sock = refs.NewWeakRef(sock, &se)
+ table[se.sock] = struct{}{}
+ k.extMu.Unlock()
+}
+
+// ListSockets returns a snapshot of all sockets of a given family.
+func (k *Kernel) ListSockets(family int) []*refs.WeakRef {
+ k.extMu.Lock()
+ socks := []*refs.WeakRef{}
+ if table, ok := k.socketTable[family]; ok {
+ socks = make([]*refs.WeakRef, 0, len(table))
+ for s := range table {
+ socks = append(socks, s)
+ }
+ }
+ k.extMu.Unlock()
+ return socks
+}
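
RecordSocket and ListSockets form the tracking API for the system-wide socket table. The sketch below is illustrative only; note that ListSockets returns weak references, so an entry may already be gone by the time it is inspected and must be upgraded before use.

// socketTableSketch records a socket (the caller must already hold a
// reference on sock) and then snapshots all sockets of the same family.
// Illustrative only.
func socketTableSketch(k *Kernel, sock *fs.File, family int) int {
	k.RecordSocket(sock, family)
	weakRefs := k.ListSockets(family)
	return len(weakRefs)
}
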
+
+type supervisorContext struct {
+ context.NoopSleeper
+ log.Logger
+ k *Kernel
+}
+
+// Value implements context.Context.
+func (ctx supervisorContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxCanTrace:
+ // The supervisor context can trace anything. (None of
+ // supervisorContext's users are expected to invoke ptrace, but ptrace
+ // permissions are required for certain file accesses.)
+ return func(*Task, bool) bool { return true }
+ case CtxKernel:
+ return ctx.k
+ case CtxPIDNamespace:
+ return ctx.k.tasks.Root
+ case CtxUTSNamespace:
+ return ctx.k.rootUTSNamespace
+ case CtxIPCNamespace:
+ return ctx.k.rootIPCNamespace
+ case auth.CtxCredentials:
+ // The supervisor context is global root.
+ return auth.NewRootCredentials(ctx.k.rootUserNamespace)
+ case fs.CtxRoot:
+ return ctx.k.mounts.Root()
+ case fs.CtxDirentCacheLimiter:
+ return ctx.k.DirentCacheLimiter
+ case ktime.CtxRealtimeClock:
+ return ctx.k.RealtimeClock()
+ case limits.CtxLimits:
+ // No limits apply.
+ return limits.NewLimitSet()
+ case pgalloc.CtxMemoryFile:
+ return ctx.k.mf
+ case pgalloc.CtxMemoryFileProvider:
+ return ctx.k
+ case platform.CtxPlatform:
+ return ctx.k
+ case uniqueid.CtxGlobalUniqueID:
+ return ctx.k.UniqueID()
+ case uniqueid.CtxGlobalUniqueIDProvider:
+ return ctx.k
+ case uniqueid.CtxInotifyCookie:
+ return ctx.k.GenerateInotifyCookie()
+ case unimpl.CtxEvents:
+ return ctx.k
+ default:
+ return nil
+ }
+}
diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go
new file mode 100644
index 000000000..48c3ff5a9
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_state.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+)
+
+// saveDanglingEndpoints is invoked by stateify.
+func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint {
+ return tcpip.GetDanglingEndpoints()
+}
+
+// loadDanglingEndpoints is invoked by stateify.
+func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) {
+ for _, e := range es {
+ tcpip.AddDanglingEndpoint(e)
+ }
+}
+
+// saveDeviceRegistry is invoked by stateify.
+func (k *Kernel) saveDeviceRegistry() *device.Registry {
+ return device.SimpleDevices
+}
+
+// loadDeviceRegistry is invoked by stateify.
+func (k *Kernel) loadDeviceRegistry(r *device.Registry) {
+ device.SimpleDevices.LoadFrom(r)
+}
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
new file mode 100755
index 000000000..82fd0abfd
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -0,0 +1,1147 @@
+// automatically generated by stateify.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+)
+
+func (x *abstractEndpoint) beforeSave() {}
+func (x *abstractEndpoint) save(m state.Map) {
+ x.beforeSave()
+ m.Save("ep", &x.ep)
+ m.Save("wr", &x.wr)
+ m.Save("name", &x.name)
+ m.Save("ns", &x.ns)
+}
+
+func (x *abstractEndpoint) afterLoad() {}
+func (x *abstractEndpoint) load(m state.Map) {
+ m.Load("ep", &x.ep)
+ m.Load("wr", &x.wr)
+ m.Load("name", &x.name)
+ m.Load("ns", &x.ns)
+}
+
+func (x *AbstractSocketNamespace) beforeSave() {}
+func (x *AbstractSocketNamespace) save(m state.Map) {
+ x.beforeSave()
+ m.Save("endpoints", &x.endpoints)
+}
+
+func (x *AbstractSocketNamespace) afterLoad() {}
+func (x *AbstractSocketNamespace) load(m state.Map) {
+ m.Load("endpoints", &x.endpoints)
+}
+
+func (x *FDFlags) beforeSave() {}
+func (x *FDFlags) save(m state.Map) {
+ x.beforeSave()
+ m.Save("CloseOnExec", &x.CloseOnExec)
+}
+
+func (x *FDFlags) afterLoad() {}
+func (x *FDFlags) load(m state.Map) {
+ m.Load("CloseOnExec", &x.CloseOnExec)
+}
+
+func (x *descriptor) beforeSave() {}
+func (x *descriptor) save(m state.Map) {
+ x.beforeSave()
+ m.Save("file", &x.file)
+ m.Save("flags", &x.flags)
+}
+
+func (x *descriptor) afterLoad() {}
+func (x *descriptor) load(m state.Map) {
+ m.Load("file", &x.file)
+ m.Load("flags", &x.flags)
+}
+
+func (x *FDMap) beforeSave() {}
+func (x *FDMap) save(m state.Map) {
+ x.beforeSave()
+ m.Save("AtomicRefCount", &x.AtomicRefCount)
+ m.Save("k", &x.k)
+ m.Save("files", &x.files)
+ m.Save("uid", &x.uid)
+}
+
+func (x *FDMap) afterLoad() {}
+func (x *FDMap) load(m state.Map) {
+ m.Load("AtomicRefCount", &x.AtomicRefCount)
+ m.Load("k", &x.k)
+ m.Load("files", &x.files)
+ m.Load("uid", &x.uid)
+}
+
+func (x *FSContext) beforeSave() {}
+func (x *FSContext) save(m state.Map) {
+ x.beforeSave()
+ m.Save("AtomicRefCount", &x.AtomicRefCount)
+ m.Save("root", &x.root)
+ m.Save("cwd", &x.cwd)
+ m.Save("umask", &x.umask)
+}
+
+func (x *FSContext) afterLoad() {}
+func (x *FSContext) load(m state.Map) {
+ m.Load("AtomicRefCount", &x.AtomicRefCount)
+ m.Load("root", &x.root)
+ m.Load("cwd", &x.cwd)
+ m.Load("umask", &x.umask)
+}
+
+func (x *IPCNamespace) beforeSave() {}
+func (x *IPCNamespace) save(m state.Map) {
+ x.beforeSave()
+ m.Save("userNS", &x.userNS)
+ m.Save("semaphores", &x.semaphores)
+ m.Save("shms", &x.shms)
+}
+
+func (x *IPCNamespace) afterLoad() {}
+func (x *IPCNamespace) load(m state.Map) {
+ m.Load("userNS", &x.userNS)
+ m.Load("semaphores", &x.semaphores)
+ m.Load("shms", &x.shms)
+}
+
+func (x *Kernel) beforeSave() {}
+func (x *Kernel) save(m state.Map) {
+ x.beforeSave()
+ var danglingEndpoints []tcpip.Endpoint = x.saveDanglingEndpoints()
+ m.SaveValue("danglingEndpoints", danglingEndpoints)
+ var deviceRegistry *device.Registry = x.saveDeviceRegistry()
+ m.SaveValue("deviceRegistry", deviceRegistry)
+ m.Save("featureSet", &x.featureSet)
+ m.Save("timekeeper", &x.timekeeper)
+ m.Save("tasks", &x.tasks)
+ m.Save("rootUserNamespace", &x.rootUserNamespace)
+ m.Save("applicationCores", &x.applicationCores)
+ m.Save("useHostCores", &x.useHostCores)
+ m.Save("extraAuxv", &x.extraAuxv)
+ m.Save("vdso", &x.vdso)
+ m.Save("rootUTSNamespace", &x.rootUTSNamespace)
+ m.Save("rootIPCNamespace", &x.rootIPCNamespace)
+ m.Save("rootAbstractSocketNamespace", &x.rootAbstractSocketNamespace)
+ m.Save("mounts", &x.mounts)
+ m.Save("futexes", &x.futexes)
+ m.Save("globalInit", &x.globalInit)
+ m.Save("realtimeClock", &x.realtimeClock)
+ m.Save("monotonicClock", &x.monotonicClock)
+ m.Save("syslog", &x.syslog)
+ m.Save("cpuClock", &x.cpuClock)
+ m.Save("fdMapUids", &x.fdMapUids)
+ m.Save("uniqueID", &x.uniqueID)
+ m.Save("nextInotifyCookie", &x.nextInotifyCookie)
+ m.Save("netlinkPorts", &x.netlinkPorts)
+ m.Save("socketTable", &x.socketTable)
+ m.Save("DirentCacheLimiter", &x.DirentCacheLimiter)
+}
+
+func (x *Kernel) afterLoad() {}
+func (x *Kernel) load(m state.Map) {
+ m.Load("featureSet", &x.featureSet)
+ m.Load("timekeeper", &x.timekeeper)
+ m.Load("tasks", &x.tasks)
+ m.Load("rootUserNamespace", &x.rootUserNamespace)
+ m.Load("applicationCores", &x.applicationCores)
+ m.Load("useHostCores", &x.useHostCores)
+ m.Load("extraAuxv", &x.extraAuxv)
+ m.Load("vdso", &x.vdso)
+ m.Load("rootUTSNamespace", &x.rootUTSNamespace)
+ m.Load("rootIPCNamespace", &x.rootIPCNamespace)
+ m.Load("rootAbstractSocketNamespace", &x.rootAbstractSocketNamespace)
+ m.Load("mounts", &x.mounts)
+ m.Load("futexes", &x.futexes)
+ m.Load("globalInit", &x.globalInit)
+ m.Load("realtimeClock", &x.realtimeClock)
+ m.Load("monotonicClock", &x.monotonicClock)
+ m.Load("syslog", &x.syslog)
+ m.Load("cpuClock", &x.cpuClock)
+ m.Load("fdMapUids", &x.fdMapUids)
+ m.Load("uniqueID", &x.uniqueID)
+ m.Load("nextInotifyCookie", &x.nextInotifyCookie)
+ m.Load("netlinkPorts", &x.netlinkPorts)
+ m.Load("socketTable", &x.socketTable)
+ m.Load("DirentCacheLimiter", &x.DirentCacheLimiter)
+ m.LoadValue("danglingEndpoints", new([]tcpip.Endpoint), func(y interface{}) { x.loadDanglingEndpoints(y.([]tcpip.Endpoint)) })
+ m.LoadValue("deviceRegistry", new(*device.Registry), func(y interface{}) { x.loadDeviceRegistry(y.(*device.Registry)) })
+}
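+
+// The SaveValue/LoadValue pairs above follow the stateify convention used in
+// this package for fields tagged `state:".(T)"`: the enclosing type supplies
+// save<Field> and load<Field> hooks that convert the field to and from its
+// saved representation (as saveDanglingEndpoints and loadDanglingEndpoints
+// do in kernel_state.go). A minimal sketch of the pattern, using a
+// hypothetical type and field:
+//
+//	type counter struct {
+//		ch chan struct{} `state:".(int)"` // saved as its capacity only.
+//	}
+//
+//	func (c *counter) saveCh() int     { return cap(c.ch) }
+//	func (c *counter) loadCh(size int) { c.ch = make(chan struct{}, size) }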
+
+func (x *socketEntry) beforeSave() {}
+func (x *socketEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("k", &x.k)
+ m.Save("sock", &x.sock)
+ m.Save("family", &x.family)
+}
+
+func (x *socketEntry) afterLoad() {}
+func (x *socketEntry) load(m state.Map) {
+ m.Load("k", &x.k)
+ m.Load("sock", &x.sock)
+ m.Load("family", &x.family)
+}
+
+func (x *pendingSignals) beforeSave() {}
+func (x *pendingSignals) save(m state.Map) {
+ x.beforeSave()
+ var signals []savedPendingSignal = x.saveSignals()
+ m.SaveValue("signals", signals)
+}
+
+func (x *pendingSignals) afterLoad() {}
+func (x *pendingSignals) load(m state.Map) {
+ m.LoadValue("signals", new([]savedPendingSignal), func(y interface{}) { x.loadSignals(y.([]savedPendingSignal)) })
+}
+
+func (x *pendingSignalQueue) beforeSave() {}
+func (x *pendingSignalQueue) save(m state.Map) {
+ x.beforeSave()
+ m.Save("pendingSignalList", &x.pendingSignalList)
+ m.Save("length", &x.length)
+}
+
+func (x *pendingSignalQueue) afterLoad() {}
+func (x *pendingSignalQueue) load(m state.Map) {
+ m.Load("pendingSignalList", &x.pendingSignalList)
+ m.Load("length", &x.length)
+}
+
+func (x *pendingSignal) beforeSave() {}
+func (x *pendingSignal) save(m state.Map) {
+ x.beforeSave()
+ m.Save("pendingSignalEntry", &x.pendingSignalEntry)
+ m.Save("SignalInfo", &x.SignalInfo)
+ m.Save("timer", &x.timer)
+}
+
+func (x *pendingSignal) afterLoad() {}
+func (x *pendingSignal) load(m state.Map) {
+ m.Load("pendingSignalEntry", &x.pendingSignalEntry)
+ m.Load("SignalInfo", &x.SignalInfo)
+ m.Load("timer", &x.timer)
+}
+
+func (x *pendingSignalList) beforeSave() {}
+func (x *pendingSignalList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *pendingSignalList) afterLoad() {}
+func (x *pendingSignalList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *pendingSignalEntry) beforeSave() {}
+func (x *pendingSignalEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *pendingSignalEntry) afterLoad() {}
+func (x *pendingSignalEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func (x *savedPendingSignal) beforeSave() {}
+func (x *savedPendingSignal) save(m state.Map) {
+ x.beforeSave()
+ m.Save("si", &x.si)
+ m.Save("timer", &x.timer)
+}
+
+func (x *savedPendingSignal) afterLoad() {}
+func (x *savedPendingSignal) load(m state.Map) {
+ m.Load("si", &x.si)
+ m.Load("timer", &x.timer)
+}
+
+func (x *IntervalTimer) beforeSave() {}
+func (x *IntervalTimer) save(m state.Map) {
+ x.beforeSave()
+ m.Save("timer", &x.timer)
+ m.Save("target", &x.target)
+ m.Save("signo", &x.signo)
+ m.Save("id", &x.id)
+ m.Save("sigval", &x.sigval)
+ m.Save("group", &x.group)
+ m.Save("sigpending", &x.sigpending)
+ m.Save("sigorphan", &x.sigorphan)
+ m.Save("overrunCur", &x.overrunCur)
+ m.Save("overrunLast", &x.overrunLast)
+}
+
+func (x *IntervalTimer) afterLoad() {}
+func (x *IntervalTimer) load(m state.Map) {
+ m.Load("timer", &x.timer)
+ m.Load("target", &x.target)
+ m.Load("signo", &x.signo)
+ m.Load("id", &x.id)
+ m.Load("sigval", &x.sigval)
+ m.Load("group", &x.group)
+ m.Load("sigpending", &x.sigpending)
+ m.Load("sigorphan", &x.sigorphan)
+ m.Load("overrunCur", &x.overrunCur)
+ m.Load("overrunLast", &x.overrunLast)
+}
+
+func (x *processGroupList) beforeSave() {}
+func (x *processGroupList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *processGroupList) afterLoad() {}
+func (x *processGroupList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *processGroupEntry) beforeSave() {}
+func (x *processGroupEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *processGroupEntry) afterLoad() {}
+func (x *processGroupEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func (x *ptraceOptions) beforeSave() {}
+func (x *ptraceOptions) save(m state.Map) {
+ x.beforeSave()
+ m.Save("ExitKill", &x.ExitKill)
+ m.Save("SysGood", &x.SysGood)
+ m.Save("TraceClone", &x.TraceClone)
+ m.Save("TraceExec", &x.TraceExec)
+ m.Save("TraceExit", &x.TraceExit)
+ m.Save("TraceFork", &x.TraceFork)
+ m.Save("TraceSeccomp", &x.TraceSeccomp)
+ m.Save("TraceVfork", &x.TraceVfork)
+ m.Save("TraceVforkDone", &x.TraceVforkDone)
+}
+
+func (x *ptraceOptions) afterLoad() {}
+func (x *ptraceOptions) load(m state.Map) {
+ m.Load("ExitKill", &x.ExitKill)
+ m.Load("SysGood", &x.SysGood)
+ m.Load("TraceClone", &x.TraceClone)
+ m.Load("TraceExec", &x.TraceExec)
+ m.Load("TraceExit", &x.TraceExit)
+ m.Load("TraceFork", &x.TraceFork)
+ m.Load("TraceSeccomp", &x.TraceSeccomp)
+ m.Load("TraceVfork", &x.TraceVfork)
+ m.Load("TraceVforkDone", &x.TraceVforkDone)
+}
+
+func (x *ptraceStop) beforeSave() {}
+func (x *ptraceStop) save(m state.Map) {
+ x.beforeSave()
+ m.Save("frozen", &x.frozen)
+ m.Save("listen", &x.listen)
+}
+
+func (x *ptraceStop) afterLoad() {}
+func (x *ptraceStop) load(m state.Map) {
+ m.Load("frozen", &x.frozen)
+ m.Load("listen", &x.listen)
+}
+
+func (x *RSEQCriticalRegion) beforeSave() {}
+func (x *RSEQCriticalRegion) save(m state.Map) {
+ x.beforeSave()
+ m.Save("CriticalSection", &x.CriticalSection)
+ m.Save("Restart", &x.Restart)
+}
+
+func (x *RSEQCriticalRegion) afterLoad() {}
+func (x *RSEQCriticalRegion) load(m state.Map) {
+ m.Load("CriticalSection", &x.CriticalSection)
+ m.Load("Restart", &x.Restart)
+}
+
+func (x *sessionList) beforeSave() {}
+func (x *sessionList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *sessionList) afterLoad() {}
+func (x *sessionList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *sessionEntry) beforeSave() {}
+func (x *sessionEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *sessionEntry) afterLoad() {}
+func (x *sessionEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func (x *Session) beforeSave() {}
+func (x *Session) save(m state.Map) {
+ x.beforeSave()
+ m.Save("refs", &x.refs)
+ m.Save("leader", &x.leader)
+ m.Save("id", &x.id)
+ m.Save("processGroups", &x.processGroups)
+ m.Save("sessionEntry", &x.sessionEntry)
+}
+
+func (x *Session) afterLoad() {}
+func (x *Session) load(m state.Map) {
+ m.Load("refs", &x.refs)
+ m.Load("leader", &x.leader)
+ m.Load("id", &x.id)
+ m.Load("processGroups", &x.processGroups)
+ m.Load("sessionEntry", &x.sessionEntry)
+}
+
+func (x *ProcessGroup) beforeSave() {}
+func (x *ProcessGroup) save(m state.Map) {
+ x.beforeSave()
+ m.Save("refs", &x.refs)
+ m.Save("originator", &x.originator)
+ m.Save("id", &x.id)
+ m.Save("session", &x.session)
+ m.Save("ancestors", &x.ancestors)
+ m.Save("processGroupEntry", &x.processGroupEntry)
+}
+
+func (x *ProcessGroup) afterLoad() {}
+func (x *ProcessGroup) load(m state.Map) {
+ m.Load("refs", &x.refs)
+ m.Load("originator", &x.originator)
+ m.Load("id", &x.id)
+ m.Load("session", &x.session)
+ m.Load("ancestors", &x.ancestors)
+ m.Load("processGroupEntry", &x.processGroupEntry)
+}
+
+func (x *SignalHandlers) beforeSave() {}
+func (x *SignalHandlers) save(m state.Map) {
+ x.beforeSave()
+ m.Save("actions", &x.actions)
+}
+
+func (x *SignalHandlers) afterLoad() {}
+func (x *SignalHandlers) load(m state.Map) {
+ m.Load("actions", &x.actions)
+}
+
+func (x *SyscallTable) beforeSave() {}
+func (x *SyscallTable) save(m state.Map) {
+ x.beforeSave()
+ m.Save("OS", &x.OS)
+ m.Save("Arch", &x.Arch)
+}
+
+func (x *SyscallTable) load(m state.Map) {
+ m.LoadWait("OS", &x.OS)
+ m.LoadWait("Arch", &x.Arch)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *syslog) beforeSave() {}
+func (x *syslog) save(m state.Map) {
+ x.beforeSave()
+ m.Save("msg", &x.msg)
+}
+
+func (x *syslog) afterLoad() {}
+func (x *syslog) load(m state.Map) {
+ m.Load("msg", &x.msg)
+}
+
+func (x *Task) beforeSave() {}
+func (x *Task) save(m state.Map) {
+ x.beforeSave()
+ var ptraceTracer *Task = x.savePtraceTracer()
+ m.SaveValue("ptraceTracer", ptraceTracer)
+ var logPrefix string = x.saveLogPrefix()
+ m.SaveValue("logPrefix", logPrefix)
+ var syscallFilters []bpf.Program = x.saveSyscallFilters()
+ m.SaveValue("syscallFilters", syscallFilters)
+ m.Save("taskNode", &x.taskNode)
+ m.Save("runState", &x.runState)
+ m.Save("haveSyscallReturn", &x.haveSyscallReturn)
+ m.Save("gosched", &x.gosched)
+ m.Save("yieldCount", &x.yieldCount)
+ m.Save("pendingSignals", &x.pendingSignals)
+ m.Save("signalMask", &x.signalMask)
+ m.Save("realSignalMask", &x.realSignalMask)
+ m.Save("haveSavedSignalMask", &x.haveSavedSignalMask)
+ m.Save("savedSignalMask", &x.savedSignalMask)
+ m.Save("signalStack", &x.signalStack)
+ m.Save("groupStopPending", &x.groupStopPending)
+ m.Save("groupStopAcknowledged", &x.groupStopAcknowledged)
+ m.Save("trapStopPending", &x.trapStopPending)
+ m.Save("trapNotifyPending", &x.trapNotifyPending)
+ m.Save("stop", &x.stop)
+ m.Save("exitStatus", &x.exitStatus)
+ m.Save("syscallRestartBlock", &x.syscallRestartBlock)
+ m.Save("k", &x.k)
+ m.Save("containerID", &x.containerID)
+ m.Save("tc", &x.tc)
+ m.Save("fsc", &x.fsc)
+ m.Save("fds", &x.fds)
+ m.Save("vforkParent", &x.vforkParent)
+ m.Save("exitState", &x.exitState)
+ m.Save("exitTracerNotified", &x.exitTracerNotified)
+ m.Save("exitTracerAcked", &x.exitTracerAcked)
+ m.Save("exitParentNotified", &x.exitParentNotified)
+ m.Save("exitParentAcked", &x.exitParentAcked)
+ m.Save("ptraceTracees", &x.ptraceTracees)
+ m.Save("ptraceSeized", &x.ptraceSeized)
+ m.Save("ptraceOpts", &x.ptraceOpts)
+ m.Save("ptraceSyscallMode", &x.ptraceSyscallMode)
+ m.Save("ptraceSinglestep", &x.ptraceSinglestep)
+ m.Save("ptraceCode", &x.ptraceCode)
+ m.Save("ptraceSiginfo", &x.ptraceSiginfo)
+ m.Save("ptraceEventMsg", &x.ptraceEventMsg)
+ m.Save("ioUsage", &x.ioUsage)
+ m.Save("creds", &x.creds)
+ m.Save("utsns", &x.utsns)
+ m.Save("ipcns", &x.ipcns)
+ m.Save("abstractSockets", &x.abstractSockets)
+ m.Save("parentDeathSignal", &x.parentDeathSignal)
+ m.Save("cleartid", &x.cleartid)
+ m.Save("allowedCPUMask", &x.allowedCPUMask)
+ m.Save("cpu", &x.cpu)
+ m.Save("niceness", &x.niceness)
+ m.Save("numaPolicy", &x.numaPolicy)
+ m.Save("numaNodeMask", &x.numaNodeMask)
+ m.Save("netns", &x.netns)
+ m.Save("rseqCPUAddr", &x.rseqCPUAddr)
+ m.Save("rseqCPU", &x.rseqCPU)
+ m.Save("startTime", &x.startTime)
+}
+
+func (x *Task) load(m state.Map) {
+ m.Load("taskNode", &x.taskNode)
+ m.Load("runState", &x.runState)
+ m.Load("haveSyscallReturn", &x.haveSyscallReturn)
+ m.Load("gosched", &x.gosched)
+ m.Load("yieldCount", &x.yieldCount)
+ m.Load("pendingSignals", &x.pendingSignals)
+ m.Load("signalMask", &x.signalMask)
+ m.Load("realSignalMask", &x.realSignalMask)
+ m.Load("haveSavedSignalMask", &x.haveSavedSignalMask)
+ m.Load("savedSignalMask", &x.savedSignalMask)
+ m.Load("signalStack", &x.signalStack)
+ m.Load("groupStopPending", &x.groupStopPending)
+ m.Load("groupStopAcknowledged", &x.groupStopAcknowledged)
+ m.Load("trapStopPending", &x.trapStopPending)
+ m.Load("trapNotifyPending", &x.trapNotifyPending)
+ m.Load("stop", &x.stop)
+ m.Load("exitStatus", &x.exitStatus)
+ m.Load("syscallRestartBlock", &x.syscallRestartBlock)
+ m.Load("k", &x.k)
+ m.Load("containerID", &x.containerID)
+ m.Load("tc", &x.tc)
+ m.Load("fsc", &x.fsc)
+ m.Load("fds", &x.fds)
+ m.Load("vforkParent", &x.vforkParent)
+ m.Load("exitState", &x.exitState)
+ m.Load("exitTracerNotified", &x.exitTracerNotified)
+ m.Load("exitTracerAcked", &x.exitTracerAcked)
+ m.Load("exitParentNotified", &x.exitParentNotified)
+ m.Load("exitParentAcked", &x.exitParentAcked)
+ m.Load("ptraceTracees", &x.ptraceTracees)
+ m.Load("ptraceSeized", &x.ptraceSeized)
+ m.Load("ptraceOpts", &x.ptraceOpts)
+ m.Load("ptraceSyscallMode", &x.ptraceSyscallMode)
+ m.Load("ptraceSinglestep", &x.ptraceSinglestep)
+ m.Load("ptraceCode", &x.ptraceCode)
+ m.Load("ptraceSiginfo", &x.ptraceSiginfo)
+ m.Load("ptraceEventMsg", &x.ptraceEventMsg)
+ m.Load("ioUsage", &x.ioUsage)
+ m.Load("creds", &x.creds)
+ m.Load("utsns", &x.utsns)
+ m.Load("ipcns", &x.ipcns)
+ m.Load("abstractSockets", &x.abstractSockets)
+ m.Load("parentDeathSignal", &x.parentDeathSignal)
+ m.Load("cleartid", &x.cleartid)
+ m.Load("allowedCPUMask", &x.allowedCPUMask)
+ m.Load("cpu", &x.cpu)
+ m.Load("niceness", &x.niceness)
+ m.Load("numaPolicy", &x.numaPolicy)
+ m.Load("numaNodeMask", &x.numaNodeMask)
+ m.Load("netns", &x.netns)
+ m.Load("rseqCPUAddr", &x.rseqCPUAddr)
+ m.Load("rseqCPU", &x.rseqCPU)
+ m.Load("startTime", &x.startTime)
+ m.LoadValue("ptraceTracer", new(*Task), func(y interface{}) { x.loadPtraceTracer(y.(*Task)) })
+ m.LoadValue("logPrefix", new(string), func(y interface{}) { x.loadLogPrefix(y.(string)) })
+ m.LoadValue("syscallFilters", new([]bpf.Program), func(y interface{}) { x.loadSyscallFilters(y.([]bpf.Program)) })
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *runSyscallAfterPtraceEventClone) beforeSave() {}
+func (x *runSyscallAfterPtraceEventClone) save(m state.Map) {
+ x.beforeSave()
+ m.Save("vforkChild", &x.vforkChild)
+ m.Save("vforkChildTID", &x.vforkChildTID)
+}
+
+func (x *runSyscallAfterPtraceEventClone) afterLoad() {}
+func (x *runSyscallAfterPtraceEventClone) load(m state.Map) {
+ m.Load("vforkChild", &x.vforkChild)
+ m.Load("vforkChildTID", &x.vforkChildTID)
+}
+
+func (x *runSyscallAfterVforkStop) beforeSave() {}
+func (x *runSyscallAfterVforkStop) save(m state.Map) {
+ x.beforeSave()
+ m.Save("childTID", &x.childTID)
+}
+
+func (x *runSyscallAfterVforkStop) afterLoad() {}
+func (x *runSyscallAfterVforkStop) load(m state.Map) {
+ m.Load("childTID", &x.childTID)
+}
+
+func (x *vforkStop) beforeSave() {}
+func (x *vforkStop) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *vforkStop) afterLoad() {}
+func (x *vforkStop) load(m state.Map) {
+}
+
+func (x *TaskContext) beforeSave() {}
+func (x *TaskContext) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Name", &x.Name)
+ m.Save("Arch", &x.Arch)
+ m.Save("MemoryManager", &x.MemoryManager)
+ m.Save("fu", &x.fu)
+ m.Save("st", &x.st)
+}
+
+func (x *TaskContext) afterLoad() {}
+func (x *TaskContext) load(m state.Map) {
+ m.Load("Name", &x.Name)
+ m.Load("Arch", &x.Arch)
+ m.Load("MemoryManager", &x.MemoryManager)
+ m.Load("fu", &x.fu)
+ m.Load("st", &x.st)
+}
+
+func (x *execStop) beforeSave() {}
+func (x *execStop) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *execStop) afterLoad() {}
+func (x *execStop) load(m state.Map) {
+}
+
+func (x *runSyscallAfterExecStop) beforeSave() {}
+func (x *runSyscallAfterExecStop) save(m state.Map) {
+ x.beforeSave()
+ m.Save("tc", &x.tc)
+}
+
+func (x *runSyscallAfterExecStop) afterLoad() {}
+func (x *runSyscallAfterExecStop) load(m state.Map) {
+ m.Load("tc", &x.tc)
+}
+
+func (x *ExitStatus) beforeSave() {}
+func (x *ExitStatus) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Code", &x.Code)
+ m.Save("Signo", &x.Signo)
+}
+
+func (x *ExitStatus) afterLoad() {}
+func (x *ExitStatus) load(m state.Map) {
+ m.Load("Code", &x.Code)
+ m.Load("Signo", &x.Signo)
+}
+
+func (x *runExit) beforeSave() {}
+func (x *runExit) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runExit) afterLoad() {}
+func (x *runExit) load(m state.Map) {
+}
+
+func (x *runExitMain) beforeSave() {}
+func (x *runExitMain) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runExitMain) afterLoad() {}
+func (x *runExitMain) load(m state.Map) {
+}
+
+func (x *runExitNotify) beforeSave() {}
+func (x *runExitNotify) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runExitNotify) afterLoad() {}
+func (x *runExitNotify) load(m state.Map) {
+}
+
+func (x *taskList) beforeSave() {}
+func (x *taskList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *taskList) afterLoad() {}
+func (x *taskList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *taskEntry) beforeSave() {}
+func (x *taskEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *taskEntry) afterLoad() {}
+func (x *taskEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func (x *runApp) beforeSave() {}
+func (x *runApp) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runApp) afterLoad() {}
+func (x *runApp) load(m state.Map) {
+}
+
+func (x *TaskGoroutineSchedInfo) beforeSave() {}
+func (x *TaskGoroutineSchedInfo) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Timestamp", &x.Timestamp)
+ m.Save("State", &x.State)
+ m.Save("UserTicks", &x.UserTicks)
+ m.Save("SysTicks", &x.SysTicks)
+}
+
+func (x *TaskGoroutineSchedInfo) afterLoad() {}
+func (x *TaskGoroutineSchedInfo) load(m state.Map) {
+ m.Load("Timestamp", &x.Timestamp)
+ m.Load("State", &x.State)
+ m.Load("UserTicks", &x.UserTicks)
+ m.Load("SysTicks", &x.SysTicks)
+}
+
+func (x *taskClock) beforeSave() {}
+func (x *taskClock) save(m state.Map) {
+ x.beforeSave()
+ m.Save("t", &x.t)
+ m.Save("includeSys", &x.includeSys)
+}
+
+func (x *taskClock) afterLoad() {}
+func (x *taskClock) load(m state.Map) {
+ m.Load("t", &x.t)
+ m.Load("includeSys", &x.includeSys)
+}
+
+func (x *tgClock) beforeSave() {}
+func (x *tgClock) save(m state.Map) {
+ x.beforeSave()
+ m.Save("tg", &x.tg)
+ m.Save("includeSys", &x.includeSys)
+}
+
+func (x *tgClock) afterLoad() {}
+func (x *tgClock) load(m state.Map) {
+ m.Load("tg", &x.tg)
+ m.Load("includeSys", &x.includeSys)
+}
+
+func (x *groupStop) beforeSave() {}
+func (x *groupStop) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *groupStop) afterLoad() {}
+func (x *groupStop) load(m state.Map) {
+}
+
+func (x *runInterrupt) beforeSave() {}
+func (x *runInterrupt) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runInterrupt) afterLoad() {}
+func (x *runInterrupt) load(m state.Map) {
+}
+
+func (x *runInterruptAfterSignalDeliveryStop) beforeSave() {}
+func (x *runInterruptAfterSignalDeliveryStop) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runInterruptAfterSignalDeliveryStop) afterLoad() {}
+func (x *runInterruptAfterSignalDeliveryStop) load(m state.Map) {
+}
+
+func (x *runSyscallAfterSyscallEnterStop) beforeSave() {}
+func (x *runSyscallAfterSyscallEnterStop) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runSyscallAfterSyscallEnterStop) afterLoad() {}
+func (x *runSyscallAfterSyscallEnterStop) load(m state.Map) {
+}
+
+func (x *runSyscallAfterSysemuStop) beforeSave() {}
+func (x *runSyscallAfterSysemuStop) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runSyscallAfterSysemuStop) afterLoad() {}
+func (x *runSyscallAfterSysemuStop) load(m state.Map) {
+}
+
+func (x *runSyscallReinvoke) beforeSave() {}
+func (x *runSyscallReinvoke) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runSyscallReinvoke) afterLoad() {}
+func (x *runSyscallReinvoke) load(m state.Map) {
+}
+
+func (x *runSyscallExit) beforeSave() {}
+func (x *runSyscallExit) save(m state.Map) {
+ x.beforeSave()
+}
+
+func (x *runSyscallExit) afterLoad() {}
+func (x *runSyscallExit) load(m state.Map) {
+}
+
+func (x *ThreadGroup) beforeSave() {}
+func (x *ThreadGroup) save(m state.Map) {
+ x.beforeSave()
+ var rscr *RSEQCriticalRegion = x.saveRscr()
+ m.SaveValue("rscr", rscr)
+ m.Save("threadGroupNode", &x.threadGroupNode)
+ m.Save("signalHandlers", &x.signalHandlers)
+ m.Save("pendingSignals", &x.pendingSignals)
+ m.Save("groupStopDequeued", &x.groupStopDequeued)
+ m.Save("groupStopSignal", &x.groupStopSignal)
+ m.Save("groupStopPendingCount", &x.groupStopPendingCount)
+ m.Save("groupStopComplete", &x.groupStopComplete)
+ m.Save("groupStopWaitable", &x.groupStopWaitable)
+ m.Save("groupContNotify", &x.groupContNotify)
+ m.Save("groupContInterrupted", &x.groupContInterrupted)
+ m.Save("groupContWaitable", &x.groupContWaitable)
+ m.Save("exiting", &x.exiting)
+ m.Save("exitStatus", &x.exitStatus)
+ m.Save("terminationSignal", &x.terminationSignal)
+ m.Save("itimerRealTimer", &x.itimerRealTimer)
+ m.Save("itimerVirtSetting", &x.itimerVirtSetting)
+ m.Save("itimerProfSetting", &x.itimerProfSetting)
+ m.Save("rlimitCPUSoftSetting", &x.rlimitCPUSoftSetting)
+ m.Save("cpuTimersEnabled", &x.cpuTimersEnabled)
+ m.Save("timers", &x.timers)
+ m.Save("nextTimerID", &x.nextTimerID)
+ m.Save("exitedCPUStats", &x.exitedCPUStats)
+ m.Save("childCPUStats", &x.childCPUStats)
+ m.Save("ioUsage", &x.ioUsage)
+ m.Save("maxRSS", &x.maxRSS)
+ m.Save("childMaxRSS", &x.childMaxRSS)
+ m.Save("limits", &x.limits)
+ m.Save("processGroup", &x.processGroup)
+ m.Save("execed", &x.execed)
+}
+
+func (x *ThreadGroup) afterLoad() {}
+func (x *ThreadGroup) load(m state.Map) {
+ m.Load("threadGroupNode", &x.threadGroupNode)
+ m.Load("signalHandlers", &x.signalHandlers)
+ m.Load("pendingSignals", &x.pendingSignals)
+ m.Load("groupStopDequeued", &x.groupStopDequeued)
+ m.Load("groupStopSignal", &x.groupStopSignal)
+ m.Load("groupStopPendingCount", &x.groupStopPendingCount)
+ m.Load("groupStopComplete", &x.groupStopComplete)
+ m.Load("groupStopWaitable", &x.groupStopWaitable)
+ m.Load("groupContNotify", &x.groupContNotify)
+ m.Load("groupContInterrupted", &x.groupContInterrupted)
+ m.Load("groupContWaitable", &x.groupContWaitable)
+ m.Load("exiting", &x.exiting)
+ m.Load("exitStatus", &x.exitStatus)
+ m.Load("terminationSignal", &x.terminationSignal)
+ m.Load("itimerRealTimer", &x.itimerRealTimer)
+ m.Load("itimerVirtSetting", &x.itimerVirtSetting)
+ m.Load("itimerProfSetting", &x.itimerProfSetting)
+ m.Load("rlimitCPUSoftSetting", &x.rlimitCPUSoftSetting)
+ m.Load("cpuTimersEnabled", &x.cpuTimersEnabled)
+ m.Load("timers", &x.timers)
+ m.Load("nextTimerID", &x.nextTimerID)
+ m.Load("exitedCPUStats", &x.exitedCPUStats)
+ m.Load("childCPUStats", &x.childCPUStats)
+ m.Load("ioUsage", &x.ioUsage)
+ m.Load("maxRSS", &x.maxRSS)
+ m.Load("childMaxRSS", &x.childMaxRSS)
+ m.Load("limits", &x.limits)
+ m.Load("processGroup", &x.processGroup)
+ m.Load("execed", &x.execed)
+ m.LoadValue("rscr", new(*RSEQCriticalRegion), func(y interface{}) { x.loadRscr(y.(*RSEQCriticalRegion)) })
+}
+
+func (x *itimerRealListener) beforeSave() {}
+func (x *itimerRealListener) save(m state.Map) {
+ x.beforeSave()
+ m.Save("tg", &x.tg)
+}
+
+func (x *itimerRealListener) afterLoad() {}
+func (x *itimerRealListener) load(m state.Map) {
+ m.Load("tg", &x.tg)
+}
+
+func (x *TaskSet) beforeSave() {}
+func (x *TaskSet) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Root", &x.Root)
+ m.Save("sessions", &x.sessions)
+}
+
+func (x *TaskSet) afterLoad() {}
+func (x *TaskSet) load(m state.Map) {
+ m.Load("Root", &x.Root)
+ m.Load("sessions", &x.sessions)
+}
+
+func (x *PIDNamespace) beforeSave() {}
+func (x *PIDNamespace) save(m state.Map) {
+ x.beforeSave()
+ m.Save("owner", &x.owner)
+ m.Save("parent", &x.parent)
+ m.Save("userns", &x.userns)
+ m.Save("last", &x.last)
+ m.Save("tasks", &x.tasks)
+ m.Save("tids", &x.tids)
+ m.Save("tgids", &x.tgids)
+ m.Save("sessions", &x.sessions)
+ m.Save("sids", &x.sids)
+ m.Save("processGroups", &x.processGroups)
+ m.Save("pgids", &x.pgids)
+ m.Save("exiting", &x.exiting)
+}
+
+func (x *PIDNamespace) afterLoad() {}
+func (x *PIDNamespace) load(m state.Map) {
+ m.Load("owner", &x.owner)
+ m.Load("parent", &x.parent)
+ m.Load("userns", &x.userns)
+ m.Load("last", &x.last)
+ m.Load("tasks", &x.tasks)
+ m.Load("tids", &x.tids)
+ m.Load("tgids", &x.tgids)
+ m.Load("sessions", &x.sessions)
+ m.Load("sids", &x.sids)
+ m.Load("processGroups", &x.processGroups)
+ m.Load("pgids", &x.pgids)
+ m.Load("exiting", &x.exiting)
+}
+
+func (x *threadGroupNode) beforeSave() {}
+func (x *threadGroupNode) save(m state.Map) {
+ x.beforeSave()
+ m.Save("pidns", &x.pidns)
+ m.Save("leader", &x.leader)
+ m.Save("execing", &x.execing)
+ m.Save("tasks", &x.tasks)
+ m.Save("tasksCount", &x.tasksCount)
+ m.Save("liveTasks", &x.liveTasks)
+ m.Save("activeTasks", &x.activeTasks)
+}
+
+func (x *threadGroupNode) afterLoad() {}
+func (x *threadGroupNode) load(m state.Map) {
+ m.Load("pidns", &x.pidns)
+ m.Load("leader", &x.leader)
+ m.Load("execing", &x.execing)
+ m.Load("tasks", &x.tasks)
+ m.Load("tasksCount", &x.tasksCount)
+ m.Load("liveTasks", &x.liveTasks)
+ m.Load("activeTasks", &x.activeTasks)
+}
+
+func (x *taskNode) beforeSave() {}
+func (x *taskNode) save(m state.Map) {
+ x.beforeSave()
+ m.Save("tg", &x.tg)
+ m.Save("taskEntry", &x.taskEntry)
+ m.Save("parent", &x.parent)
+ m.Save("children", &x.children)
+ m.Save("childPIDNamespace", &x.childPIDNamespace)
+}
+
+func (x *taskNode) afterLoad() {}
+func (x *taskNode) load(m state.Map) {
+ m.LoadWait("tg", &x.tg)
+ m.Load("taskEntry", &x.taskEntry)
+ m.Load("parent", &x.parent)
+ m.Load("children", &x.children)
+ m.Load("childPIDNamespace", &x.childPIDNamespace)
+}
+
+func (x *Timekeeper) save(m state.Map) {
+ x.beforeSave()
+ m.Save("bootTime", &x.bootTime)
+ m.Save("saveMonotonic", &x.saveMonotonic)
+ m.Save("saveRealtime", &x.saveRealtime)
+ m.Save("params", &x.params)
+}
+
+func (x *Timekeeper) load(m state.Map) {
+ m.Load("bootTime", &x.bootTime)
+ m.Load("saveMonotonic", &x.saveMonotonic)
+ m.Load("saveRealtime", &x.saveRealtime)
+ m.Load("params", &x.params)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *timekeeperClock) beforeSave() {}
+func (x *timekeeperClock) save(m state.Map) {
+ x.beforeSave()
+ m.Save("tk", &x.tk)
+ m.Save("c", &x.c)
+}
+
+func (x *timekeeperClock) afterLoad() {}
+func (x *timekeeperClock) load(m state.Map) {
+ m.Load("tk", &x.tk)
+ m.Load("c", &x.c)
+}
+
+func (x *UTSNamespace) beforeSave() {}
+func (x *UTSNamespace) save(m state.Map) {
+ x.beforeSave()
+ m.Save("hostName", &x.hostName)
+ m.Save("domainName", &x.domainName)
+ m.Save("userns", &x.userns)
+}
+
+func (x *UTSNamespace) afterLoad() {}
+func (x *UTSNamespace) load(m state.Map) {
+ m.Load("hostName", &x.hostName)
+ m.Load("domainName", &x.domainName)
+ m.Load("userns", &x.userns)
+}
+
+func (x *VDSOParamPage) beforeSave() {}
+func (x *VDSOParamPage) save(m state.Map) {
+ x.beforeSave()
+ m.Save("mfp", &x.mfp)
+ m.Save("fr", &x.fr)
+ m.Save("seq", &x.seq)
+}
+
+func (x *VDSOParamPage) afterLoad() {}
+func (x *VDSOParamPage) load(m state.Map) {
+ m.Load("mfp", &x.mfp)
+ m.Load("fr", &x.fr)
+ m.Load("seq", &x.seq)
+}
+
+func init() {
+ state.Register("kernel.abstractEndpoint", (*abstractEndpoint)(nil), state.Fns{Save: (*abstractEndpoint).save, Load: (*abstractEndpoint).load})
+ state.Register("kernel.AbstractSocketNamespace", (*AbstractSocketNamespace)(nil), state.Fns{Save: (*AbstractSocketNamespace).save, Load: (*AbstractSocketNamespace).load})
+ state.Register("kernel.FDFlags", (*FDFlags)(nil), state.Fns{Save: (*FDFlags).save, Load: (*FDFlags).load})
+ state.Register("kernel.descriptor", (*descriptor)(nil), state.Fns{Save: (*descriptor).save, Load: (*descriptor).load})
+ state.Register("kernel.FDMap", (*FDMap)(nil), state.Fns{Save: (*FDMap).save, Load: (*FDMap).load})
+ state.Register("kernel.FSContext", (*FSContext)(nil), state.Fns{Save: (*FSContext).save, Load: (*FSContext).load})
+ state.Register("kernel.IPCNamespace", (*IPCNamespace)(nil), state.Fns{Save: (*IPCNamespace).save, Load: (*IPCNamespace).load})
+ state.Register("kernel.Kernel", (*Kernel)(nil), state.Fns{Save: (*Kernel).save, Load: (*Kernel).load})
+ state.Register("kernel.socketEntry", (*socketEntry)(nil), state.Fns{Save: (*socketEntry).save, Load: (*socketEntry).load})
+ state.Register("kernel.pendingSignals", (*pendingSignals)(nil), state.Fns{Save: (*pendingSignals).save, Load: (*pendingSignals).load})
+ state.Register("kernel.pendingSignalQueue", (*pendingSignalQueue)(nil), state.Fns{Save: (*pendingSignalQueue).save, Load: (*pendingSignalQueue).load})
+ state.Register("kernel.pendingSignal", (*pendingSignal)(nil), state.Fns{Save: (*pendingSignal).save, Load: (*pendingSignal).load})
+ state.Register("kernel.pendingSignalList", (*pendingSignalList)(nil), state.Fns{Save: (*pendingSignalList).save, Load: (*pendingSignalList).load})
+ state.Register("kernel.pendingSignalEntry", (*pendingSignalEntry)(nil), state.Fns{Save: (*pendingSignalEntry).save, Load: (*pendingSignalEntry).load})
+ state.Register("kernel.savedPendingSignal", (*savedPendingSignal)(nil), state.Fns{Save: (*savedPendingSignal).save, Load: (*savedPendingSignal).load})
+ state.Register("kernel.IntervalTimer", (*IntervalTimer)(nil), state.Fns{Save: (*IntervalTimer).save, Load: (*IntervalTimer).load})
+ state.Register("kernel.processGroupList", (*processGroupList)(nil), state.Fns{Save: (*processGroupList).save, Load: (*processGroupList).load})
+ state.Register("kernel.processGroupEntry", (*processGroupEntry)(nil), state.Fns{Save: (*processGroupEntry).save, Load: (*processGroupEntry).load})
+ state.Register("kernel.ptraceOptions", (*ptraceOptions)(nil), state.Fns{Save: (*ptraceOptions).save, Load: (*ptraceOptions).load})
+ state.Register("kernel.ptraceStop", (*ptraceStop)(nil), state.Fns{Save: (*ptraceStop).save, Load: (*ptraceStop).load})
+ state.Register("kernel.RSEQCriticalRegion", (*RSEQCriticalRegion)(nil), state.Fns{Save: (*RSEQCriticalRegion).save, Load: (*RSEQCriticalRegion).load})
+ state.Register("kernel.sessionList", (*sessionList)(nil), state.Fns{Save: (*sessionList).save, Load: (*sessionList).load})
+ state.Register("kernel.sessionEntry", (*sessionEntry)(nil), state.Fns{Save: (*sessionEntry).save, Load: (*sessionEntry).load})
+ state.Register("kernel.Session", (*Session)(nil), state.Fns{Save: (*Session).save, Load: (*Session).load})
+ state.Register("kernel.ProcessGroup", (*ProcessGroup)(nil), state.Fns{Save: (*ProcessGroup).save, Load: (*ProcessGroup).load})
+ state.Register("kernel.SignalHandlers", (*SignalHandlers)(nil), state.Fns{Save: (*SignalHandlers).save, Load: (*SignalHandlers).load})
+ state.Register("kernel.SyscallTable", (*SyscallTable)(nil), state.Fns{Save: (*SyscallTable).save, Load: (*SyscallTable).load})
+ state.Register("kernel.syslog", (*syslog)(nil), state.Fns{Save: (*syslog).save, Load: (*syslog).load})
+ state.Register("kernel.Task", (*Task)(nil), state.Fns{Save: (*Task).save, Load: (*Task).load})
+ state.Register("kernel.runSyscallAfterPtraceEventClone", (*runSyscallAfterPtraceEventClone)(nil), state.Fns{Save: (*runSyscallAfterPtraceEventClone).save, Load: (*runSyscallAfterPtraceEventClone).load})
+ state.Register("kernel.runSyscallAfterVforkStop", (*runSyscallAfterVforkStop)(nil), state.Fns{Save: (*runSyscallAfterVforkStop).save, Load: (*runSyscallAfterVforkStop).load})
+ state.Register("kernel.vforkStop", (*vforkStop)(nil), state.Fns{Save: (*vforkStop).save, Load: (*vforkStop).load})
+ state.Register("kernel.TaskContext", (*TaskContext)(nil), state.Fns{Save: (*TaskContext).save, Load: (*TaskContext).load})
+ state.Register("kernel.execStop", (*execStop)(nil), state.Fns{Save: (*execStop).save, Load: (*execStop).load})
+ state.Register("kernel.runSyscallAfterExecStop", (*runSyscallAfterExecStop)(nil), state.Fns{Save: (*runSyscallAfterExecStop).save, Load: (*runSyscallAfterExecStop).load})
+ state.Register("kernel.ExitStatus", (*ExitStatus)(nil), state.Fns{Save: (*ExitStatus).save, Load: (*ExitStatus).load})
+ state.Register("kernel.runExit", (*runExit)(nil), state.Fns{Save: (*runExit).save, Load: (*runExit).load})
+ state.Register("kernel.runExitMain", (*runExitMain)(nil), state.Fns{Save: (*runExitMain).save, Load: (*runExitMain).load})
+ state.Register("kernel.runExitNotify", (*runExitNotify)(nil), state.Fns{Save: (*runExitNotify).save, Load: (*runExitNotify).load})
+ state.Register("kernel.taskList", (*taskList)(nil), state.Fns{Save: (*taskList).save, Load: (*taskList).load})
+ state.Register("kernel.taskEntry", (*taskEntry)(nil), state.Fns{Save: (*taskEntry).save, Load: (*taskEntry).load})
+ state.Register("kernel.runApp", (*runApp)(nil), state.Fns{Save: (*runApp).save, Load: (*runApp).load})
+ state.Register("kernel.TaskGoroutineSchedInfo", (*TaskGoroutineSchedInfo)(nil), state.Fns{Save: (*TaskGoroutineSchedInfo).save, Load: (*TaskGoroutineSchedInfo).load})
+ state.Register("kernel.taskClock", (*taskClock)(nil), state.Fns{Save: (*taskClock).save, Load: (*taskClock).load})
+ state.Register("kernel.tgClock", (*tgClock)(nil), state.Fns{Save: (*tgClock).save, Load: (*tgClock).load})
+ state.Register("kernel.groupStop", (*groupStop)(nil), state.Fns{Save: (*groupStop).save, Load: (*groupStop).load})
+ state.Register("kernel.runInterrupt", (*runInterrupt)(nil), state.Fns{Save: (*runInterrupt).save, Load: (*runInterrupt).load})
+ state.Register("kernel.runInterruptAfterSignalDeliveryStop", (*runInterruptAfterSignalDeliveryStop)(nil), state.Fns{Save: (*runInterruptAfterSignalDeliveryStop).save, Load: (*runInterruptAfterSignalDeliveryStop).load})
+ state.Register("kernel.runSyscallAfterSyscallEnterStop", (*runSyscallAfterSyscallEnterStop)(nil), state.Fns{Save: (*runSyscallAfterSyscallEnterStop).save, Load: (*runSyscallAfterSyscallEnterStop).load})
+ state.Register("kernel.runSyscallAfterSysemuStop", (*runSyscallAfterSysemuStop)(nil), state.Fns{Save: (*runSyscallAfterSysemuStop).save, Load: (*runSyscallAfterSysemuStop).load})
+ state.Register("kernel.runSyscallReinvoke", (*runSyscallReinvoke)(nil), state.Fns{Save: (*runSyscallReinvoke).save, Load: (*runSyscallReinvoke).load})
+ state.Register("kernel.runSyscallExit", (*runSyscallExit)(nil), state.Fns{Save: (*runSyscallExit).save, Load: (*runSyscallExit).load})
+ state.Register("kernel.ThreadGroup", (*ThreadGroup)(nil), state.Fns{Save: (*ThreadGroup).save, Load: (*ThreadGroup).load})
+ state.Register("kernel.itimerRealListener", (*itimerRealListener)(nil), state.Fns{Save: (*itimerRealListener).save, Load: (*itimerRealListener).load})
+ state.Register("kernel.TaskSet", (*TaskSet)(nil), state.Fns{Save: (*TaskSet).save, Load: (*TaskSet).load})
+ state.Register("kernel.PIDNamespace", (*PIDNamespace)(nil), state.Fns{Save: (*PIDNamespace).save, Load: (*PIDNamespace).load})
+ state.Register("kernel.threadGroupNode", (*threadGroupNode)(nil), state.Fns{Save: (*threadGroupNode).save, Load: (*threadGroupNode).load})
+ state.Register("kernel.taskNode", (*taskNode)(nil), state.Fns{Save: (*taskNode).save, Load: (*taskNode).load})
+ state.Register("kernel.Timekeeper", (*Timekeeper)(nil), state.Fns{Save: (*Timekeeper).save, Load: (*Timekeeper).load})
+ state.Register("kernel.timekeeperClock", (*timekeeperClock)(nil), state.Fns{Save: (*timekeeperClock).save, Load: (*timekeeperClock).load})
+ state.Register("kernel.UTSNamespace", (*UTSNamespace)(nil), state.Fns{Save: (*UTSNamespace).save, Load: (*UTSNamespace).load})
+ state.Register("kernel.VDSOParamPage", (*VDSOParamPage)(nil), state.Fns{Save: (*VDSOParamPage).save, Load: (*VDSOParamPage).load})
+}
diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go
new file mode 100644
index 000000000..c93f6598a
--- /dev/null
+++ b/pkg/sentry/kernel/pending_signals.go
@@ -0,0 +1,142 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+const (
+ // stdSignalCap is the maximum number of instances of a given standard
+ // signal that may be pending. ("[If] multiple instances of a standard
+ // signal are delivered while that signal is currently blocked, then only
+ // one instance is queued.") - signal(7)
+ stdSignalCap = 1
+
+ // rtSignalCap is the maximum number of instances of a given realtime
+ // signal that may be pending.
+ //
+ // TODO(igudger): In Linux, the minimum signal queue size is
+ // RLIMIT_SIGPENDING, which is by default max_threads/2.
+ rtSignalCap = 32
+)
+
+// pendingSignals holds a collection of pending signals. The zero value of
+// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe;
+// users must provide synchronization.
+//
+// +stateify savable
+type pendingSignals struct {
+ // signals contains all pending signals.
+ //
+ // Note that signals is zero-indexed, but signal 1 is the first valid
+ // signal, so signals[0] contains signals with signo 1 etc. This offset is
+ // usually handled by using Signal.index().
+ signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"`
+
+ // Bit i of pendingSet is set iff there is at least one signal with signo
+ // i+1 pending.
+ pendingSet linux.SignalSet `state:"manual"`
+}
+
+// pendingSignalQueue holds a pendingSignalList for a single signal number.
+//
+// +stateify savable
+type pendingSignalQueue struct {
+ pendingSignalList
+ length int
+}
+
+// +stateify savable
+type pendingSignal struct {
+ // pendingSignalEntry links into a pendingSignalList.
+ pendingSignalEntry
+ *arch.SignalInfo
+
+ // If timer is not nil, it is the IntervalTimer which sent this signal.
+ timer *IntervalTimer
+}
+
+// enqueue enqueues the given signal. enqueue returns true on success and false
+// on failure (if the given signal's queue is full).
+//
+// Preconditions: info represents a valid signal.
+func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bool {
+ sig := linux.Signal(info.Signo)
+ q := &p.signals[sig.Index()]
+ if sig.IsStandard() {
+ if q.length >= stdSignalCap {
+ return false
+ }
+ } else if q.length >= rtSignalCap {
+ return false
+ }
+ q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer})
+ q.length++
+ p.pendingSet |= linux.SignalSetOf(sig)
+ return true
+}
+
+// dequeue dequeues and returns any pending signal not masked by mask. If no
+// unmasked signals are pending, dequeue returns nil.
+func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo {
+ // "Real-time signals are delivered in a guaranteed order. Multiple
+ // real-time signals of the same type are delivered in the order they were
+ // sent. If different real-time signals are sent to a process, they are
+ // delivered starting with the lowest-numbered signal. (I.e., low-numbered
+ // signals have highest priority.) By contrast, if multiple standard
+ // signals are pending for a process, the order in which they are delivered
+ // is unspecified. If both standard and real-time signals are pending for a
+ // process, POSIX leaves it unspecified which is delivered first. Linux,
+ // like many other implementations, gives priority to standard signals in
+ // this case." - signal(7)
+ lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask))
+ if lowestPendingUnblockedBit >= linux.SignalMaximum {
+ return nil
+ }
+ return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1))
+}
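+
+// For illustration, a minimal sketch of the capacity and ordering rules
+// described above (assuming arch.SignalInfo values with only Signo set):
+//
+//	var p pendingSignals
+//	p.enqueue(&arch.SignalInfo{Signo: int32(linux.SIGUSR1)}, nil) // standard signal 10: queued.
+//	p.enqueue(&arch.SignalInfo{Signo: int32(linux.SIGUSR1)}, nil) // false: stdSignalCap already reached.
+//	p.enqueue(&arch.SignalInfo{Signo: 40}, nil)                   // a realtime signal: queued.
+//	first := p.dequeue(0)  // SIGUSR1: the lowest pending signo, so standard signals win.
+//	second := p.dequeue(0) // the realtime signal (40).
+//	third := p.dequeue(0)  // nil: nothing is pending anymore.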
+
+func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo {
+ q := &p.signals[sig.Index()]
+ ps := q.pendingSignalList.Front()
+ if ps == nil {
+ return nil
+ }
+ q.pendingSignalList.Remove(ps)
+ q.length--
+ if q.length == 0 {
+ p.pendingSet &^= linux.SignalSetOf(sig)
+ }
+ if ps.timer != nil {
+ ps.timer.updateDequeuedSignalLocked(ps.SignalInfo)
+ }
+ return ps.SignalInfo
+}
+
+// discardSpecific causes all pending signals with number sig to be discarded.
+func (p *pendingSignals) discardSpecific(sig linux.Signal) {
+ q := &p.signals[sig.Index()]
+ for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() {
+ if ps.timer != nil {
+ ps.timer.signalRejectedLocked()
+ }
+ }
+ q.pendingSignalList.Reset()
+ q.length = 0
+ p.pendingSet &^= linux.SignalSetOf(sig)
+}
diff --git a/pkg/sentry/kernel/pending_signals_list.go b/pkg/sentry/kernel/pending_signals_list.go
new file mode 100755
index 000000000..a3499371a
--- /dev/null
+++ b/pkg/sentry/kernel/pending_signals_list.go
@@ -0,0 +1,173 @@
+package kernel
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type pendingSignalElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (pendingSignalElementMapper) linkerFor(elem *pendingSignal) *pendingSignal { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type pendingSignalList struct {
+ head *pendingSignal
+ tail *pendingSignal
+}
+
+// Reset resets list l to the empty state.
+func (l *pendingSignalList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *pendingSignalList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *pendingSignalList) Front() *pendingSignal {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *pendingSignalList) Back() *pendingSignal {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *pendingSignalList) PushFront(e *pendingSignal) {
+ pendingSignalElementMapper{}.linkerFor(e).SetNext(l.head)
+ pendingSignalElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ pendingSignalElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *pendingSignalList) PushBack(e *pendingSignal) {
+ pendingSignalElementMapper{}.linkerFor(e).SetNext(nil)
+ pendingSignalElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ pendingSignalElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *pendingSignalList) PushBackList(m *pendingSignalList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ pendingSignalElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ pendingSignalElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *pendingSignalList) InsertAfter(b, e *pendingSignal) {
+ a := pendingSignalElementMapper{}.linkerFor(b).Next()
+ pendingSignalElementMapper{}.linkerFor(e).SetNext(a)
+ pendingSignalElementMapper{}.linkerFor(e).SetPrev(b)
+ pendingSignalElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ pendingSignalElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *pendingSignalList) InsertBefore(a, e *pendingSignal) {
+ b := pendingSignalElementMapper{}.linkerFor(a).Prev()
+ pendingSignalElementMapper{}.linkerFor(e).SetNext(a)
+ pendingSignalElementMapper{}.linkerFor(e).SetPrev(b)
+ pendingSignalElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ pendingSignalElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *pendingSignalList) Remove(e *pendingSignal) {
+ prev := pendingSignalElementMapper{}.linkerFor(e).Prev()
+ next := pendingSignalElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ pendingSignalElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ pendingSignalElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type pendingSignalEntry struct {
+ next *pendingSignal
+ prev *pendingSignal
+}
+
+// Next returns the entry that follows e in the list.
+func (e *pendingSignalEntry) Next() *pendingSignal {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *pendingSignalEntry) Prev() *pendingSignal {
+ return e.prev
+}
+
+// SetNext assigns 'elem' as the entry that follows e in the list.
+func (e *pendingSignalEntry) SetNext(elem *pendingSignal) {
+ e.next = elem
+}
+
+// SetPrev assigns 'elem' as the entry that precedes e in the list.
+func (e *pendingSignalEntry) SetPrev(elem *pendingSignal) {
+ e.prev = elem
+}
diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go
new file mode 100644
index 000000000..2c902c7e3
--- /dev/null
+++ b/pkg/sentry/kernel/pending_signals_state.go
@@ -0,0 +1,46 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+// +stateify savable
+type savedPendingSignal struct {
+ si *arch.SignalInfo
+ timer *IntervalTimer
+}
+
+// saveSignals is invoked by stateify.
+func (p *pendingSignals) saveSignals() []savedPendingSignal {
+ var pending []savedPendingSignal
+ for _, q := range p.signals {
+ for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() {
+ pending = append(pending, savedPendingSignal{
+ si: ps.SignalInfo,
+ timer: ps.timer,
+ })
+ }
+ }
+ return pending
+}
+
+// loadSignals is invoked by stateify.
+func (p *pendingSignals) loadSignals(pending []savedPendingSignal) {
+ for _, sps := range pending {
+ p.enqueue(sps.si, sps.timer)
+ }
+}
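+
+// Together these hooks flatten the per-signo queues into a single slice on
+// save and rebuild them through enqueue on load, so ordering within each
+// signal number survives a save/restore cycle. A minimal sketch (assuming a
+// populated pendingSignals value p):
+//
+//	saved := p.saveSignals()    // []savedPendingSignal, in queue order.
+//	var restored pendingSignals
+//	restored.loadSignals(saved) // re-enqueues each saved signal in order.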
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
new file mode 100644
index 000000000..4360dc44f
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/buffer.go
@@ -0,0 +1,90 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+)
+
+// buffer encapsulates a queueable byte buffer.
+//
+// Note that the total size is slightly less than two pages. This
+// is done intentionally to ensure that the buffer object aligns
+// with runtime internals. We have no hard size or alignment
+// requirements. This two-page size effectively minimizes internal
+// fragmentation while still providing chunks large enough to limit
+// excessive segmentation.
+//
+// +stateify savable
+type buffer struct {
+ data [8144]byte
+ read int
+ write int
+ bufferEntry
+}
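+
+// Assuming 8-byte ints and pointers (64-bit platforms), the struct above is
+// 8144 (data) + 2*8 (read/write) + 2*8 (the bufferEntry pointers) = 8176
+// bytes, just under the 8192 bytes of two 4 KiB pages mentioned above.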
+
+// Reset resets internal data.
+//
+// This must be called before use.
+func (b *buffer) Reset() {
+ b.read = 0
+ b.write = 0
+}
+
+// Empty indicates the buffer is empty.
+//
+// This indicates there is no data left to read.
+func (b *buffer) Empty() bool {
+ return b.read == b.write
+}
+
+// Full indicates the buffer is full.
+//
+// This indicates there is no capacity left to write.
+func (b *buffer) Full() bool {
+ return b.write == len(b.data)
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:]))
+ n, err := safemem.CopySeq(dst, srcs)
+ b.write += int(n)
+ return n, err
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
+ n, err := safemem.CopySeq(dsts, src)
+ b.read += int(n)
+ return n, err
+}
+
+// bufferPool is a pool for buffers.
+var bufferPool = sync.Pool{
+ New: func() interface{} {
+ return new(buffer)
+ },
+}
+
+// newBuffer grabs a new buffer from the pool.
+func newBuffer() *buffer {
+ b := bufferPool.Get().(*buffer)
+ b.Reset()
+ return b
+}
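+
+// A minimal usage sketch of the pool and the safemem-based I/O above,
+// copying a byte slice through a buffer and back out:
+//
+//	b := newBuffer()
+//	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice([]byte("hello")))
+//	n, _ := b.WriteFromBlocks(src) // n == 5; b.write advances by n.
+//	out := make([]byte, n)
+//	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(out))
+//	b.ReadToBlocks(dst)            // out now holds "hello"; b.read advances.
+//	bufferPool.Put(b)              // return the buffer to the pool once drained.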
diff --git a/pkg/sentry/kernel/pipe/buffer_list.go b/pkg/sentry/kernel/pipe/buffer_list.go
new file mode 100755
index 000000000..42ec78788
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/buffer_list.go
@@ -0,0 +1,173 @@
+package pipe
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type bufferElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (bufferElementMapper) linkerFor(elem *buffer) *buffer { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type bufferList struct {
+ head *buffer
+ tail *buffer
+}
+
+// Reset resets list l to the empty state.
+func (l *bufferList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *bufferList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *bufferList) Front() *buffer {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *bufferList) Back() *buffer {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *bufferList) PushFront(e *buffer) {
+ bufferElementMapper{}.linkerFor(e).SetNext(l.head)
+ bufferElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ bufferElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *bufferList) PushBack(e *buffer) {
+ bufferElementMapper{}.linkerFor(e).SetNext(nil)
+ bufferElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ bufferElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *bufferList) PushBackList(m *bufferList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ bufferElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ bufferElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *bufferList) InsertAfter(b, e *buffer) {
+ a := bufferElementMapper{}.linkerFor(b).Next()
+ bufferElementMapper{}.linkerFor(e).SetNext(a)
+ bufferElementMapper{}.linkerFor(e).SetPrev(b)
+ bufferElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ bufferElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *bufferList) InsertBefore(a, e *buffer) {
+ b := bufferElementMapper{}.linkerFor(a).Prev()
+ bufferElementMapper{}.linkerFor(e).SetNext(a)
+ bufferElementMapper{}.linkerFor(e).SetPrev(b)
+ bufferElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ bufferElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *bufferList) Remove(e *buffer) {
+ prev := bufferElementMapper{}.linkerFor(e).Prev()
+ next := bufferElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ bufferElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ bufferElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type bufferEntry struct {
+ next *buffer
+ prev *buffer
+}
+
+// Next returns the entry that follows e in the list.
+func (e *bufferEntry) Next() *buffer {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *bufferEntry) Prev() *buffer {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *bufferEntry) SetNext(elem *buffer) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *bufferEntry) SetPrev(elem *buffer) {
+ e.prev = elem
+}
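
Usage note (not part of the diff): the generated bufferList is an intrusive list, so elements get their Next/Prev links by embedding bufferEntry. A minimal sketch of how the pipe package drives it, assuming it sits inside package pipe where the newBuffer helper from buffer.go is visible; demoBufferList is a hypothetical name used only for illustration:

func demoBufferList() {
	var q bufferList        // the zero value is an empty, ready-to-use list
	q.PushBack(newBuffer()) // append a fresh buffer at the tail
	q.PushBack(newBuffer())
	for b := q.Front(); b != nil; b = b.Next() {
		_ = b // visit each buffer in FIFO order
	}
	q.Remove(q.Front()) // O(1) unlink of the head element
}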
diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go
new file mode 100644
index 000000000..eb59e15a1
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/device.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+
+// pipeDevice is used for all pipe files.
+var pipeDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
new file mode 100644
index 000000000..926c4c623
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -0,0 +1,196 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/amutex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// inodeOperations implements fs.InodeOperations for pipes.
+//
+// +stateify savable
+type inodeOperations struct {
+ fsutil.InodeGenericChecker `state:"nosave"`
+ fsutil.InodeNoExtendedAttributes `state:"nosave"`
+ fsutil.InodeNoopRelease `state:"nosave"`
+ fsutil.InodeNoopTruncate `state:"nosave"`
+ fsutil.InodeNoopWriteOut `state:"nosave"`
+ fsutil.InodeNotDirectory `state:"nosave"`
+ fsutil.InodeNotMappable `state:"nosave"`
+ fsutil.InodeNotSocket `state:"nosave"`
+ fsutil.InodeNotSymlink `state:"nosave"`
+ fsutil.InodeNotVirtual `state:"nosave"`
+
+ fsutil.InodeSimpleAttributes
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // p is the underlying Pipe object representing this fifo.
+ p *Pipe
+
+ // Channels for synchronizing the creation of new readers and writers of
+ // this fifo. See waitFor and newHandleLocked.
+ //
+ // These are not saved/restored because all waiters are unblocked on save,
+ // and either automatically restart (via ERESTARTSYS) or return EINTR on
+ // resume. On restarts via ERESTARTSYS, the appropriate channel will be
+ // recreated.
+ rWakeup chan struct{} `state:"nosave"`
+ wWakeup chan struct{} `state:"nosave"`
+}
+
+var _ fs.InodeOperations = (*inodeOperations)(nil)
+
+// NewInodeOperations returns a new fs.InodeOperations for a given pipe.
+func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) *inodeOperations {
+ return &inodeOperations{
+ InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC),
+ p: p,
+ }
+}
+
+// GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking
+// semantics during open:
+//
+// "Normally, opening the FIFO blocks until the other end is opened also. A
+// process can open a FIFO in nonblocking mode. In this case, opening for
+// read-only will succeed even if no-one has opened on the write side yet,
+// opening for write-only will fail with ENXIO (no such device or address)
+// unless the other end has already been opened. Under Linux, opening a FIFO
+// for read and write will succeed both in blocking and nonblocking mode. POSIX
+// leaves this behavior undefined. This can be used to open a FIFO for writing
+// while there are no readers available." - fifo(7)
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ i.mu.Lock()
+ defer i.mu.Unlock()
+
+ switch {
+ case flags.Read && !flags.Write: // O_RDONLY.
+ r := i.p.Open(ctx, flags)
+ i.newHandleLocked(&i.rWakeup)
+
+ if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
+ if !i.waitFor(&i.wWakeup, ctx) {
+ r.DecRef()
+ return nil, syserror.ErrInterrupted
+ }
+ }
+
+ // By now, either we're doing a nonblocking open or we have a writer. On
+ // a nonblocking read-only open, the open succeeds even if no-one has
+ // opened the write side yet.
+ return r, nil
+
+ case flags.Write && !flags.Read: // O_WRONLY.
+ w := i.p.Open(ctx, flags)
+ i.newHandleLocked(&i.wWakeup)
+
+ if i.p.isNamed && !i.p.HasReaders() {
+ // On a nonblocking, write-only open, the open fails with ENXIO if the
+ // read side isn't open yet.
+ if flags.NonBlocking {
+ w.DecRef()
+ return nil, syserror.ENXIO
+ }
+
+ if !i.waitFor(&i.rWakeup, ctx) {
+ w.DecRef()
+ return nil, syserror.ErrInterrupted
+ }
+ }
+ return w, nil
+
+ case flags.Read && flags.Write: // O_RDWR.
+		// Pipes opened for read-write always succeed without blocking.
+ rw := i.p.Open(ctx, flags)
+ i.newHandleLocked(&i.rWakeup)
+ i.newHandleLocked(&i.wWakeup)
+ return rw, nil
+
+ default:
+ return nil, syserror.EINVAL
+ }
+}
+
+// waitFor blocks until at least one reader/writer of the underlying pipe is
+// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this
+// function will block for either readers or writers, depending on where
+// 'wakeupChan' points.
+//
+// i.mu must be held by the caller. waitFor returns with i.mu held, but it will
+// drop i.mu before blocking for any readers/writers.
+func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
+ // Ideally this function would simply use a condition variable. However, the
+	// wait needs to be interruptible via 'sleeper', so we must synchronize via a
+ // channel. The synchronization below relies on the fact that closing a
+ // channel unblocks all receives on the channel.
+
+ // Does an appropriate wakeup channel already exist? If not, create a new
+	// one. This is all done under i.mu to avoid races.
+ if *wakeupChan == nil {
+ *wakeupChan = make(chan struct{})
+ }
+
+ // Grab a local reference to the wakeup channel since it may disappear as
+	// soon as we drop i.mu.
+ wakeup := *wakeupChan
+
+ // Drop the lock and prepare to sleep.
+ i.mu.Unlock()
+ cancel := sleeper.SleepStart()
+
+ // Wait for either a new reader/write to be signalled via 'wakeup', or
+ // for the sleep to be cancelled.
+ select {
+ case <-wakeup:
+ sleeper.SleepFinish(true)
+ case <-cancel:
+ sleeper.SleepFinish(false)
+ }
+
+ // Take the lock and check if we were woken. If we were woken and
+ // interrupted, the former takes priority.
+ i.mu.Lock()
+ select {
+ case <-wakeup:
+ return true
+ default:
+ return false
+ }
+}
+
+// newHandleLocked signals a new pipe reader or writer depending on where
+// 'wakeupChan' points. This unblocks any corresponding reader or writer
+// waiting for the other end of the fifo to be opened; see waitFor above.
+//
+// i.mu must be held.
+func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) {
+ if *wakeupChan != nil {
+ close(*wakeupChan)
+ *wakeupChan = nil
+ }
+}
+
+func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error {
+ return syserror.EPIPE
+}
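
The waitFor/newHandleLocked pair above implements a broadcast wakeup built from a lazily created channel that is closed to wake every waiter at once. A standalone sketch of just that pattern, with illustrative names (package wakeup, gate, wait, signal are not from the diff) and without the sleeper-cancellation half:

package wakeup

import "sync"

type gate struct {
	mu     sync.Mutex
	wakeup chan struct{} // lazily created; closed to broadcast
}

func (g *gate) wait() {
	g.mu.Lock()
	if g.wakeup == nil {
		g.wakeup = make(chan struct{})
	}
	ch := g.wakeup // local copy: the field may be reset once the lock is dropped
	g.mu.Unlock()
	<-ch // closing the channel unblocks every waiter simultaneously
}

func (g *gate) signal() {
	g.mu.Lock()
	if g.wakeup != nil {
		close(g.wakeup) // wake all current waiters...
		g.wakeup = nil  // ...and make future waiters create a fresh channel
	}
	g.mu.Unlock()
}

Note that a signal arriving before any waiter exists is not remembered, which is why GetFile re-checks HasReaders/HasWriters under i.mu before deciding to wait at all.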
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
new file mode 100644
index 000000000..b65204492
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -0,0 +1,429 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipe provides a pipe implementation.
+package pipe
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+const (
+	// MinimumPipeSize is a hard limit on the minimum size of a pipe.
+ MinimumPipeSize = 64 << 10
+
+ // DefaultPipeSize is the system-wide default size of a pipe in bytes.
+ DefaultPipeSize = MinimumPipeSize
+
+ // MaximumPipeSize is a hard limit on the maximum size of a pipe.
+ MaximumPipeSize = 8 << 20
+)
+
+// Sizer is an interface for setting and getting the size of a pipe.
+//
+// It is implemented by Pipe and, through embedding of *Pipe, by the pipe
+// file types (ReaderWriter, Reader, and Writer).
+type Sizer interface {
+ // PipeSize returns the pipe capacity in bytes.
+ PipeSize() int64
+
+ // SetPipeSize sets the new pipe capacity in bytes.
+ //
+ // The new size is returned (which may be capped).
+ SetPipeSize(int64) (int64, error)
+}
+
+// Pipe is an encapsulation of a platform-independent pipe.
+// It manages a buffered byte queue shared between a reader/writer
+// pair.
+//
+// +stateify savable
+type Pipe struct {
+ waiter.Queue `state:"nosave"`
+
+ // isNamed indicates whether this is a named pipe.
+ //
+ // This value is immutable.
+ isNamed bool
+
+	// atomicIOBytes is the maximum number of bytes for which the pipe
+	// guarantees that reads and writes are performed atomically.
+ //
+ // This value is immutable.
+ atomicIOBytes int64
+
+ // The dirent backing this pipe. Shared by all readers and writers.
+ //
+ // This value is immutable.
+ Dirent *fs.Dirent
+
+ // The number of active readers for this pipe.
+ //
+ // Access atomically.
+ readers int32
+
+	// The number of active writers for this pipe.
+ //
+ // Access atomically.
+ writers int32
+
+ // mu protects all pipe internal state below.
+ mu sync.Mutex `state:"nosave"`
+
+ // data is the buffer queue of pipe contents.
+ //
+ // This is protected by mu.
+ data bufferList
+
+ // max is the maximum size of the pipe in bytes. When this max has been
+ // reached, writers will get EWOULDBLOCK.
+ //
+ // This is protected by mu.
+ max int64
+
+ // size is the current size of the pipe in bytes.
+ //
+ // This is protected by mu.
+ size int64
+
+ // hadWriter indicates if this pipe ever had a writer. Note that this
+ // does not necessarily indicate there is *currently* a writer, just
+ // that there has been a writer at some point since the pipe was
+ // created.
+ //
+ // This is protected by mu.
+ hadWriter bool
+}
+
+// NewPipe initializes and returns a pipe.
+//
+// N.B. The size and atomicIOBytes will be bounded.
+func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe {
+ if sizeBytes < MinimumPipeSize {
+ sizeBytes = MinimumPipeSize
+ }
+ if sizeBytes > MaximumPipeSize {
+ sizeBytes = MaximumPipeSize
+ }
+ if atomicIOBytes <= 0 {
+ atomicIOBytes = 1
+ }
+ if atomicIOBytes > sizeBytes {
+ atomicIOBytes = sizeBytes
+ }
+ p := &Pipe{
+ isNamed: isNamed,
+ max: sizeBytes,
+ atomicIOBytes: atomicIOBytes,
+ }
+
+ // Build the fs.Dirent of this pipe, shared by all fs.Files associated
+ // with this pipe.
+ perms := fs.FilePermissions{
+ User: fs.PermMask{Read: true, Write: true},
+ }
+ iops := NewInodeOperations(ctx, perms, p)
+ ino := pipeDevice.NextIno()
+ sattr := fs.StableAttr{
+ Type: fs.Pipe,
+ DeviceID: pipeDevice.DeviceID(),
+ InodeID: ino,
+ BlockSize: int64(atomicIOBytes),
+ }
+ ms := fs.NewPseudoMountSource()
+ p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
+ return p
+}
+
+// NewConnectedPipe initializes a pipe and returns a pair of objects
+// representing the read and write ends of the pipe.
+func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
+ p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
+ return p.Open(ctx, fs.FileFlags{Read: true}), p.Open(ctx, fs.FileFlags{Write: true})
+}
+
+// Open opens the pipe and returns a new file.
+//
+// Precondition: at least one of flags.Read or flags.Write must be set.
+func (p *Pipe) Open(ctx context.Context, flags fs.FileFlags) *fs.File {
+ switch {
+ case flags.Read && flags.Write:
+ p.rOpen()
+ p.wOpen()
+ return fs.NewFile(ctx, p.Dirent, flags, &ReaderWriter{
+ Pipe: p,
+ })
+ case flags.Read:
+ p.rOpen()
+ return fs.NewFile(ctx, p.Dirent, flags, &Reader{
+ ReaderWriter: ReaderWriter{Pipe: p},
+ })
+ case flags.Write:
+ p.wOpen()
+ return fs.NewFile(ctx, p.Dirent, flags, &Writer{
+ ReaderWriter: ReaderWriter{Pipe: p},
+ })
+ default:
+ // Precondition violated.
+ panic("invalid pipe flags")
+ }
+}
+
+// read reads data from the pipe into dst and returns the number of bytes
+// read, or returns ErrWouldBlock if the pipe is empty.
+//
+// Precondition: this pipe must have readers.
+func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+ // Don't block for a zero-length read even if the pipe is empty.
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ // Is the pipe empty?
+ if p.size == 0 {
+ if !p.HasWriters() {
+ // There are no writers, return EOF.
+ return 0, nil
+ }
+ return 0, syserror.ErrWouldBlock
+ }
+
+ // Limit how much we consume.
+ if dst.NumBytes() > p.size {
+ dst = dst.TakeFirst64(p.size)
+ }
+
+ done := int64(0)
+ for dst.NumBytes() > 0 {
+ // Pop the first buffer.
+ first := p.data.Front()
+ if first == nil {
+ break
+ }
+
+ // Copy user data.
+ n, err := dst.CopyOutFrom(ctx, first)
+ done += int64(n)
+ p.size -= n
+ dst = dst.DropFirst64(n)
+
+ // Empty buffer?
+ if first.Empty() {
+ // Push to the free list.
+ p.data.Remove(first)
+ bufferPool.Put(first)
+ }
+
+ // Handle errors.
+ if err != nil {
+ return done, err
+ }
+ }
+
+ return done, nil
+}
+
+// write writes data from src into the pipe and returns the number of bytes
+// written. If no bytes are written because the pipe is full (or has less than
+// atomicIOBytes free capacity), write returns ErrWouldBlock.
+//
+// Precondition: this pipe must have writers.
+func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ // Can't write to a pipe with no readers.
+ if !p.HasReaders() {
+ return 0, syscall.EPIPE
+ }
+
+ // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
+ // atomic, but requires no atomicity for writes larger than this.
+ wanted := src.NumBytes()
+ if avail := p.max - p.size; wanted > avail {
+ if wanted <= p.atomicIOBytes {
+ return 0, syserror.ErrWouldBlock
+ }
+ // Limit to the available capacity.
+ src = src.TakeFirst64(avail)
+ }
+
+ done := int64(0)
+ for src.NumBytes() > 0 {
+ // Need a new buffer?
+ last := p.data.Back()
+ if last == nil || last.Full() {
+ // Add a new buffer to the data list.
+ last = newBuffer()
+ p.data.PushBack(last)
+ }
+
+ // Copy user data.
+ n, err := src.CopyInTo(ctx, last)
+ done += int64(n)
+ p.size += n
+ src = src.DropFirst64(n)
+
+ // Handle errors.
+ if err != nil {
+ return done, err
+ }
+ }
+ if wanted > done {
+ // Partial write due to full pipe.
+ return done, syserror.ErrWouldBlock
+ }
+
+ return done, nil
+}
+
+// rOpen signals a new reader of the pipe.
+func (p *Pipe) rOpen() {
+ atomic.AddInt32(&p.readers, 1)
+}
+
+// wOpen signals a new writer of the pipe.
+func (p *Pipe) wOpen() {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ p.hadWriter = true
+ atomic.AddInt32(&p.writers, 1)
+}
+
+// rClose signals that a reader has closed their end of the pipe.
+func (p *Pipe) rClose() {
+ newReaders := atomic.AddInt32(&p.readers, -1)
+ if newReaders < 0 {
+ panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders))
+ }
+}
+
+// wClose signals that a writer has closed their end of the pipe.
+func (p *Pipe) wClose() {
+ newWriters := atomic.AddInt32(&p.writers, -1)
+ if newWriters < 0 {
+ panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters))
+ }
+}
+
+// HasReaders returns whether the pipe has any active readers.
+func (p *Pipe) HasReaders() bool {
+ return atomic.LoadInt32(&p.readers) > 0
+}
+
+// HasWriters returns whether the pipe has any active writers.
+func (p *Pipe) HasWriters() bool {
+ return atomic.LoadInt32(&p.writers) > 0
+}
+
+// rReadinessLocked calculates the read readiness.
+//
+// Precondition: mu must be held.
+func (p *Pipe) rReadinessLocked() waiter.EventMask {
+ ready := waiter.EventMask(0)
+ if p.HasReaders() && p.data.Front() != nil {
+ ready |= waiter.EventIn
+ }
+ if !p.HasWriters() && p.hadWriter {
+ // POLLHUP must be suppressed until the pipe has had at least one writer
+ // at some point. Otherwise a reader thread may poll and immediately get
+ // a POLLHUP before the writer ever opens the pipe, which the reader may
+ // interpret as the writer opening then closing the pipe.
+ ready |= waiter.EventHUp
+ }
+ return ready
+}
+
+// rReadiness returns a mask that states whether the read end of the pipe is
+// ready for reading.
+func (p *Pipe) rReadiness() waiter.EventMask {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.rReadinessLocked()
+}
+
+// wReadinessLocked calculates the write readiness.
+//
+// Precondition: mu must be held.
+func (p *Pipe) wReadinessLocked() waiter.EventMask {
+ ready := waiter.EventMask(0)
+ if p.HasWriters() && p.size < p.max {
+ ready |= waiter.EventOut
+ }
+ if !p.HasReaders() {
+ ready |= waiter.EventErr
+ }
+ return ready
+}
+
+// wReadiness returns a mask that states whether the write end of the pipe
+// is ready for writing.
+func (p *Pipe) wReadiness() waiter.EventMask {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.wReadinessLocked()
+}
+
+// rwReadiness returns a mask that states whether a read-write handle to the
+// pipe is ready for IO.
+func (p *Pipe) rwReadiness() waiter.EventMask {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.rReadinessLocked() | p.wReadinessLocked()
+}
+
+// queued returns the amount of queued data.
+func (p *Pipe) queued() int64 {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.size
+}
+
+// PipeSize implements Sizer.PipeSize.
+func (p *Pipe) PipeSize() int64 {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.max
+}
+
+// SetPipeSize implements Sizer.SetPipeSize.
+func (p *Pipe) SetPipeSize(size int64) (int64, error) {
+ if size < 0 {
+ return 0, syserror.EINVAL
+ }
+ if size < MinimumPipeSize {
+ size = MinimumPipeSize // Per spec.
+ }
+ if size > MaximumPipeSize {
+ return 0, syserror.EPERM
+ }
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ if size < p.size {
+ return 0, syserror.EBUSY
+ }
+ p.max = size
+ return size, nil
+}
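
To make the PIPE_BUF rule at the top of Pipe.write concrete, here is an illustrative-only helper (planWrite is a hypothetical name, not in the diff) mirroring that decision: a write that does not fit entirely and is no larger than atomicIOBytes must block rather than be split.

// planWrite reports how many bytes a write of 'wanted' bytes may copy into a
// pipe currently holding 'size' of 'max' bytes, and whether the caller still
// sees ErrWouldBlock afterwards.
func planWrite(wanted, size, max, atomicIOBytes int64) (toCopy int64, wouldBlock bool) {
	avail := max - size
	if wanted <= avail {
		return wanted, false // fits entirely; no blocking
	}
	if wanted <= atomicIOBytes {
		return 0, true // must be atomic (PIPE_BUF); block rather than split
	}
	return avail, true // partial write, then ErrWouldBlock for the remainder
}

For example, with max=65536, size=65000 and atomicIOBytes=4096, a 1 KiB write blocks outright (it must be atomic but only 536 bytes are free), while an 8 KiB write copies 536 bytes and then reports ErrWouldBlock.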
diff --git a/pkg/sentry/kernel/pipe/pipe_state_autogen.go b/pkg/sentry/kernel/pipe/pipe_state_autogen.go
new file mode 100755
index 000000000..5d3686109
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe_state_autogen.go
@@ -0,0 +1,134 @@
+// automatically generated by stateify.
+
+package pipe
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *buffer) beforeSave() {}
+func (x *buffer) save(m state.Map) {
+ x.beforeSave()
+ m.Save("data", &x.data)
+ m.Save("read", &x.read)
+ m.Save("write", &x.write)
+ m.Save("bufferEntry", &x.bufferEntry)
+}
+
+func (x *buffer) afterLoad() {}
+func (x *buffer) load(m state.Map) {
+ m.Load("data", &x.data)
+ m.Load("read", &x.read)
+ m.Load("write", &x.write)
+ m.Load("bufferEntry", &x.bufferEntry)
+}
+
+func (x *bufferList) beforeSave() {}
+func (x *bufferList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *bufferList) afterLoad() {}
+func (x *bufferList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *bufferEntry) beforeSave() {}
+func (x *bufferEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *bufferEntry) afterLoad() {}
+func (x *bufferEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func (x *inodeOperations) beforeSave() {}
+func (x *inodeOperations) save(m state.Map) {
+ x.beforeSave()
+ m.Save("InodeSimpleAttributes", &x.InodeSimpleAttributes)
+ m.Save("p", &x.p)
+}
+
+func (x *inodeOperations) afterLoad() {}
+func (x *inodeOperations) load(m state.Map) {
+ m.Load("InodeSimpleAttributes", &x.InodeSimpleAttributes)
+ m.Load("p", &x.p)
+}
+
+func (x *Pipe) beforeSave() {}
+func (x *Pipe) save(m state.Map) {
+ x.beforeSave()
+ m.Save("isNamed", &x.isNamed)
+ m.Save("atomicIOBytes", &x.atomicIOBytes)
+ m.Save("Dirent", &x.Dirent)
+ m.Save("readers", &x.readers)
+ m.Save("writers", &x.writers)
+ m.Save("data", &x.data)
+ m.Save("max", &x.max)
+ m.Save("size", &x.size)
+ m.Save("hadWriter", &x.hadWriter)
+}
+
+func (x *Pipe) afterLoad() {}
+func (x *Pipe) load(m state.Map) {
+ m.Load("isNamed", &x.isNamed)
+ m.Load("atomicIOBytes", &x.atomicIOBytes)
+ m.Load("Dirent", &x.Dirent)
+ m.Load("readers", &x.readers)
+ m.Load("writers", &x.writers)
+ m.Load("data", &x.data)
+ m.Load("max", &x.max)
+ m.Load("size", &x.size)
+ m.Load("hadWriter", &x.hadWriter)
+}
+
+func (x *Reader) beforeSave() {}
+func (x *Reader) save(m state.Map) {
+ x.beforeSave()
+ m.Save("ReaderWriter", &x.ReaderWriter)
+}
+
+func (x *Reader) afterLoad() {}
+func (x *Reader) load(m state.Map) {
+ m.Load("ReaderWriter", &x.ReaderWriter)
+}
+
+func (x *ReaderWriter) beforeSave() {}
+func (x *ReaderWriter) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Pipe", &x.Pipe)
+}
+
+func (x *ReaderWriter) afterLoad() {}
+func (x *ReaderWriter) load(m state.Map) {
+ m.Load("Pipe", &x.Pipe)
+}
+
+func (x *Writer) beforeSave() {}
+func (x *Writer) save(m state.Map) {
+ x.beforeSave()
+ m.Save("ReaderWriter", &x.ReaderWriter)
+}
+
+func (x *Writer) afterLoad() {}
+func (x *Writer) load(m state.Map) {
+ m.Load("ReaderWriter", &x.ReaderWriter)
+}
+
+func init() {
+ state.Register("pipe.buffer", (*buffer)(nil), state.Fns{Save: (*buffer).save, Load: (*buffer).load})
+ state.Register("pipe.bufferList", (*bufferList)(nil), state.Fns{Save: (*bufferList).save, Load: (*bufferList).load})
+ state.Register("pipe.bufferEntry", (*bufferEntry)(nil), state.Fns{Save: (*bufferEntry).save, Load: (*bufferEntry).load})
+ state.Register("pipe.inodeOperations", (*inodeOperations)(nil), state.Fns{Save: (*inodeOperations).save, Load: (*inodeOperations).load})
+ state.Register("pipe.Pipe", (*Pipe)(nil), state.Fns{Save: (*Pipe).save, Load: (*Pipe).load})
+ state.Register("pipe.Reader", (*Reader)(nil), state.Fns{Save: (*Reader).save, Load: (*Reader).load})
+ state.Register("pipe.ReaderWriter", (*ReaderWriter)(nil), state.Fns{Save: (*ReaderWriter).save, Load: (*ReaderWriter).load})
+ state.Register("pipe.Writer", (*Writer)(nil), state.Fns{Save: (*Writer).save, Load: (*Writer).load})
+}
diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go
new file mode 100644
index 000000000..656be824d
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/reader.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Reader satisfies the fs.FileOperations interface for read-only pipes.
+// Reader should be used with !fs.FileFlags.Write to reject writes.
+//
+// +stateify savable
+type Reader struct {
+ ReaderWriter
+}
+
+// Release implements fs.FileOperations.Release.
+//
+// This overrides ReaderWriter.Release.
+func (r *Reader) Release() {
+ r.Pipe.rClose()
+
+ // Wake up writers.
+ r.Pipe.Notify(waiter.EventOut)
+}
+
+// Readiness returns the ready events in the underlying pipe.
+func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return r.Pipe.rReadiness() & mask
+}
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go
new file mode 100644
index 000000000..e560b9be9
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/reader_writer.go
@@ -0,0 +1,96 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "math"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// ReaderWriter satisfies the FileOperations interface and services both
+// read and write requests. This should only be used directly for named pipes.
+// pipe(2) and pipe2(2) only support unidirectional pipes and should use
+// either pipe.Reader or pipe.Writer.
+//
+// +stateify savable
+type ReaderWriter struct {
+ fsutil.FilePipeSeek `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileNoFsync `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ *Pipe
+}
+
+// Release implements fs.FileOperations.Release.
+func (rw *ReaderWriter) Release() {
+ rw.Pipe.rClose()
+ rw.Pipe.wClose()
+
+ // Wake up readers and writers.
+ rw.Pipe.Notify(waiter.EventIn | waiter.EventOut)
+}
+
+// Read implements fs.FileOperations.Read.
+func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+ n, err := rw.Pipe.read(ctx, dst)
+ if n > 0 {
+ rw.Pipe.Notify(waiter.EventOut)
+ }
+ return n, err
+}
+
+// Write implements fs.FileOperations.Write.
+func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+ n, err := rw.Pipe.write(ctx, src)
+ if n > 0 {
+ rw.Pipe.Notify(waiter.EventIn)
+ }
+ return n, err
+}
+
+// Readiness returns the ready events in the underlying pipe.
+func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return rw.Pipe.rwReadiness() & mask
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ // Switch on ioctl request.
+ switch int(args[1].Int()) {
+ case linux.FIONREAD:
+ v := rw.queued()
+ if v > math.MaxInt32 {
+ v = math.MaxInt32 // Silently truncate.
+ }
+ // Copy result to user-space.
+ _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+ default:
+ return 0, syscall.ENOTTY
+ }
+}
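
For context, a hedged guest-side sketch (not part of the diff) of the request this Ioctl method services: FIONREAD reports how many bytes are currently queued in the pipe. It assumes the application can import golang.org/x/sys/unix.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	var p [2]int
	if err := unix.Pipe(p[:]); err != nil {
		panic(err)
	}
	if _, err := unix.Write(p[1], []byte("hello")); err != nil {
		panic(err)
	}
	// Ask the kernel (or, inside gVisor, the sentry) how much is readable.
	n, err := unix.IoctlGetInt(p[0], unix.FIONREAD)
	if err != nil {
		panic(err)
	}
	fmt.Println("queued bytes:", n) // prints 5
}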
diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go
new file mode 100644
index 000000000..8d5b68541
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/writer.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Writer satisfies the fs.FileOperations interface for write-only pipes.
+// Writer should be used with !fs.FileFlags.Read to reject reads.
+//
+// +stateify savable
+type Writer struct {
+ ReaderWriter
+}
+
+// Release implements fs.FileOperations.Release.
+//
+// This overrides ReaderWriter.Release.
+func (w *Writer) Release() {
+ w.Pipe.wClose()
+
+ // Wake up readers.
+ w.Pipe.Notify(waiter.EventHUp)
+}
+
+// Readiness returns the ready events in the underlying pipe.
+func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return w.Pipe.wReadiness() & mask
+}
diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go
new file mode 100644
index 000000000..a016b4087
--- /dev/null
+++ b/pkg/sentry/kernel/posixtimer.go
@@ -0,0 +1,306 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "math"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// IntervalTimer represents a POSIX interval timer as described by
+// timer_create(2).
+//
+// +stateify savable
+type IntervalTimer struct {
+ timer *ktime.Timer
+
+ // If target is not nil, it receives signo from timer expirations. If group
+ // is true, these signals are thread-group-directed. These fields are
+ // immutable.
+ target *Task
+ signo linux.Signal
+ id linux.TimerID
+ sigval uint64
+ group bool
+
+ // If sigpending is true, a signal to target is already queued, and timer
+ // expirations should increment overrunCur instead of sending another
+ // signal. sigpending is protected by target's signal mutex. (If target is
+ // nil, the timer will never send signals, so sigpending will be unused.)
+ sigpending bool
+
+	// If sigorphan is true, the timer's setting has been changed since sigpending
+ // last became true, such that overruns should no longer be counted in the
+ // pending signals si_overrun. sigorphan is protected by target's signal
+ // mutex.
+ sigorphan bool
+
+ // overrunCur is the number of overruns that have occurred since the last
+ // time a signal was sent. overrunCur is protected by target's signal
+ // mutex.
+ overrunCur uint64
+
+ // Consider the last signal sent by this timer that has been dequeued.
+ // overrunLast is the number of overruns that occurred between when this
+ // signal was sent and when it was dequeued. Equivalently, overrunLast was
+ // the value of overrunCur when this signal was dequeued. overrunLast is
+ // protected by target's signal mutex.
+ overrunLast uint64
+}
+
+// DestroyTimer releases the timer's resources.
+func (it *IntervalTimer) DestroyTimer() {
+ it.timer.Destroy()
+ it.timerSettingChanged()
+ // A destroyed IntervalTimer is still potentially reachable via a
+ // pendingSignal; nil out timer so that it won't be saved.
+ it.timer = nil
+}
+
+func (it *IntervalTimer) timerSettingChanged() {
+ if it.target == nil {
+ return
+ }
+ it.target.tg.pidns.owner.mu.RLock()
+ defer it.target.tg.pidns.owner.mu.RUnlock()
+ it.target.tg.signalHandlers.mu.Lock()
+ defer it.target.tg.signalHandlers.mu.Unlock()
+ it.sigorphan = true
+ it.overrunCur = 0
+ it.overrunLast = 0
+}
+
+// PauseTimer pauses the associated Timer.
+func (it *IntervalTimer) PauseTimer() {
+ it.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (it *IntervalTimer) ResumeTimer() {
+ it.timer.Resume()
+}
+
+// Preconditions: it.target's signal mutex must be locked.
+func (it *IntervalTimer) updateDequeuedSignalLocked(si *arch.SignalInfo) {
+ it.sigpending = false
+ if it.sigorphan {
+ return
+ }
+ it.overrunLast = it.overrunCur
+ it.overrunCur = 0
+ si.SetOverrun(saturateI32FromU64(it.overrunLast))
+}
+
+// Preconditions: it.target's signal mutex must be locked.
+func (it *IntervalTimer) signalRejectedLocked() {
+ it.sigpending = false
+ if it.sigorphan {
+ return
+ }
+ it.overrunCur++
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (it *IntervalTimer) Notify(exp uint64) {
+ if it.target == nil {
+ return
+ }
+
+ it.target.tg.pidns.owner.mu.RLock()
+ defer it.target.tg.pidns.owner.mu.RUnlock()
+ it.target.tg.signalHandlers.mu.Lock()
+ defer it.target.tg.signalHandlers.mu.Unlock()
+
+ if it.sigpending {
+ it.overrunCur += exp
+ return
+ }
+
+ // sigpending must be set before sendSignalTimerLocked() so that it can be
+ // unset if the signal is discarded (in which case sendSignalTimerLocked()
+ // will return nil).
+ it.sigpending = true
+ it.sigorphan = false
+ it.overrunCur += exp - 1
+ si := &arch.SignalInfo{
+ Signo: int32(it.signo),
+ Code: arch.SignalInfoTimer,
+ }
+ si.SetTimerID(it.id)
+ si.SetSigval(it.sigval)
+ // si_overrun is set when the signal is dequeued.
+ if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil {
+ it.signalRejectedLocked()
+ }
+}
+
+// Destroy implements ktime.TimerListener.Destroy. Users of Timer should call
+// DestroyTimer instead.
+func (it *IntervalTimer) Destroy() {
+}
+
+// IntervalTimerCreate implements timer_create(2).
+func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) {
+ t.tg.timerMu.Lock()
+ defer t.tg.timerMu.Unlock()
+
+ // Allocate a timer ID.
+ var id linux.TimerID
+ end := t.tg.nextTimerID
+ for {
+ id = t.tg.nextTimerID
+ _, ok := t.tg.timers[id]
+ t.tg.nextTimerID++
+ if t.tg.nextTimerID < 0 {
+ t.tg.nextTimerID = 0
+ }
+ if !ok {
+ break
+ }
+ if t.tg.nextTimerID == end {
+ return 0, syserror.EAGAIN
+ }
+ }
+
+ // "The implementation of the default case where evp [sic] is NULL is
+ // handled inside glibc, which invokes the underlying system call with a
+ // suitably populated sigevent structure." - timer_create(2). This is
+ // misleading; the timer_create syscall also handles a NULL sevp as
+ // described by the man page
+ // (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). This
+ // must be handled here instead of the syscall wrapper since sigval is the
+ // timer ID, which isn't available until we allocate it in this function.
+ if sigev == nil {
+ sigev = &linux.Sigevent{
+ Signo: int32(linux.SIGALRM),
+ Notify: linux.SIGEV_SIGNAL,
+ Value: uint64(id),
+ }
+ }
+
+ // Construct the timer.
+ it := &IntervalTimer{
+ id: id,
+ sigval: sigev.Value,
+ }
+ switch sigev.Notify {
+ case linux.SIGEV_NONE:
+ // leave it.target = nil
+ case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD:
+ // POSIX SIGEV_THREAD semantics are implemented in userspace by libc;
+ // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See
+ // Linux's kernel/time/posix-timers.c:good_sigevent().)
+ it.target = t.tg.leader
+ it.group = true
+ case linux.SIGEV_THREAD_ID:
+ t.tg.pidns.owner.mu.RLock()
+ target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)]
+ t.tg.pidns.owner.mu.RUnlock()
+ if !ok || target.tg != t.tg {
+ return 0, syserror.EINVAL
+ }
+ it.target = target
+ default:
+ return 0, syserror.EINVAL
+ }
+ if sigev.Notify != linux.SIGEV_NONE {
+ it.signo = linux.Signal(sigev.Signo)
+ if !it.signo.IsValid() {
+ return 0, syserror.EINVAL
+ }
+ }
+ it.timer = ktime.NewTimer(c, it)
+
+ t.tg.timers[id] = it
+ return id, nil
+}
+
+// IntervalTimerDelete implements timer_delete(2).
+func (t *Task) IntervalTimerDelete(id linux.TimerID) error {
+ t.tg.timerMu.Lock()
+ defer t.tg.timerMu.Unlock()
+ it := t.tg.timers[id]
+ if it == nil {
+ return syserror.EINVAL
+ }
+ delete(t.tg.timers, id)
+ it.DestroyTimer()
+ return nil
+}
+
+// IntervalTimerSettime implements timer_settime(2).
+func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) {
+ t.tg.timerMu.Lock()
+ defer t.tg.timerMu.Unlock()
+ it := t.tg.timers[id]
+ if it == nil {
+ return linux.Itimerspec{}, syserror.EINVAL
+ }
+
+ newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock())
+ if err != nil {
+ return linux.Itimerspec{}, err
+ }
+ tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged)
+ its = ktime.ItimerspecFromSetting(tm, oldS)
+ return its, nil
+}
+
+// IntervalTimerGettime implements timer_gettime(2).
+func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) {
+ t.tg.timerMu.Lock()
+ defer t.tg.timerMu.Unlock()
+ it := t.tg.timers[id]
+ if it == nil {
+ return linux.Itimerspec{}, syserror.EINVAL
+ }
+
+ tm, s := it.timer.Get()
+ its := ktime.ItimerspecFromSetting(tm, s)
+ return its, nil
+}
+
+// IntervalTimerGetoverrun implements timer_getoverrun(2).
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) {
+ t.tg.timerMu.Lock()
+ defer t.tg.timerMu.Unlock()
+ it := t.tg.timers[id]
+ if it == nil {
+ return 0, syserror.EINVAL
+ }
+ // By timer_create(2) invariant, either it.target == nil (in which case
+ // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact
+ // that t is executing timer_getoverrun(2) means that t.tg can't be
+ // completing execve, so t.tg.signalHandlers can't be changing, allowing us
+ // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex.
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers:
+ // Sanitize overrun handling").
+ return saturateI32FromU64(it.overrunLast), nil
+}
+
+func saturateI32FromU64(x uint64) int32 {
+ if x > math.MaxInt32 {
+ return math.MaxInt32
+ }
+ return int32(x)
+}
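
The ID allocation loop in IntervalTimerCreate scans forward from a per-thread-group cursor, wraps to zero on signed overflow, and fails with EAGAIN only after a full cycle finds no free slot. A standalone sketch of that strategy, with illustrative names (timerIDAllocator is not in the diff):

// timerIDAllocator mimics the nextTimerID/timers bookkeeping.
type timerIDAllocator struct {
	next int32
	used map[int32]bool
}

// allocate returns a free ID, or false if every ID is taken (EAGAIN).
func (a *timerIDAllocator) allocate() (int32, bool) {
	if a.used == nil {
		a.used = make(map[int32]bool)
	}
	end := a.next
	for {
		id := a.next
		_, taken := a.used[id]
		a.next++
		if a.next < 0 { // wrapped past the largest positive ID
			a.next = 0
		}
		if !taken {
			a.used[id] = true
			return id, true
		}
		if a.next == end {
			return 0, false // scanned every ID without finding a free one
		}
	}
}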
diff --git a/pkg/sentry/kernel/process_group_list.go b/pkg/sentry/kernel/process_group_list.go
new file mode 100755
index 000000000..853145237
--- /dev/null
+++ b/pkg/sentry/kernel/process_group_list.go
@@ -0,0 +1,173 @@
+package kernel
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type processGroupElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (processGroupElementMapper) linkerFor(elem *ProcessGroup) *ProcessGroup { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type processGroupList struct {
+ head *ProcessGroup
+ tail *ProcessGroup
+}
+
+// Reset resets list l to the empty state.
+func (l *processGroupList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *processGroupList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *processGroupList) Front() *ProcessGroup {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *processGroupList) Back() *ProcessGroup {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *processGroupList) PushFront(e *ProcessGroup) {
+ processGroupElementMapper{}.linkerFor(e).SetNext(l.head)
+ processGroupElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ processGroupElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *processGroupList) PushBack(e *ProcessGroup) {
+ processGroupElementMapper{}.linkerFor(e).SetNext(nil)
+ processGroupElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ processGroupElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *processGroupList) PushBackList(m *processGroupList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ processGroupElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ processGroupElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *processGroupList) InsertAfter(b, e *ProcessGroup) {
+ a := processGroupElementMapper{}.linkerFor(b).Next()
+ processGroupElementMapper{}.linkerFor(e).SetNext(a)
+ processGroupElementMapper{}.linkerFor(e).SetPrev(b)
+ processGroupElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ processGroupElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *processGroupList) InsertBefore(a, e *ProcessGroup) {
+ b := processGroupElementMapper{}.linkerFor(a).Prev()
+ processGroupElementMapper{}.linkerFor(e).SetNext(a)
+ processGroupElementMapper{}.linkerFor(e).SetPrev(b)
+ processGroupElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ processGroupElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *processGroupList) Remove(e *ProcessGroup) {
+ prev := processGroupElementMapper{}.linkerFor(e).Prev()
+ next := processGroupElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ processGroupElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ processGroupElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type processGroupEntry struct {
+ next *ProcessGroup
+ prev *ProcessGroup
+}
+
+// Next returns the entry that follows e in the list.
+func (e *processGroupEntry) Next() *ProcessGroup {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *processGroupEntry) Prev() *ProcessGroup {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *processGroupEntry) SetNext(elem *ProcessGroup) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *processGroupEntry) SetPrev(elem *ProcessGroup) {
+ e.prev = elem
+}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
new file mode 100644
index 000000000..4423e7efd
--- /dev/null
+++ b/pkg/sentry/kernel/ptrace.go
@@ -0,0 +1,1105 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// ptraceOptions are the subset of options controlling a task's ptrace behavior
+// that are set by ptrace(PTRACE_SETOPTIONS).
+//
+// +stateify savable
+type ptraceOptions struct {
+ // ExitKill is true if the tracee should be sent SIGKILL when the tracer
+ // exits.
+ ExitKill bool
+
+ // If SysGood is true, set bit 7 in the signal number for
+ // syscall-entry-stop and syscall-exit-stop traps delivered to this task's
+ // tracer.
+ SysGood bool
+
+ // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE
+ // events.
+ TraceClone bool
+
+ // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC
+ // events.
+ TraceExec bool
+
+ // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT
+ // events.
+ TraceExit bool
+
+ // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK
+ // events.
+ TraceFork bool
+
+ // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP
+ // events.
+ TraceSeccomp bool
+
+ // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK
+ // events.
+ TraceVfork bool
+
+ // TraceVforkDone is true if the tracer wants to receive
+ // PTRACE_EVENT_VFORK_DONE events.
+ TraceVforkDone bool
+}
+
+// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry
+// and exit.
+type ptraceSyscallMode int
+
+const (
+ // ptraceSyscallNone indicates that the task has never ptrace-stopped, or
+ // that it was resumed from its last ptrace-stop by PTRACE_CONT or
+ // PTRACE_DETACH. The task's syscalls will not be intercepted.
+ ptraceSyscallNone ptraceSyscallMode = iota
+
+ // ptraceSyscallIntercept indicates that the task was resumed from its last
+ // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a
+ // syscall, a ptrace-stop will occur.
+ ptraceSyscallIntercept
+
+ // ptraceSyscallEmu indicates that the task was resumed from its last
+ // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time
+ // the task enters a syscall, the syscall will be skipped, and a
+ // ptrace-stop will occur.
+ ptraceSyscallEmu
+)
+
+// CanTrace checks that t is permitted to access target's state, as defined by
+// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it
+// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access
+// mode PTRACE_MODE_READ.
+func (t *Task) CanTrace(target *Task, attach bool) bool {
+ // "1. If the calling thread and the target thread are in the same thread
+ // group, access is always allowed." - ptrace(2)
+ //
+ // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access()
+ // should not deny sub-threads", first released in Linux 3.12), the rule
+ // only applies if t and target are the same task. But, as that commit
+ // message puts it, "[any] security check is pointless when the tasks share
+ // the same ->mm."
+ if t.tg == target.tg {
+ return true
+ }
+
+ // """
+ // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped,
+ // doesn't exist until Linux 4.5).
+ //
+ // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the
+ // caller's real UID and GID for the checks in the next step. (Most APIs
+ // that check the caller's UID and GID use the effective IDs. For
+ // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs
+ // instead.)
+ //
+ // 3. Deny access if neither of the following is true:
+ //
+ // - The real, effective, and saved-set user IDs of the target match the
+ // caller's user ID, *and* the real, effective, and saved-set group IDs of
+ // the target match the caller's group ID.
+ //
+ // - The caller has the CAP_SYS_PTRACE capability in the user namespace of
+ // the target.
+ //
+ // 4. Deny access if the target process "dumpable" attribute has a value
+ // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in
+ // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in
+ // the user namespace of the target process.
+ //
+ // 5. The kernel LSM security_ptrace_access_check() interface is invoked to
+ // see if ptrace access is permitted. The results depend on the LSM(s). The
+ // implementation of this interface in the commoncap LSM performs the
+ // following steps:
+ //
+ // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the
+ // caller's effective capability set; otherwise (the access mode specifies
+ // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set.
+ //
+ // b) Deny access if neither of the following is true:
+ //
+ // - The caller and the target process are in the same user namespace, and
+ // the caller's capabilities are a proper superset of the target process's
+ // permitted capabilities.
+ //
+ // - The caller has the CAP_SYS_PTRACE capability in the target process's
+ // user namespace.
+ //
+ // Note that the commoncap LSM does not distinguish between
+ // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this
+ // section: "the commoncap LSM ... is always invoked".)
+ // """
+ callerCreds := t.Credentials()
+ targetCreds := target.Credentials()
+ if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) {
+ return true
+ }
+ if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID {
+ return false
+ }
+ if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID {
+ return false
+ }
+ // TODO(b/31916171): dumpability check
+ if callerCreds.UserNamespace != targetCreds.UserNamespace {
+ return false
+ }
+ if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
+ return false
+ }
+ // TODO: Yama LSM
+ return true
+}
+
+// Tracer returns t's ptrace Tracer.
+func (t *Task) Tracer() *Task {
+ return t.ptraceTracer.Load().(*Task)
+}
+
+// hasTracer returns true if t has a ptrace tracer attached.
+func (t *Task) hasTracer() bool {
+ // This isn't just inlined into callers so that if Task.Tracer() turns out
+ // to be too expensive because of e.g. interface conversion, we can switch
+ // to having a separate atomic flag more easily.
+ return t.Tracer() != nil
+}
+
+// ptraceStop is a TaskStop placed on tasks in a ptrace-stop.
+//
+// +stateify savable
+type ptraceStop struct {
+ // If frozen is true, the stopped task's tracer is currently operating on
+ // it, so Task.Kill should not remove the stop.
+ frozen bool
+
+ // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so
+ // ptraceFreeze should fail.
+ listen bool
+}
+
+// Killable implements TaskStop.Killable.
+func (s *ptraceStop) Killable() bool {
+ return !s.frozen
+}
+
+// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been
+// killed, the stop is skipped, and beginPtraceStopLocked returns false.
+//
+// beginPtraceStopLocked does not signal t's tracer or wake it if it is
+// waiting.
+//
+// Preconditions: The TaskSet mutex must be locked. The caller must be running
+// on the task goroutine.
+func (t *Task) beginPtraceStopLocked() bool {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... =>
+ // kernel/sched/core.c:__schedule() => signal_pending_state() check, which
+ // is what prevents tasks from entering ptrace-stops after being killed.
+	// Note that if t was SIGKILLed and beginPtraceStopLocked is being called
+ // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before
+ // entering the exit path, so t.killedLocked() will no longer return true.
+ // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still
+ // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be
+ // changed in the future; SIGKILL is meant to always immediately kill tasks
+ // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2)
+ if t.killedLocked() {
+ return false
+ }
+ t.beginInternalStopLocked(&ptraceStop{})
+ return true
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) ptraceTrapLocked(code int32) {
+ // This is unconditional in ptrace_stop().
+ t.tg.signalHandlers.mu.Lock()
+ t.trapStopPending = false
+ t.tg.signalHandlers.mu.Unlock()
+ t.ptraceCode = code
+ t.ptraceSiginfo = &arch.SignalInfo{
+ Signo: int32(linux.SIGTRAP),
+ Code: code,
+ }
+ t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
+ t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ if t.beginPtraceStopLocked() {
+ tracer := t.Tracer()
+ tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP))
+ tracer.tg.eventQueue.Notify(EventTraceeStop)
+ }
+}
+
+// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the
+// ptraceStop, temporarily preventing it from being removed by a concurrent
+// Task.Kill, and returns true. Otherwise it returns false.
+//
+// Preconditions: The TaskSet mutex must be locked. The caller must be running
+// on the task goroutine of t's tracer.
+func (t *Task) ptraceFreeze() bool {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.stop == nil {
+ return false
+ }
+ s, ok := t.stop.(*ptraceStop)
+ if !ok {
+ return false
+ }
+ if s.listen {
+ return false
+ }
+ s.frozen = true
+ return true
+}
+
+// ptraceUnfreeze ends the effect of a previous successful call to
+// ptraceFreeze.
+//
+// Preconditions: t must be in a frozen ptraceStop.
+func (t *Task) ptraceUnfreeze() {
+ // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop,
+ // preventing its thread group from completing execve.
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.ptraceUnfreezeLocked()
+}
+
+// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be
+// locked.
+func (t *Task) ptraceUnfreezeLocked() {
+ // Do this even if the task has been killed to ensure a panic if t.stop is
+ // nil or not a ptraceStop.
+ t.stop.(*ptraceStop).frozen = false
+ if t.killedLocked() {
+ t.endInternalStopLocked()
+ }
+}
+
+// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL,
+// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on
+// mode and singlestep.
+//
+// Preconditions: t must be in a frozen ptrace stop.
+//
+// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace
+// stop.
+func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error {
+ if sig != 0 && !sig.IsValid() {
+ return syserror.EIO
+ }
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.ptraceCode = int32(sig)
+ t.ptraceSyscallMode = mode
+ t.ptraceSinglestep = singlestep
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.endInternalStopLocked()
+ return nil
+}
+
+func (t *Task) ptraceTraceme() error {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if t.hasTracer() {
+ return syserror.EPERM
+ }
+ if t.parent == nil {
+		// In Linux, only init cannot have a parent, and init is assumed never
+ // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user
+ // application that may invoke PTRACE_TRACEME; having no parent can
+ // also occur if all tasks in the parent thread group have exited, and
+ // failed to find a living thread group to reparent to. The former case
+ // is treated as if TGID 1 has an exited parent in an invisible
+ // ancestor PID namespace that is an owner of the root user namespace
+ // (and consequently has CAP_SYS_PTRACE), and the latter case is a
+ // special form of the exited parent case below. In either case,
+ // returning nil here is correct.
+ return nil
+ }
+ if !t.parent.CanTrace(t, true) {
+ return syserror.EPERM
+ }
+ if t.parent.exitState != TaskExitNone {
+ // Fail silently, as if we were successfully attached but then
+ // immediately detached. This is consistent with Linux.
+ return nil
+ }
+ t.ptraceTracer.Store(t.parent)
+ t.parent.ptraceTracees[t] = struct{}{}
+ return nil
+}
+
+// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and
+// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller.
+func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error {
+ if t.tg == target.tg {
+ return syserror.EPERM
+ }
+ if !t.CanTrace(target, true) {
+ return syserror.EPERM
+ }
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if target.hasTracer() {
+ return syserror.EPERM
+ }
+ // Attaching to zombies and dead tasks is not permitted; the exit
+ // notification logic relies on this. Linux allows attaching to PF_EXITING
+ // tasks, though.
+ if target.exitState >= TaskExitZombie {
+ return syserror.EPERM
+ }
+ if seize {
+ if err := target.ptraceSetOptionsLocked(opts); err != nil {
+ return syserror.EIO
+ }
+ }
+ target.ptraceTracer.Store(t)
+ t.ptraceTracees[target] = struct{}{}
+ target.ptraceSeized = seize
+ target.tg.signalHandlers.mu.Lock()
+ // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." -
+ // ptrace(2)
+ if !seize {
+ target.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGSTOP),
+ Code: arch.SignalInfoUser,
+ }, false /* group */)
+ }
+ // Undocumented Linux feature: If the tracee is already group-stopped (and
+ // consequently will not report the SIGSTOP just sent), force it to leave
+ // and re-enter the stop so that it will switch to a ptrace-stop.
+ if target.stop == (*groupStop)(nil) {
+ target.trapStopPending = true
+ target.endInternalStopLocked()
+ // TODO(jamieliu): Linux blocks ptrace_attach() until the task has
+ // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING.
+ }
+ target.tg.signalHandlers.mu.Unlock()
+ return nil
+}
+
+// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the
+// caller.
+//
+// Preconditions: target must be a tracee of t in a frozen ptrace stop.
+//
+// Postconditions: If ptraceDetach returns nil, target will no longer be in a
+// ptrace stop.
+func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error {
+ if sig != 0 && !sig.IsValid() {
+ return syserror.EIO
+ }
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ target.ptraceCode = int32(sig)
+ target.forgetTracerLocked()
+ delete(t.ptraceTracees, target)
+ return nil
+}
+
+// exitPtrace is called in the exit path to detach all of t's tracees.
+func (t *Task) exitPtrace() {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ for target := range t.ptraceTracees {
+ if target.ptraceOpts.ExitKill {
+ target.tg.signalHandlers.mu.Lock()
+ target.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGKILL),
+ }, false /* group */)
+ target.tg.signalHandlers.mu.Unlock()
+ }
+ // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it
+ // observes the ptraceCode it set before it entered the stop. I believe
+ // this is consistent with Linux.
+ target.forgetTracerLocked()
+ }
+	// Reset to an empty map rather than nil, since "nil maps cannot be saved"
+	// across save/restore.
+ t.ptraceTracees = make(map[*Task]struct{})
+}
+
+// forgetTracerLocked detaches t's tracer and ensures that t is no longer
+// ptrace-stopped.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) forgetTracerLocked() {
+ t.ptraceSeized = false
+ t.ptraceOpts = ptraceOptions{}
+ t.ptraceSyscallMode = ptraceSyscallNone
+ t.ptraceSinglestep = false
+ t.ptraceTracer.Store((*Task)(nil))
+ if t.exitTracerNotified && !t.exitTracerAcked {
+ t.exitTracerAcked = true
+ t.exitNotifyLocked(true)
+ }
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If
+ // it wasn't, it will be reset via t.groupStopPending after the following.
+ t.trapStopPending = false
+ // If t's thread group is in a group stop and t is eligible to participate,
+ // make it do so. This is essentially the reverse of the special case in
+ // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling
+ // of restart from group-stop is currently buggy, but the "as planned"
+ // behavior is to leave tracee stopped and waiting for SIGCONT." -
+ // ptrace(2))
+ if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated {
+ t.groupStopPending = true
+ // t already participated in the group stop when it unset
+ // groupStopPending.
+ t.groupStopAcknowledged = true
+ t.interrupt()
+ }
+ if _, ok := t.stop.(*ptraceStop); ok {
+ t.endInternalStopLocked()
+ }
+}
+
+// ptraceSignalLocked is called after signal dequeueing to check if t should
+// enter ptrace signal-delivery-stop.
+//
+// Preconditions: The signal mutex must be locked. The caller must be running
+// on the task goroutine.
+func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
+ if linux.Signal(info.Signo) == linux.SIGKILL {
+ return false
+ }
+ if !t.hasTracer() {
+ return false
+ }
+ // The tracer might change this signal into a stop signal, in which case
+ // any SIGCONT received after the signal was originally dequeued should
+ // cancel it. This is consistent with Linux.
+ t.tg.groupStopDequeued = true
+ // This is unconditional in ptrace_stop().
+ t.trapStopPending = false
+ // Can't lock the TaskSet mutex while holding a signal mutex.
+ t.tg.signalHandlers.mu.Unlock()
+ defer t.tg.signalHandlers.mu.Lock()
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ tracer := t.Tracer()
+ if tracer == nil {
+ return false
+ }
+ t.ptraceCode = info.Signo
+ t.ptraceSiginfo = info
+ t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo)
+ if t.beginPtraceStopLocked() {
+ tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo)
+ tracer.tg.eventQueue.Notify(EventTraceeStop)
+ }
+ return true
+}
+
+// ptraceSeccomp is called when a seccomp-bpf filter returns action
+// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data
+// is the lower 16 bits of the filter's return value.
+func (t *Task) ptraceSeccomp(data uint16) bool {
+ if !t.hasTracer() {
+ return false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.ptraceOpts.TraceSeccomp {
+ return false
+ }
+ t.Debugf("Entering PTRACE_EVENT_SECCOMP stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data))
+ return true
+}
+
+// ptraceSyscallEnter is called immediately before entering a syscall to check
+// if t should enter ptrace syscall-enter-stop.
+func (t *Task) ptraceSyscallEnter() (taskRunState, bool) {
+ if !t.hasTracer() {
+ return nil, false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ switch t.ptraceSyscallMode {
+ case ptraceSyscallNone:
+ return nil, false
+ case ptraceSyscallIntercept:
+ t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL")
+ t.ptraceSyscallStopLocked()
+ return (*runSyscallAfterSyscallEnterStop)(nil), true
+ case ptraceSyscallEmu:
+ t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU")
+ t.ptraceSyscallStopLocked()
+ return (*runSyscallAfterSysemuStop)(nil), true
+ }
+ panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode))
+}
+
+// ptraceSyscallExit is called immediately after leaving a syscall to check if
+// t should enter ptrace syscall-exit-stop.
+func (t *Task) ptraceSyscallExit() {
+ if !t.hasTracer() {
+ return
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if t.ptraceSyscallMode != ptraceSyscallIntercept {
+ return
+ }
+ t.Debugf("Entering syscall-exit-stop")
+ t.ptraceSyscallStopLocked()
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) ptraceSyscallStopLocked() {
+ code := int32(linux.SIGTRAP)
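+	// With PTRACE_O_TRACESYSGOOD, syscall-stops are reported with
+	// WSTOPSIG(status) == (SIGTRAP | 0x80) so the tracer can tell them apart
+	// from ordinary SIGTRAPs; see ptrace(2).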
+ if t.ptraceOpts.SysGood {
+ code |= 0x80
+ }
+ t.ptraceTrapLocked(code)
+}
+
+type ptraceCloneKind int32
+
+const (
+ // ptraceCloneKindClone represents a call to Task.Clone where
+ // TerminationSignal is not SIGCHLD and Vfork is false.
+ ptraceCloneKindClone ptraceCloneKind = iota
+
+ // ptraceCloneKindFork represents a call to Task.Clone where
+ // TerminationSignal is SIGCHLD and Vfork is false.
+ ptraceCloneKindFork
+
+ // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is
+ // true.
+ ptraceCloneKindVfork
+)
+
+// ptraceClone is called at the end of a clone or fork syscall to check if t
+// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK
+// stop. child is the new task.
+func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool {
+ if !t.hasTracer() {
+ return false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ event := false
+ if !opts.Untraced {
+ switch kind {
+ case ptraceCloneKindClone:
+ if t.ptraceOpts.TraceClone {
+ t.Debugf("Entering PTRACE_EVENT_CLONE stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child]))
+ event = true
+ }
+ case ptraceCloneKindFork:
+ if t.ptraceOpts.TraceFork {
+ t.Debugf("Entering PTRACE_EVENT_FORK stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child]))
+ event = true
+ }
+ case ptraceCloneKindVfork:
+ if t.ptraceOpts.TraceVfork {
+ t.Debugf("Entering PTRACE_EVENT_VFORK stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child]))
+ event = true
+ }
+ default:
+ panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind))
+ }
+ }
+ // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE
+ // options are in effect, then children created by, respectively, vfork(2)
+ // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit
+ // signal set to SIGCHLD, and other kinds of clone(2), are automatically
+ // attached to the same tracer which traced their parent. SIGSTOP is
+ // delivered to the children, causing them to enter signal-delivery-stop
+ // after they exit the system call which created them." - ptrace(2)
+ //
+ // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is
+ // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() =>
+ // include/linux/ptrace.h:ptrace_init_task().
+ if event || opts.InheritTracer {
+ tracer := t.Tracer()
+ if tracer != nil {
+ child.ptraceTracer.Store(tracer)
+ tracer.ptraceTracees[child] = struct{}{}
+ // "The "seized" behavior ... is inherited by children that are
+ // automatically attached using PTRACE_O_TRACEFORK,
+ // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2)
+ child.ptraceSeized = t.ptraceSeized
+ // "Flags are inherited by new tracees created and "auto-attached"
+ // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or
+ // PTRACE_O_TRACECLONE options." - ptrace(2)
+ child.ptraceOpts = t.ptraceOpts
+ child.tg.signalHandlers.mu.Lock()
+ // "PTRACE_SEIZE: ... Automatically attached children stop with
+ // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead
+ // of having SIGSTOP signal delivered to them." - ptrace(2)
+ if child.ptraceSeized {
+ child.trapStopPending = true
+ } else {
+ child.pendingSignals.enqueue(&arch.SignalInfo{
+ Signo: int32(linux.SIGSTOP),
+ }, nil)
+ }
+ // The child will self-interrupt() when its task goroutine starts
+ // running, so we don't have to.
+ child.tg.signalHandlers.mu.Unlock()
+ }
+ }
+ return event
+}
+
+// ptraceVforkDone is called after the end of a vfork stop to check if t should
+// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's
+// PID namespace.
+func (t *Task) ptraceVforkDone(child ThreadID) bool {
+ if !t.hasTracer() {
+ return false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.ptraceOpts.TraceVforkDone {
+ return false
+ }
+ t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child))
+ return true
+}
+
+// ptraceExec is called at the end of an execve syscall to check if t should
+// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID
+// namespace, prior to the execve. (If t did not have a tracer at the time
+// oldTID was read, oldTID may be 0. This is consistent with Linux.)
+func (t *Task) ptraceExec(oldTID ThreadID) {
+ if !t.hasTracer() {
+ return
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ // Recheck with the TaskSet mutex locked. Most ptrace points don't need to
+ // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC
+ // is special because both TraceExec and !TraceExec do something if a
+ // tracer is attached.
+ if !t.hasTracer() {
+ return
+ }
+ if t.ptraceOpts.TraceExec {
+ t.Debugf("Entering PTRACE_EVENT_EXEC stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID))
+ return
+ }
+ // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing
+ // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic]
+ // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after
+ // execve(2) returns. This is an ordinary signal (similar to one which can
+	// be generated by `kill -TRAP`), not a special kind of ptrace-stop.
+ // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0
+ // (SI_USER). This signal may be blocked by signal mask, and thus may be
+ // delivered (much) later." - ptrace(2)
+ if t.ptraceSeized {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGTRAP),
+ Code: arch.SignalInfoUser,
+ }, false /* group */)
+}
+
+// ptraceExit is called early in the task exit path to check if t should enter
+// PTRACE_EVENT_EXIT stop.
+func (t *Task) ptraceExit() {
+ if !t.hasTracer() {
+ return
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.ptraceOpts.TraceExit {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ status := t.exitStatus.Status()
+ t.tg.signalHandlers.mu.Unlock()
+ t.Debugf("Entering PTRACE_EVENT_EXIT stop")
+ t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status))
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) ptraceEventLocked(event int32, msg uint64) {
+ t.ptraceEventMsg = msg
+ // """
+ // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning
+ // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An
+ // additional bit is set in the higher byte of the status word: the value
+ // status>>8 will be
+ //
+ // (SIGTRAP | PTRACE_EVENT_foo << 8).
+ //
+ // ...
+ //
+ // """ - ptrace(2)
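+	// For example, a PTRACE_EVENT_EXEC stop sets ptraceCode to
+	// SIGTRAP | (PTRACE_EVENT_EXEC << 8) == 0x405.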
+ t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8))
+}
+
+// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller.
+func (t *Task) ptraceKill(target *Task) error {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if target.Tracer() != t {
+ return syserror.ESRCH
+ }
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ // "This operation is deprecated; do not use it! Instead, send a SIGKILL
+ // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is
+ // that it requires the tracee to be in signal-delivery-stop, otherwise it
+ // may not work (i.e., may complete successfully but won't kill the
+ // tracee)." - ptrace(2)
+ if target.stop == nil {
+ return nil
+ }
+ if _, ok := target.stop.(*ptraceStop); !ok {
+ return nil
+ }
+ target.ptraceCode = int32(linux.SIGKILL)
+ target.endInternalStopLocked()
+ return nil
+}
+
+func (t *Task) ptraceInterrupt(target *Task) error {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if target.Tracer() != t {
+ return syserror.ESRCH
+ }
+ if !target.ptraceSeized {
+ return syserror.EIO
+ }
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ if target.killedLocked() || target.exitState >= TaskExitInitiated {
+ return nil
+ }
+ target.trapStopPending = true
+ if s, ok := target.stop.(*ptraceStop); ok && s.listen {
+ target.endInternalStopLocked()
+ }
+ target.interrupt()
+ return nil
+}
+
+// Preconditions: The TaskSet mutex must be locked for writing. t must have a
+// tracer.
+func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
+ const valid = uintptr(linux.PTRACE_O_EXITKILL |
+ linux.PTRACE_O_TRACESYSGOOD |
+ linux.PTRACE_O_TRACECLONE |
+ linux.PTRACE_O_TRACEEXEC |
+ linux.PTRACE_O_TRACEEXIT |
+ linux.PTRACE_O_TRACEFORK |
+ linux.PTRACE_O_TRACESECCOMP |
+ linux.PTRACE_O_TRACEVFORK |
+ linux.PTRACE_O_TRACEVFORKDONE)
+ if opts&^valid != 0 {
+ return syserror.EINVAL
+ }
+ t.ptraceOpts = ptraceOptions{
+ ExitKill: opts&linux.PTRACE_O_EXITKILL != 0,
+ SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0,
+ TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0,
+ TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0,
+ TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0,
+ TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0,
+ TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0,
+ TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0,
+ TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0,
+ }
+ return nil
+}
+
+// Ptrace implements the ptrace system call.
+func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
+ // PTRACE_TRACEME ignores all other arguments.
+ if req == linux.PTRACE_TRACEME {
+ return t.ptraceTraceme()
+ }
+ // All other ptrace requests operate on a current or future tracee
+ // specified by pid.
+ target := t.tg.pidns.TaskWithID(pid)
+ if target == nil {
+ return syserror.ESRCH
+ }
+
+ // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already
+ // a tracee.
+ if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE {
+ seize := req == linux.PTRACE_SEIZE
+ if seize && addr != 0 {
+ return syserror.EIO
+ }
+ return t.ptraceAttach(target, seize, uintptr(data))
+ }
+ // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee,
+ // but does not require that it is ptrace-stopped.
+ if req == linux.PTRACE_KILL {
+ return t.ptraceKill(target)
+ }
+ if req == linux.PTRACE_INTERRUPT {
+ return t.ptraceInterrupt(target)
+ }
+ // All other ptrace requests require that the target is a ptrace-stopped
+ // tracee, and freeze the ptrace-stop so the tracee can be operated on.
+ t.tg.pidns.owner.mu.RLock()
+ if target.Tracer() != t {
+ t.tg.pidns.owner.mu.RUnlock()
+ return syserror.ESRCH
+ }
+ if !target.ptraceFreeze() {
+ t.tg.pidns.owner.mu.RUnlock()
+ // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE,
+ // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the
+ // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." -
+ // ptrace(2)
+ return syserror.ESRCH
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+ // Even if the target has a ptrace-stop active, the tracee's task goroutine
+ // may not yet have reached Task.doStop; wait for it to do so. This is safe
+ // because there's no way for target to initiate a ptrace-stop and then
+ // block (by calling Task.block) before entering it.
+ //
+ // Caveat: If tasks were just restored, the tracee's first call to
+ // Task.Activate (in Task.run) occurs before its first call to Task.doStop,
+ // which may block if the tracer's address space is active.
+ t.UninterruptibleSleepStart(true)
+ target.waitGoroutineStoppedOrExited()
+ t.UninterruptibleSleepFinish(true)
+
+ // Resuming commands end the ptrace stop, but only if successful.
+	// PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set
+	// on the target.
+ switch req {
+ case linux.PTRACE_DETACH:
+ if err := t.ptraceDetach(target, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+
+ case linux.PTRACE_CONT:
+ if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+
+ case linux.PTRACE_SYSCALL:
+ if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+
+ case linux.PTRACE_SINGLESTEP:
+ if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+
+ case linux.PTRACE_SYSEMU:
+ if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+
+ case linux.PTRACE_SYSEMU_SINGLESTEP:
+ if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+
+ case linux.PTRACE_LISTEN:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
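+		// PTRACE_LISTEN requires a seized tracee whose current stop was
+		// reported as PTRACE_EVENT_STOP (a group-stop or PTRACE_INTERRUPT
+		// stop); anything else fails with EIO, as in Linux.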
+ if !target.ptraceSeized {
+ return syserror.EIO
+ }
+ if target.ptraceSiginfo == nil {
+ return syserror.EIO
+ }
+ if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP {
+ return syserror.EIO
+ }
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ if target.trapNotifyPending {
+ target.endInternalStopLocked()
+ } else {
+ target.stop.(*ptraceStop).listen = true
+ target.ptraceUnfreezeLocked()
+ }
+ return nil
+ }
+
+ // All other ptrace requests expect us to unfreeze the stop.
+ defer target.ptraceUnfreeze()
+
+ switch req {
+ case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA:
+ // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and
+ // PTRACE_PEEKUSER requests have a different API: they store the result
+ // at the address specified by the data parameter, and the return value
+ // is the error flag." - ptrace(2)
+ word := t.Arch().Native(0)
+ if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
+ IgnorePermissions: true,
+ }); err != nil {
+ return err
+ }
+ _, err := t.CopyOut(data, word)
+ return err
+
+ case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
+ _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ return err
+
+ case linux.PTRACE_GETREGSET:
+ // "Read the tracee's registers. addr specifies, in an
+ // architecture-dependent way, the type of registers to be read. ...
+ // data points to a struct iovec, which describes the destination
+ // buffer's location and length. On return, the kernel modifies iov.len
+ // to indicate the actual number of bytes returned." - ptrace(2)
+ ars, err := t.CopyInIovecs(data, 1)
+ if err != nil {
+ return err
+ }
+ ar := ars.Head()
+ n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: ar.Start,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }, int(ar.Length()))
+ if err != nil {
+ return err
+ }
+
+ // Update iovecs to represent the range of the written register set.
+ end, ok := ar.Start.AddLength(uint64(n))
+ if !ok {
+ panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length()))
+ }
+ ar.End = end
+ return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
+
+ case linux.PTRACE_SETREGSET:
+ ars, err := t.CopyInIovecs(data, 1)
+ if err != nil {
+ return err
+ }
+ ar := ars.Head()
+ n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: ar.Start,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }, int(ar.Length()))
+ if err != nil {
+ return err
+ }
+ ar.End -= usermem.Addr(n)
+ return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
+
+ case linux.PTRACE_GETSIGINFO:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if target.ptraceSiginfo == nil {
+ return syserror.EINVAL
+ }
+ _, err := t.CopyOut(data, target.ptraceSiginfo)
+ return err
+
+ case linux.PTRACE_SETSIGINFO:
+ var info arch.SignalInfo
+ if _, err := t.CopyIn(data, &info); err != nil {
+ return err
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if target.ptraceSiginfo == nil {
+ return syserror.EINVAL
+ }
+ target.ptraceSiginfo = &info
+ return nil
+
+ case linux.PTRACE_GETSIGMASK:
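+		// addr holds the caller-supplied size of the signal set, which must
+		// be sizeof(sigset_t); the same check applies to PTRACE_SETSIGMASK
+		// below. See ptrace(2).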
+ if addr != linux.SignalSetSize {
+ return syserror.EINVAL
+ }
+ _, err := t.CopyOut(data, target.SignalMask())
+ return err
+
+ case linux.PTRACE_SETSIGMASK:
+ if addr != linux.SignalSetSize {
+ return syserror.EINVAL
+ }
+ var mask linux.SignalSet
+ if _, err := t.CopyIn(data, &mask); err != nil {
+ return err
+ }
+ // The target's task goroutine is stopped, so this is safe:
+ target.SetSignalMask(mask &^ UnblockableSignals)
+ return nil
+
+ case linux.PTRACE_SETOPTIONS:
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ return target.ptraceSetOptionsLocked(uintptr(data))
+
+ case linux.PTRACE_GETEVENTMSG:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+ return err
+
+ // PEEKSIGINFO is unimplemented but seems to have no users anywhere.
+
+ default:
+ return t.ptraceArch(target, req, addr, data)
+ }
+}
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
new file mode 100644
index 000000000..048eeaa3f
--- /dev/null
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -0,0 +1,89 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// ptraceArch implements arch-specific ptrace commands.
+func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error {
+ switch req {
+ case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER
+ n, err := target.Arch().PtracePeekUser(uintptr(addr))
+ if err != nil {
+ return err
+ }
+ _, err = t.CopyOut(data, n)
+ return err
+
+ case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
+ return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data))
+
+ case linux.PTRACE_GETREGS:
+ // "Copy the tracee's general-purpose ... registers ... to the address
+ // data in the tracer. ... (addr is ignored.) Note that SPARC systems
+ // have the meaning of data and addr reversed ..."
+ _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case linux.PTRACE_GETFPREGS:
+ _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case linux.PTRACE_SETREGS:
+ _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case linux.PTRACE_SETFPREGS:
+ _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ default:
+ return syserror.EIO
+ }
+}
diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go
new file mode 100644
index 000000000..4899c813f
--- /dev/null
+++ b/pkg/sentry/kernel/ptrace_arm64.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// ptraceArch implements arch-specific ptrace commands.
+func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error {
+ return syserror.EIO
+}
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
new file mode 100644
index 000000000..c4fb2c56c
--- /dev/null
+++ b/pkg/sentry/kernel/rseq.go
@@ -0,0 +1,120 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Restartable sequences, as described in https://lwn.net/Articles/650333/.
+
+// RSEQCriticalRegion describes a restartable sequence critical region.
+//
+// +stateify savable
+type RSEQCriticalRegion struct {
+ // When a task in this thread group has its CPU preempted (as defined by
+ // platform.ErrContextCPUPreempted) or has a signal delivered to an
+ // application handler while its instruction pointer is in CriticalSection,
+ // set the instruction pointer to Restart and application register r10 (on
+ // amd64) to the former instruction pointer.
+ CriticalSection usermem.AddrRange
+ Restart usermem.Addr
+}
+
+// RSEQAvailable returns true if t supports restartable sequences.
+func (t *Task) RSEQAvailable() bool {
+ return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
+}
+
+// RSEQCriticalRegion returns a copy of t's thread group's current restartable
+// sequence.
+func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
+ return *t.tg.rscr.Load().(*RSEQCriticalRegion)
+}
+
+// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
+//
+// Preconditions: t.RSEQAvailable() == true.
+func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
+ // These checks are somewhat more lenient than in Linux, which (bizarrely)
+ // requires rscr.CriticalSection to be non-empty and rscr.Restart to be
+ // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
+ // (which disables the critical region).
+ if rscr.CriticalSection.Start == 0 {
+ rscr.CriticalSection.End = 0
+ rscr.Restart = 0
+ t.tg.rscr.Store(&rscr)
+ return nil
+ }
+ if rscr.CriticalSection.Start >= rscr.CriticalSection.End {
+ return syserror.EINVAL
+ }
+ if rscr.CriticalSection.Contains(rscr.Restart) {
+ return syserror.EINVAL
+ }
+ // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in
+ // the application address range, for consistency with Linux
+ t.tg.rscr.Store(&rscr)
+ return nil
+}
+
+// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
+// number.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) RSEQCPUAddr() usermem.Addr {
+ return t.rseqCPUAddr
+}
+
+// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
+// number.
+//
+// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
+// task goroutine. t's AddressSpace must be active.
+func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
+ t.rseqCPUAddr = addr
+ if addr != 0 {
+ t.rseqCPU = int32(hostcpu.GetCPU())
+ if err := t.rseqCopyOutCPU(); err != nil {
+ t.rseqCPUAddr = 0
+ t.rseqCPU = -1
+ return syserror.EINVAL // yes, EINVAL, not err or EFAULT
+ }
+ } else {
+ t.rseqCPU = -1
+ }
+ return nil
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqCopyOutCPU() error {
+ buf := t.CopyScratchBuffer(4)
+ usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
+ _, err := t.CopyOutBytes(t.rseqCPUAddr, buf)
+ return err
+}
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) rseqInterrupt() {
+ rscr := t.tg.rscr.Load().(*RSEQCriticalRegion)
+ if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) {
+ t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart)
+ t.Arch().SetIP(uintptr(rscr.Restart))
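+		// Make the interrupted IP available to the application (register r10
+		// on amd64), as described by RSEQCriticalRegion.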
+ t.Arch().SetRSEQInterruptedIP(ip)
+ }
+}
diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go
new file mode 100644
index 000000000..c6c436690
--- /dev/null
+++ b/pkg/sentry/kernel/sched/cpuset.go
@@ -0,0 +1,105 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sched
+
+import "math/bits"
+
+const (
+ bitsPerByte = 8
+ bytesPerLong = 8 // only for 64-bit architectures
+)
+
+// CPUSet contains a bitmap to record CPU information.
+//
+// Note that this definition is only correct for little-endian architectures,
+// since Linux's cpumask_t uses unsigned long.
+type CPUSet []byte
+
+// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus.
+func CPUSetSize(num uint) uint {
+ // NOTE(b/68859821): Applications may expect that the size of a CPUSet in
+ // bytes is always a multiple of sizeof(unsigned long), since this is true
+ // in Linux. Thus we always round up.
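+	// For example, CPUSetSize(9) == 8 and CPUSetSize(65) == 16.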
+ bytes := (num + bitsPerByte - 1) / bitsPerByte
+ longs := (bytes + bytesPerLong - 1) / bytesPerLong
+ return longs * bytesPerLong
+}
+
+// NewCPUSet returns a CPUSet for the given number of CPUs which initially
+// contains no CPUs.
+func NewCPUSet(num uint) CPUSet {
+ return CPUSet(make([]byte, CPUSetSize(num)))
+}
+
+// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which
+// are present in the set.
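+// For example, NewFullCPUSet(10) returns
+// {0xff, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}.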
+func NewFullCPUSet(num uint) CPUSet {
+ c := NewCPUSet(num)
+ var i uint
+ for ; i < num/bitsPerByte; i++ {
+ c[i] = 0xff
+ }
+ if rem := num % bitsPerByte; rem != 0 {
+ c[i] = (1 << rem) - 1
+ }
+ return c
+}
+
+// Size returns the size of 'c' in bytes.
+func (c CPUSet) Size() uint {
+ return uint(len(c))
+}
+
+// NumCPUs returns how many cpus are set in the CPUSet.
+func (c CPUSet) NumCPUs() uint {
+ var n int
+ for _, b := range c {
+ n += bits.OnesCount8(b)
+ }
+ return uint(n)
+}
+
+// Copy returns a copy of the CPUSet.
+func (c CPUSet) Copy() CPUSet {
+ return append(CPUSet(nil), c...)
+}
+
+// Set sets the bit corresponding to cpu.
+func (c *CPUSet) Set(cpu uint) {
+ (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte)
+}
+
+// ClearAbove clears bits corresponding to cpu and all higher cpus.
+func (c *CPUSet) ClearAbove(cpu uint) {
+ i := cpu / bitsPerByte
+ if i >= c.Size() {
+ return
+ }
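+	// For example, ClearAbove(3) clears bits 3-7 of byte 0 and zeroes every
+	// subsequent byte, leaving at most CPUs 0-2 set.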
+ (*c)[i] &^= 0xff << (cpu % bitsPerByte)
+ for i++; i < c.Size(); i++ {
+ (*c)[i] = 0
+ }
+}
+
+// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if
+// it's set.
+func (c CPUSet) ForEachCPU(fn func(uint)) {
+ for i := uint(0); i < c.Size()*bitsPerByte; i++ {
+ bit := uint(1) << (i & (bitsPerByte - 1))
+ if uint(c[i/bitsPerByte])&bit == bit {
+ fn(i)
+ }
+ }
+}
diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go
new file mode 100644
index 000000000..de18c9d02
--- /dev/null
+++ b/pkg/sentry/kernel/sched/sched.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sched implements scheduler related features.
+package sched
diff --git a/pkg/sentry/kernel/sched/sched_state_autogen.go b/pkg/sentry/kernel/sched/sched_state_autogen.go
new file mode 100755
index 000000000..2a482732e
--- /dev/null
+++ b/pkg/sentry/kernel/sched/sched_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package sched
+
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
new file mode 100644
index 000000000..cc75eb08a
--- /dev/null
+++ b/pkg/sentry/kernel/seccomp.go
@@ -0,0 +1,217 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const maxSyscallFilterInstructions = 1 << 15
+
+// seccompData is equivalent to struct seccomp_data, which contains the data
+// passed to seccomp-bpf filters.
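+//
+// The field order and sizes must match Linux's struct seccomp_data exactly,
+// since filter programs address the data by byte offset.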
+type seccompData struct {
+ // nr is the system call number.
+ nr int32
+
+ // arch is an AUDIT_ARCH_* value indicating the system call convention.
+ arch uint32
+
+ // instructionPointer is the value of the instruction pointer at the time
+ // of the system call.
+ instructionPointer uint64
+
+ // args contains the first 6 system call arguments.
+ args [6]uint64
+}
+
+func (d *seccompData) asBPFInput() bpf.Input {
+ return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
+}
+
+func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
+ si := &arch.SignalInfo{
+ Signo: int32(linux.SIGSYS),
+ Errno: errno,
+ Code: arch.SYS_SECCOMP,
+ }
+ si.SetCallAddr(uint64(ip))
+ si.SetSyscall(sysno)
+ si.SetArch(t.SyscallTable().AuditNumber)
+ return si
+}
+
+// checkSeccompSyscall applies the task's seccomp filters before the execution
+// of syscall sysno at instruction pointer ip. (These parameters must be passed
+// in because vsyscalls do not use the values in t.Arch().)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction {
+ result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip))
+ action := result & linux.SECCOMP_RET_ACTION
+ switch action {
+ case linux.SECCOMP_RET_TRAP:
+ // "Results in the kernel sending a SIGSYS signal to the triggering
+ // task without executing the system call. ... The SECCOMP_RET_DATA
+ // portion of the return value will be passed as si_errno." -
+ // Documentation/prctl/seccomp_filter.txt
+ t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip))
+ // "The return value register will contain an arch-dependent value." In
+ // practice, it's ~always the syscall number.
+ t.Arch().SetReturn(uintptr(sysno))
+
+ case linux.SECCOMP_RET_ERRNO:
+ // "Results in the lower 16-bits of the return value being passed to
+ // userland as the errno without executing the system call."
+ t.Arch().SetReturn(-uintptr(result.Data()))
+
+ case linux.SECCOMP_RET_TRACE:
+ // "When returned, this value will cause the kernel to attempt to
+ // notify a ptrace()-based tracer prior to executing the system call.
+ // If there is no tracer present, -ENOSYS is returned to userland and
+ // the system call is not executed."
+ if !t.ptraceSeccomp(result.Data()) {
+			// This useless-looking temporary is needed because Go rejects
+			// negating the constant expression directly: it would overflow
+			// uintptr at compile time.
+ tmp := uintptr(syscall.ENOSYS)
+ t.Arch().SetReturn(-tmp)
+ return linux.SECCOMP_RET_ERRNO
+ }
+
+ case linux.SECCOMP_RET_ALLOW:
+ // "Results in the system call being executed."
+
+ case linux.SECCOMP_RET_KILL_THREAD:
+ // "Results in the task exiting immediately without executing the
+ // system call. The exit status of the task will be SIGSYS, not
+ // SIGKILL."
+
+ default:
+ // consistent with Linux
+ return linux.SECCOMP_RET_KILL_THREAD
+ }
+ return action
+}
+
+func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
+ data := seccompData{
+ nr: sysno,
+ arch: t.tc.st.AuditNumber,
+ instructionPointer: uint64(ip),
+ }
+ // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
+ // we can't do any slicing tricks or even use copy/append here.
+ for i, arg := range args {
+ if i >= len(data.args) {
+ break
+ }
+ data.args[i] = arg.Uint64()
+ }
+ input := data.asBPFInput()
+
+ ret := uint32(linux.SECCOMP_RET_ALLOW)
+ f := t.syscallFilters.Load()
+ if f == nil {
+ return ret
+ }
+
+ // "Every filter successfully installed will be evaluated (in reverse
+ // order) for each system call the task makes." - kernel/seccomp.c
+ for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- {
+ thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input)
+ if err != nil {
+ t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
+ thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD)
+ }
+ // "If multiple filters exist, the return value for the evaluation of a
+ // given system call will always use the highest precedent value." -
+ // Documentation/prctl/seccomp_filter.txt
+ //
+ // (Note that this contradicts prctl(2): "If the filters permit prctl()
+ // calls, then additional filters can be added; they are run in order
+ // until the first non-allow result is seen." prctl(2) is incorrect.)
+ //
+ // "The ordering ensures that a min_t() over composed return values
+ // always selects the least permissive choice." -
+ // include/uapi/linux/seccomp.h
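+		// For example, SECCOMP_RET_ERRNO takes precedence over
+		// SECCOMP_RET_ALLOW, and SECCOMP_RET_KILL_THREAD takes precedence
+		// over both.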
+ if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
+ ret = thisRet
+ }
+ }
+
+ return ret
+}
+
+// AppendSyscallFilter adds BPF program p as a system call filter.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error {
+	// Although syscallFilters is an atomic.Value, we must take the mutex to
+	// prevent our read-copy-update from racing with another task syncing
+	// syscall filters to us; this keeps the filters in a consistent state.
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+
+ // Cap the combined length of all syscall filters (plus a penalty of 4
+ // instructions per filter beyond the first) to maxSyscallFilterInstructions.
+ // This restriction is inherited from Linux.
+ totalLength := p.Length()
+ var newFilters []bpf.Program
+
+ if sf := t.syscallFilters.Load(); sf != nil {
+ oldFilters := sf.([]bpf.Program)
+ for _, f := range oldFilters {
+ totalLength += f.Length() + 4
+ }
+ newFilters = append(newFilters, oldFilters...)
+ }
+
+ if totalLength > maxSyscallFilterInstructions {
+ return syserror.ENOMEM
+ }
+
+ newFilters = append(newFilters, p)
+ t.syscallFilters.Store(newFilters)
+
+ if syncAll {
+ // Note: No new privs is always assumed to be set.
+ for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() {
+ if ot != t {
+ var copiedFilters []bpf.Program
+ copiedFilters = append(copiedFilters, newFilters...)
+ ot.syscallFilters.Store(copiedFilters)
+ }
+ }
+ }
+
+ return nil
+}
+
+// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current
+// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
+// and /proc/[pid]/status.
+func (t *Task) SeccompMode() int {
+ f := t.syscallFilters.Load()
+ if f != nil && len(f.([]bpf.Program)) > 0 {
+ return linux.SECCOMP_MODE_FILTER
+ }
+ return linux.SECCOMP_MODE_NONE
+}
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
new file mode 100644
index 000000000..9d0620e02
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -0,0 +1,571 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package semaphore implements System V semaphores.
+package semaphore
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+ valueMax = 32767 // SEMVMX
+
+ // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
+ semaphoresMax = 32000
+
+	// setsMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
+ setsMax = 32000
+
+ // semaphoresTotalMax is "system-wide limit on the number of semaphores"
+ // (SEMMNS = SEMMNI*SEMMSL).
+ semaphoresTotalMax = 1024000000
+)
+
+// Registry maintains a set of semaphores that can be found by key or ID.
+//
+// +stateify savable
+type Registry struct {
+ // userNS owning the ipc name this registry belongs to. Immutable.
+ userNS *auth.UserNamespace
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+ semaphores map[int32]*Set
+ lastIDUsed int32
+}
+
+// Set represents a set of semaphores that can be operated atomically.
+//
+// +stateify savable
+type Set struct {
+ // registry owning this sem set. Immutable.
+ registry *Registry
+
+	// ID is a handle that identifies the set.
+ ID int32
+
+	// key is a user-provided key that can be shared between processes.
+ key int32
+
+ // creator is the user that created the set. Immutable.
+ creator fs.FileOwner
+
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+ owner fs.FileOwner
+ perms fs.FilePermissions
+ opTime ktime.Time
+ changeTime ktime.Time
+
+ // sems holds all semaphores in the set. The slice itself is immutable after
+ // it's been set, however each 'sem' object in the slice requires 'mu' lock.
+ sems []sem
+
+ // dead is set to true when the set is removed and can't be reached anymore.
+ // All waiters must wake up and fail when set is dead.
+ dead bool
+}
+
+// sem represents a single semaphore from a set.
+//
+// +stateify savable
+type sem struct {
+ value int16
+ waiters waiterList `state:"zerovalue"`
+ pid int32
+}
+
+// waiter represents a caller that is waiting for the semaphore value to
+// become positive or zero.
+//
+// +stateify savable
+type waiter struct {
+ waiterEntry
+
+ // value represents how much resource the waiter needs to wake up.
+ value int16
+ ch chan struct{}
+}
+
+// NewRegistry creates a new semaphore set registry.
+func NewRegistry(userNS *auth.UserNamespace) *Registry {
+ return &Registry{
+ userNS: userNS,
+ semaphores: make(map[int32]*Set),
+ }
+}
+
+// FindOrCreate searches for a semaphore set that matches 'key'. If not found,
+// it may create a new one if requested. If private is true, key is ignored and
+// a new set is always created. If create is false, it fails if a set cannot
+// be found. If exclusive is true, it fails if a set with the same key already
+// exists.
+func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
+ if nsems < 0 || nsems > semaphoresMax {
+ return nil, syserror.EINVAL
+ }
+
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if !private {
+ // Look up an existing semaphore.
+ if set := r.findByKey(key); set != nil {
+ set.mu.Lock()
+ defer set.mu.Unlock()
+
+ // Check that caller can access semaphore set.
+ creds := auth.CredentialsFromContext(ctx)
+ if !set.checkPerms(creds, fs.PermsFromMode(mode)) {
+ return nil, syserror.EACCES
+ }
+
+ // Validate parameters.
+ if nsems > int32(set.Size()) {
+ return nil, syserror.EINVAL
+ }
+ if create && exclusive {
+ return nil, syserror.EEXIST
+ }
+ return set, nil
+ }
+
+ if !create {
+ // Semaphore not found and should not be created.
+ return nil, syserror.ENOENT
+ }
+ }
+
+ // Zero is only valid if an existing set is found.
+ if nsems == 0 {
+ return nil, syserror.EINVAL
+ }
+
+ // Apply system limits.
+ if len(r.semaphores) >= setsMax {
+ return nil, syserror.EINVAL
+ }
+ if r.totalSems() > int(semaphoresTotalMax-nsems) {
+ return nil, syserror.EINVAL
+ }
+
+ // Finally create a new set.
+ owner := fs.FileOwnerFromContext(ctx)
+ perms := fs.FilePermsFromMode(mode)
+ return r.newSet(ctx, key, owner, owner, perms, nsems)
+}
+
+// RemoveID removes the set with the given 'id' from the registry and marks
+// the set as dead. All waiters will be awakened and fail.
+func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ set := r.semaphores[id]
+ if set == nil {
+ return syserror.EINVAL
+ }
+
+ set.mu.Lock()
+ defer set.mu.Unlock()
+
+ // "The effective user ID of the calling process must match the creator or
+ // owner of the semaphore set, or the caller must be privileged."
+ if !set.checkCredentials(creds) && !set.checkCapability(creds) {
+ return syserror.EACCES
+ }
+
+ delete(r.semaphores, set.ID)
+ set.destroy()
+ return nil
+}
+
+func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
+ set := &Set{
+ registry: r,
+ key: key,
+ owner: owner,
+ creator: owner,
+ perms: perms,
+ changeTime: ktime.NowFromContext(ctx),
+ sems: make([]sem, nsems),
+ }
+
+ // Find the next available ID.
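+	// The scan starts at lastIDUsed+1, wraps around at overflow, and gives up
+	// only after coming all the way back to lastIDUsed, i.e. when every
+	// non-negative ID is taken.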
+ for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
+ // Handle wrap around.
+ if id < 0 {
+ id = 0
+ continue
+ }
+ if r.semaphores[id] == nil {
+ r.lastIDUsed = id
+ r.semaphores[id] = set
+ set.ID = id
+ return set, nil
+ }
+ }
+
+	log.Warningf("Semaphore set registry is full; sets are likely leaking")
+ return nil, syserror.ENOMEM
+}
+
+// FindByID looks up a set given an ID.
+func (r *Registry) FindByID(id int32) *Set {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ return r.semaphores[id]
+}
+
+func (r *Registry) findByKey(key int32) *Set {
+ for _, v := range r.semaphores {
+ if v.key == key {
+ return v
+ }
+ }
+ return nil
+}
+
+func (r *Registry) totalSems() int {
+ totalSems := 0
+ for _, v := range r.semaphores {
+ totalSems += v.Size()
+ }
+ return totalSems
+}
+
+func (s *Set) findSem(num int32) *sem {
+ if num < 0 || int(num) >= s.Size() {
+ return nil
+ }
+ return &s.sems[num]
+}
+
+// Size returns the number of semaphores in the set. Size is immutable.
+func (s *Set) Size() int {
+ return len(s.sems)
+}
+
+// Change changes some fields from the set atomically.
+func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The effective UID of the calling process must match the owner or creator
+ // of the semaphore set, or the caller must be privileged."
+ if !s.checkCredentials(creds) && !s.checkCapability(creds) {
+ return syserror.EACCES
+ }
+
+ s.owner = owner
+ s.perms = perms
+ s.changeTime = ktime.NowFromContext(ctx)
+ return nil
+}
+
+// SetVal overrides a semaphore value, waking up waiters as needed.
+func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
+ if val < 0 || val > valueMax {
+ return syserror.ERANGE
+ }
+
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have alter permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Write: true}) {
+ return syserror.EACCES
+ }
+
+ sem := s.findSem(num)
+ if sem == nil {
+ return syserror.ERANGE
+ }
+
+ // TODO(b/29354920): Clear undo entries in all processes
+ sem.value = val
+ sem.pid = pid
+ s.changeTime = ktime.NowFromContext(ctx)
+ sem.wakeWaiters()
+ return nil
+}
+
+// SetValAll overrides all semaphore values, waking up waiters as needed. It
+// also sets each semaphore's PID, matching a Linux behavior change made in
+// Linux 4.6.
+//
+// 'len(vals)' must be equal to 's.Size()'.
+func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error {
+ if len(vals) != s.Size() {
+		panic(fmt.Sprintf("vals length (%d) different than Set.Size() (%d)", len(vals), s.Size()))
+ }
+
+	for _, val := range vals {
+		// vals are uint16 and cannot be negative, so only the upper bound
+		// needs to be checked.
+		if val > valueMax {
+			return syserror.ERANGE
+		}
+	}
+
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have alter permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Write: true}) {
+ return syserror.EACCES
+ }
+
+ for i, val := range vals {
+ sem := &s.sems[i]
+
+ // TODO(b/29354920): Clear undo entries in all processes
+ sem.value = int16(val)
+ sem.pid = pid
+ sem.wakeWaiters()
+ }
+ s.changeTime = ktime.NowFromContext(ctx)
+ return nil
+}
+
+// GetVal returns a semaphore value.
+func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have read permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return 0, syserror.EACCES
+ }
+
+ sem := s.findSem(num)
+ if sem == nil {
+ return 0, syserror.ERANGE
+ }
+ return sem.value, nil
+}
+
+// GetValAll returns value for all semaphores.
+func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have read permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return nil, syserror.EACCES
+ }
+
+ vals := make([]uint16, s.Size())
+ for i, sem := range s.sems {
+ vals[i] = uint16(sem.value)
+ }
+ return vals, nil
+}
+
+// GetPID returns the PID of the last process that performed an operation on
+// the semaphore.
+func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have read permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return 0, syserror.EACCES
+ }
+
+ sem := s.findSem(num)
+ if sem == nil {
+ return 0, syserror.ERANGE
+ }
+ return sem.pid, nil
+}
+
+// ExecuteOps attempts to execute a list of operations on the set. It only
+// succeeds when all operations can be applied. No changes are made if it
+// fails.
+//
+// On failure, it may return an error (in which case retrying is pointless) or
+// a channel that can be waited on before attempting again.
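+//
+// A caller might loop as follows (hypothetical sketch; t.Block is assumed to
+// block until ch is readable or the waiter is interrupted):
+//
+//	for {
+//		ch, num, err := set.ExecuteOps(ctx, ops, creds, pid)
+//		if ch == nil || err != nil {
+//			return err // applied successfully, or a permanent failure
+//		}
+//		if err := t.Block(ch); err != nil {
+//			set.AbortWait(num, ch)
+//			return err
+//		}
+//	}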
+func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // Did it race with a removal operation?
+ if s.dead {
+ return nil, 0, syserror.EIDRM
+ }
+
+ // Validate the operations.
+ readOnly := true
+ for _, op := range ops {
+ if s.findSem(int32(op.SemNum)) == nil {
+ return nil, 0, syserror.EFBIG
+ }
+ if op.SemOp != 0 {
+ readOnly = false
+ }
+ }
+
+ if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) {
+ return nil, 0, syserror.EACCES
+ }
+
+ ch, num, err := s.executeOps(ctx, ops, pid)
+ if err != nil {
+ return nil, 0, err
+ }
+ return ch, num, nil
+}
+
+func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) {
+ // Changes to semaphores go to this slice temporarily until they all succeed.
+ tmpVals := make([]int16, len(s.sems))
+ for i := range s.sems {
+ tmpVals[i] = s.sems[i].value
+ }
+
+ for _, op := range ops {
+ sem := &s.sems[op.SemNum]
+ if op.SemOp == 0 {
+ // Handle 'wait for zero' operation.
+ if tmpVals[op.SemNum] != 0 {
+ // Semaphore isn't 0, must wait.
+ if op.SemFlg&linux.IPC_NOWAIT != 0 {
+ return nil, 0, syserror.ErrWouldBlock
+ }
+
+ w := newWaiter(op.SemOp)
+ sem.waiters.PushBack(w)
+ return w.ch, int32(op.SemNum), nil
+ }
+ } else {
+ if op.SemOp < 0 {
+ // Handle 'wait' operation.
+ if -op.SemOp > valueMax {
+ return nil, 0, syserror.ERANGE
+ }
+ if -op.SemOp > tmpVals[op.SemNum] {
+ // Not enough resources, must wait.
+ if op.SemFlg&linux.IPC_NOWAIT != 0 {
+ return nil, 0, syserror.ErrWouldBlock
+ }
+
+ w := newWaiter(op.SemOp)
+ sem.waiters.PushBack(w)
+ return w.ch, int32(op.SemNum), nil
+ }
+ } else {
+ // op.SemOp > 0: Handle 'signal' operation.
+ if tmpVals[op.SemNum] > valueMax-op.SemOp {
+ return nil, 0, syserror.ERANGE
+ }
+ }
+
+ tmpVals[op.SemNum] += op.SemOp
+ }
+ }
+
+ // All operations succeeded, apply them.
+ // TODO(b/29354920): handle undo operations.
+ for i, v := range tmpVals {
+ s.sems[i].value = v
+ s.sems[i].wakeWaiters()
+ s.sems[i].pid = pid
+ }
+ s.opTime = ktime.NowFromContext(ctx)
+ return nil, 0, nil
+}
+
+// AbortWait notifies that a waiter is giving up and will not wait on the
+// channel anymore.
+func (s *Set) AbortWait(num int32, ch chan struct{}) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ sem := &s.sems[num]
+ for w := sem.waiters.Front(); w != nil; w = w.Next() {
+ if w.ch == ch {
+ sem.waiters.Remove(w)
+ return
+ }
+ }
+ // Waiter may not be found in case it raced with wakeWaiters().
+}
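+
+// Rough caller-side sketch (illustrative; "waitOn" stands in for whatever
+// blocking primitive the caller uses): a semop-style caller retries
+// ExecuteOps whenever a wait channel is returned, and calls AbortWait if it
+// stops waiting early, e.g. when interrupted.
+//
+//	ch, num, err := set.ExecuteOps(ctx, ops, creds, pid)
+//	for err == nil && ch != nil {
+//		if blockErr := waitOn(ch); blockErr != nil {
+//			set.AbortWait(num, ch) // Give up; drop the registered waiter.
+//			return blockErr
+//		}
+//		ch, num, err = set.ExecuteOps(ctx, ops, creds, pid)
+//	}
+//	return err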
+
+func (s *Set) checkCredentials(creds *auth.Credentials) bool {
+ return s.owner.UID == creds.EffectiveKUID ||
+ s.owner.GID == creds.EffectiveKGID ||
+ s.creator.UID == creds.EffectiveKUID ||
+ s.creator.GID == creds.EffectiveKGID
+}
+
+func (s *Set) checkCapability(creds *auth.Credentials) bool {
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok()
+}
+
+func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
+ // Are we owner, or in group, or other?
+ p := s.perms.Other
+ if s.owner.UID == creds.EffectiveKUID {
+ p = s.perms.User
+ } else if creds.InGroup(s.owner.GID) {
+ p = s.perms.Group
+ }
+
+ // Are permissions satisfied without capability checks?
+ if p.SupersetOf(reqPerms) {
+ return true
+ }
+
+ return s.checkCapability(creds)
+}
+
+// destroy destroys the set. Caller must hold 's.mu'.
+func (s *Set) destroy() {
+ // Notify all waiters. They will fail on the next attempt to execute
+ // operations and return error.
+ s.dead = true
+ // Iterate by index so each sem's waiter list is reset in place rather
+ // than on a copy of the sem.
+ for i := range s.sems {
+ sem := &s.sems[i]
+ for w := sem.waiters.Front(); w != nil; w = w.Next() {
+ w.ch <- struct{}{}
+ }
+ sem.waiters.Reset()
+ }
+}
+
+// wakeWaiters goes over all waiters and checks which of them can be notified.
+func (s *sem) wakeWaiters() {
+ // Note that this will release all waiters waiting for 0 too.
+ for w := s.waiters.Front(); w != nil; {
+ if s.value < w.value {
+ // Still blocked, move on to the next waiter.
+ w = w.Next()
+ continue
+ }
+ w.ch <- struct{}{}
+ old := w
+ w = w.Next()
+ s.waiters.Remove(old)
+ }
+}
+
+func newWaiter(val int16) *waiter {
+ return &waiter{
+ value: val,
+ ch: make(chan struct{}, 1),
+ }
+}
diff --git a/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go b/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go
new file mode 100755
index 000000000..1551f792e
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go
@@ -0,0 +1,115 @@
+// automatically generated by stateify.
+
+package semaphore
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *Registry) beforeSave() {}
+func (x *Registry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("userNS", &x.userNS)
+ m.Save("semaphores", &x.semaphores)
+ m.Save("lastIDUsed", &x.lastIDUsed)
+}
+
+func (x *Registry) afterLoad() {}
+func (x *Registry) load(m state.Map) {
+ m.Load("userNS", &x.userNS)
+ m.Load("semaphores", &x.semaphores)
+ m.Load("lastIDUsed", &x.lastIDUsed)
+}
+
+func (x *Set) beforeSave() {}
+func (x *Set) save(m state.Map) {
+ x.beforeSave()
+ m.Save("registry", &x.registry)
+ m.Save("ID", &x.ID)
+ m.Save("key", &x.key)
+ m.Save("creator", &x.creator)
+ m.Save("owner", &x.owner)
+ m.Save("perms", &x.perms)
+ m.Save("opTime", &x.opTime)
+ m.Save("changeTime", &x.changeTime)
+ m.Save("sems", &x.sems)
+ m.Save("dead", &x.dead)
+}
+
+func (x *Set) afterLoad() {}
+func (x *Set) load(m state.Map) {
+ m.Load("registry", &x.registry)
+ m.Load("ID", &x.ID)
+ m.Load("key", &x.key)
+ m.Load("creator", &x.creator)
+ m.Load("owner", &x.owner)
+ m.Load("perms", &x.perms)
+ m.Load("opTime", &x.opTime)
+ m.Load("changeTime", &x.changeTime)
+ m.Load("sems", &x.sems)
+ m.Load("dead", &x.dead)
+}
+
+func (x *sem) beforeSave() {}
+func (x *sem) save(m state.Map) {
+ x.beforeSave()
+ if !state.IsZeroValue(x.waiters) { m.Failf("waiters is %v, expected zero", x.waiters) }
+ m.Save("value", &x.value)
+ m.Save("pid", &x.pid)
+}
+
+func (x *sem) afterLoad() {}
+func (x *sem) load(m state.Map) {
+ m.Load("value", &x.value)
+ m.Load("pid", &x.pid)
+}
+
+func (x *waiter) beforeSave() {}
+func (x *waiter) save(m state.Map) {
+ x.beforeSave()
+ m.Save("waiterEntry", &x.waiterEntry)
+ m.Save("value", &x.value)
+ m.Save("ch", &x.ch)
+}
+
+func (x *waiter) afterLoad() {}
+func (x *waiter) load(m state.Map) {
+ m.Load("waiterEntry", &x.waiterEntry)
+ m.Load("value", &x.value)
+ m.Load("ch", &x.ch)
+}
+
+func (x *waiterList) beforeSave() {}
+func (x *waiterList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *waiterList) afterLoad() {}
+func (x *waiterList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *waiterEntry) beforeSave() {}
+func (x *waiterEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *waiterEntry) afterLoad() {}
+func (x *waiterEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
+func init() {
+ state.Register("semaphore.Registry", (*Registry)(nil), state.Fns{Save: (*Registry).save, Load: (*Registry).load})
+ state.Register("semaphore.Set", (*Set)(nil), state.Fns{Save: (*Set).save, Load: (*Set).load})
+ state.Register("semaphore.sem", (*sem)(nil), state.Fns{Save: (*sem).save, Load: (*sem).load})
+ state.Register("semaphore.waiter", (*waiter)(nil), state.Fns{Save: (*waiter).save, Load: (*waiter).load})
+ state.Register("semaphore.waiterList", (*waiterList)(nil), state.Fns{Save: (*waiterList).save, Load: (*waiterList).load})
+ state.Register("semaphore.waiterEntry", (*waiterEntry)(nil), state.Fns{Save: (*waiterEntry).save, Load: (*waiterEntry).load})
+}
diff --git a/pkg/sentry/kernel/semaphore/waiter_list.go b/pkg/sentry/kernel/semaphore/waiter_list.go
new file mode 100755
index 000000000..33e29fb55
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/waiter_list.go
@@ -0,0 +1,173 @@
+package semaphore
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type waiterElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (waiterElementMapper) linkerFor(elem *waiter) *waiter { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type waiterList struct {
+ head *waiter
+ tail *waiter
+}
+
+// Reset resets list l to the empty state.
+func (l *waiterList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *waiterList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *waiterList) Front() *waiter {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *waiterList) Back() *waiter {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *waiterList) PushFront(e *waiter) {
+ waiterElementMapper{}.linkerFor(e).SetNext(l.head)
+ waiterElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ waiterElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *waiterList) PushBack(e *waiter) {
+ waiterElementMapper{}.linkerFor(e).SetNext(nil)
+ waiterElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ waiterElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *waiterList) PushBackList(m *waiterList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *waiterList) InsertAfter(b, e *waiter) {
+ a := waiterElementMapper{}.linkerFor(b).Next()
+ waiterElementMapper{}.linkerFor(e).SetNext(a)
+ waiterElementMapper{}.linkerFor(e).SetPrev(b)
+ waiterElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ waiterElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *waiterList) InsertBefore(a, e *waiter) {
+ b := waiterElementMapper{}.linkerFor(a).Prev()
+ waiterElementMapper{}.linkerFor(e).SetNext(a)
+ waiterElementMapper{}.linkerFor(e).SetPrev(b)
+ waiterElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ waiterElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *waiterList) Remove(e *waiter) {
+ prev := waiterElementMapper{}.linkerFor(e).Prev()
+ next := waiterElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ waiterElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ waiterElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type waiterEntry struct {
+ next *waiter
+ prev *waiter
+}
+
+// Next returns the entry that follows e in the list.
+func (e *waiterEntry) Next() *waiter {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *waiterEntry) Prev() *waiter {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *waiterEntry) SetNext(elem *waiter) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *waiterEntry) SetPrev(elem *waiter) {
+ e.prev = elem
+}
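+
+// Usage sketch (illustrative): the element type gives the list access to its
+// link fields by embedding waiterEntry, which provides Next/Prev/SetNext/
+// SetPrev; 'waiter' in this package does exactly that.
+//
+//	var l waiterList
+//	l.PushBack(newWaiter(1))
+//	for w := l.Front(); w != nil; w = w.Next() {
+//		// Visit each waiter in insertion order.
+//	}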
diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go
new file mode 100755
index 000000000..4bf8719f2
--- /dev/null
+++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go
@@ -0,0 +1,55 @@
+package kernel
+
+import (
+ "fmt"
+ "reflect"
+ "strings"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/third_party/gvsync"
+)
+
+// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
+// with any writer critical sections in sc.
+func SeqAtomicLoadTaskGoroutineSchedInfo(sc *gvsync.SeqCount, ptr *TaskGoroutineSchedInfo) TaskGoroutineSchedInfo {
+ // This function doesn't use SeqAtomicTryLoad because doing so is
+ // measurably, significantly (~20%) slower; Go is awful at inlining.
+ var val TaskGoroutineSchedInfo
+ for {
+ epoch := sc.BeginRead()
+ if gvsync.RaceEnabled {
+ // Copy via Memmove so the race detector doesn't flag this
+ // intentionally racy read; ReadOk below rejects torn values.
+ gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+ } else {
+ // A plain copy is faster when the race detector is off.
+ val = *ptr
+ }
+ if sc.ReadOk(epoch) {
+ break
+ }
+ }
+ return val
+}
+
+// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
+// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
+// would race with a writer critical section, SeqAtomicTryLoad returns
+// (unspecified, false).
+func SeqAtomicTryLoadTaskGoroutineSchedInfo(sc *gvsync.SeqCount, epoch gvsync.SeqCountEpoch, ptr *TaskGoroutineSchedInfo) (TaskGoroutineSchedInfo, bool) {
+ var val TaskGoroutineSchedInfo
+ if gvsync.RaceEnabled {
+ gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+ } else {
+ val = *ptr
+ }
+ return val, sc.ReadOk(epoch)
+}
+
+func initTaskGoroutineSchedInfo() {
+ var val TaskGoroutineSchedInfo
+ typ := reflect.TypeOf(val)
+ name := typ.Name()
+ if ptrs := gvsync.PointersInType(typ, name); len(ptrs) != 0 {
+ panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
+ }
+}
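+
+// Usage sketch (illustrative; assumes the usual gvsync.SeqCount
+// BeginWrite/EndWrite pairing on the writer side):
+//
+//	// Writer:
+//	sc.BeginWrite()
+//	*ptr = newInfo
+//	sc.EndWrite()
+//
+//	// Reader: returns a consistent snapshot even if a write races.
+//	info := SeqAtomicLoadTaskGoroutineSchedInfo(&sc, ptr)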
diff --git a/pkg/sentry/kernel/session_list.go b/pkg/sentry/kernel/session_list.go
new file mode 100755
index 000000000..9ba27b164
--- /dev/null
+++ b/pkg/sentry/kernel/session_list.go
@@ -0,0 +1,173 @@
+package kernel
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type sessionElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (sessionElementMapper) linkerFor(elem *Session) *Session { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type sessionList struct {
+ head *Session
+ tail *Session
+}
+
+// Reset resets list l to the empty state.
+func (l *sessionList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *sessionList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *sessionList) Front() *Session {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *sessionList) Back() *Session {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *sessionList) PushFront(e *Session) {
+ sessionElementMapper{}.linkerFor(e).SetNext(l.head)
+ sessionElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ sessionElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *sessionList) PushBack(e *Session) {
+ sessionElementMapper{}.linkerFor(e).SetNext(nil)
+ sessionElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ sessionElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *sessionList) PushBackList(m *sessionList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ sessionElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ sessionElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *sessionList) InsertAfter(b, e *Session) {
+ a := sessionElementMapper{}.linkerFor(b).Next()
+ sessionElementMapper{}.linkerFor(e).SetNext(a)
+ sessionElementMapper{}.linkerFor(e).SetPrev(b)
+ sessionElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ sessionElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *sessionList) InsertBefore(a, e *Session) {
+ b := sessionElementMapper{}.linkerFor(a).Prev()
+ sessionElementMapper{}.linkerFor(e).SetNext(a)
+ sessionElementMapper{}.linkerFor(e).SetPrev(b)
+ sessionElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ sessionElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *sessionList) Remove(e *Session) {
+ prev := sessionElementMapper{}.linkerFor(e).Prev()
+ next := sessionElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ sessionElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ sessionElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type sessionEntry struct {
+ next *Session
+ prev *Session
+}
+
+// Next returns the entry that follows e in the list.
+func (e *sessionEntry) Next() *Session {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *sessionEntry) Prev() *Session {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *sessionEntry) SetNext(elem *Session) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *sessionEntry) SetPrev(elem *Session) {
+ e.prev = elem
+}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
new file mode 100644
index 000000000..610e199da
--- /dev/null
+++ b/pkg/sentry/kernel/sessions.go
@@ -0,0 +1,508 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SessionID is the public identifier.
+type SessionID ThreadID
+
+// ProcessGroupID is the public identifier.
+type ProcessGroupID ThreadID
+
+// Session contains a leader threadgroup and a list of ProcessGroups.
+//
+// +stateify savable
+type Session struct {
+ refs refs.AtomicRefCount
+
+ // leader is the originator of the Session.
+ //
+ // Note that this may no longer be running (and may be reaped), so the
+ // ID is cached upon initial creation. The leader is still required
+ // however, since its PIDNamespace defines the scope of the Session.
+ //
+ // The leader is immutable.
+ leader *ThreadGroup
+
+ // id is the cached identifier in the leader's namespace.
+ //
+ // The id is immutable.
+ id SessionID
+
+ // ProcessGroups is a list of process groups in this Session. This is
+ // protected by TaskSet.mu.
+ processGroups processGroupList
+
+ // sessionEntry is the embed for TaskSet.sessions. This is protected by
+ // TaskSet.mu.
+ sessionEntry
+}
+
+// incRef grabs a reference.
+func (s *Session) incRef() {
+ s.refs.IncRef()
+}
+
+// decRef drops a reference.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (s *Session) decRef() {
+ s.refs.DecRefWithDestructor(func() {
+ // Remove translations from the leader.
+ for ns := s.leader.pidns; ns != nil; ns = ns.parent {
+ id := ns.sids[s]
+ delete(ns.sids, s)
+ delete(ns.sessions, id)
+ }
+
+ // Remove from the list of global Sessions.
+ s.leader.pidns.owner.sessions.Remove(s)
+ })
+}
+
+// ProcessGroup contains an originator threadgroup and a parent Session.
+//
+// +stateify savable
+type ProcessGroup struct {
+ refs refs.AtomicRefCount // not exported.
+
+ // originator is the originator of the group.
+ //
+ // See note re: leader in Session. The same applies here.
+ //
+ // The originator is immutable.
+ originator *ThreadGroup
+
+ // id is the cached identifier in the originator's namespace.
+ //
+ // The id is immutable.
+ id ProcessGroupID
+
+ // Session is the parent Session.
+ //
+ // The session is immutable.
+ session *Session
+
+ // ancestors is the number of thread groups in this process group whose
+ // parent is in a different process group in the same session.
+ //
+ // The name is derived from the fact that process groups where
+ // ancestors is zero are considered "orphans".
+ //
+ // ancestors is protected by TaskSet.mu.
+ ancestors uint32
+
+ // processGroupEntry is the embedded entry for Sessions.groups. This is
+ // protected by TaskSet.mu.
+ processGroupEntry
+}
+
+// Originator returns the originator of the process group.
+func (pg *ProcessGroup) Originator() *ThreadGroup {
+ return pg.originator
+}
+
+// IsOrphan returns true if this process group is an orphan.
+func (pg *ProcessGroup) IsOrphan() bool {
+ pg.originator.TaskSet().mu.RLock()
+ defer pg.originator.TaskSet().mu.RUnlock()
+ return pg.ancestors == 0
+}
+
+// incRefWithParent grabs a reference.
+//
+// This function is called when this ProcessGroup is being associated with some
+// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent
+// ThreadGroup. If tg is init, then parentPG may be nil.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) {
+ // We acquire an "ancestor" reference in the case of a nil parent.
+ // This is because the process being associated is init, and init can
+ // never be orphaned (we count it as always having an ancestor).
+ if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) {
+ pg.ancestors++
+ }
+
+ pg.refs.IncRef()
+}
+
+// decRefWithParent drops a reference.
+//
+// parentPG is per incRefWithParent.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
+ // See incRefWithParent regarding parent == nil.
+ if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) {
+ pg.ancestors--
+ }
+
+ alive := true
+ pg.refs.DecRefWithDestructor(func() {
+ alive = false // don't bother with handleOrphan.
+
+ // Remove translations from the originator.
+ for ns := pg.originator.pidns; ns != nil; ns = ns.parent {
+ id := ns.pgids[pg]
+ delete(ns.pgids, pg)
+ delete(ns.processGroups, id)
+ }
+
+ // Remove from the session's list of process groups.
+ pg.session.processGroups.Remove(pg)
+ pg.session.decRef()
+ })
+ if alive {
+ pg.handleOrphan()
+ }
+}
+
+// parentPG returns the parent process group.
+//
+// Precondition: callers must hold TaskSet.mu.
+func (tg *ThreadGroup) parentPG() *ProcessGroup {
+ if tg.leader.parent != nil {
+ return tg.leader.parent.tg.processGroup
+ }
+ return nil
+}
+
+// handleOrphan checks whether the process group is an orphan and has any
+// stopped jobs. If yes, then appropriate signals are delivered to each thread
+// group within the process group.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (pg *ProcessGroup) handleOrphan() {
+ // Check if this process is an orphan.
+ if pg.ancestors != 0 {
+ return
+ }
+
+ // See if there are any stopped jobs.
+ hasStopped := false
+ pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) {
+ if tg.processGroup != pg {
+ return
+ }
+ tg.signalHandlers.mu.Lock()
+ if tg.groupStopComplete {
+ hasStopped = true
+ }
+ tg.signalHandlers.mu.Unlock()
+ })
+ if !hasStopped {
+ return
+ }
+
+ // Deliver appropriate signals to all thread groups.
+ pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) {
+ if tg.processGroup != pg {
+ return
+ }
+ tg.signalHandlers.mu.Lock()
+ tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */)
+ tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */)
+ tg.signalHandlers.mu.Unlock()
+ })
+
+ return
+}
+
+// Session returns the process group's session without taking a reference.
+func (pg *ProcessGroup) Session() *Session {
+ return pg.session
+}
+
+// SendSignal sends a signal to all processes inside the process group. It is
+// analogous to kernel/signal.c:kill_pgrp.
+func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
+ tasks := pg.originator.TaskSet()
+ tasks.mu.RLock()
+ defer tasks.mu.RUnlock()
+
+ var lastErr error
+ for tg := range tasks.Root.tgids {
+ if tg.ProcessGroup() == pg {
+ tg.signalHandlers.mu.Lock()
+ infoCopy := *info
+ if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
+ lastErr = err
+ }
+ tg.signalHandlers.mu.Unlock()
+ }
+ }
+ return lastErr
+}
+
+// CreateSession creates a new Session, with the ThreadGroup as the leader.
+//
+// EPERM may be returned if either the given ThreadGroup is already a Session
+// leader, or a ProcessGroup already exists for the ThreadGroup's ID.
+func (tg *ThreadGroup) CreateSession() error {
+ tg.pidns.owner.mu.Lock()
+ defer tg.pidns.owner.mu.Unlock()
+ return tg.createSession()
+}
+
+// createSession creates a new session for a threadgroup.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (tg *ThreadGroup) createSession() error {
+ // Get the ID for this thread in the current namespace.
+ id := tg.pidns.tgids[tg]
+
+ // Check if this ThreadGroup already leads a Session, or
+ // if the proposed group is already taken.
+ for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
+ if s.leader.pidns != tg.pidns {
+ continue
+ }
+ if s.leader == tg {
+ return syserror.EPERM
+ }
+ if s.id == SessionID(id) {
+ return syserror.EPERM
+ }
+ for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
+ if pg.id == ProcessGroupID(id) {
+ return syserror.EPERM
+ }
+ }
+ }
+
+ // Create a new Session, with a single reference.
+ s := &Session{
+ id: SessionID(id),
+ leader: tg,
+ }
+
+ // Create a new ProcessGroup, belonging to that Session.
+ // This also has a single reference (assigned below).
+ //
+ // Note that since this is a new session and a new process group, there
+ // will be zero ancestors for this process group. (It is an orphan at
+ // this point.)
+ pg := &ProcessGroup{
+ id: ProcessGroupID(id),
+ originator: tg,
+ session: s,
+ ancestors: 0,
+ }
+
+ // Tie them and return the result.
+ s.processGroups.PushBack(pg)
+ tg.pidns.owner.sessions.PushBack(s)
+
+ // Leave the current group, and assign the new one.
+ if tg.processGroup != nil {
+ oldParentPG := tg.parentPG()
+ tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+ childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.decRefWithParent(oldParentPG)
+ })
+ tg.processGroup.decRefWithParent(oldParentPG)
+ tg.processGroup = pg
+ } else {
+ // The current process group may be nil only in the case of an
+ // unparented thread group (i.e. the init process). This would
+ // not normally occur, but we allow it for the convenience of
+ // CreateSession working from that point. There will be no
+ // child processes. We always say that the very first group
+ // created has ancestors (avoids checks elsewhere).
+ //
+ // Note that this mirrors the parent == nil logic in
+ // incRef/decRef/reparent, which counts nil as an ancestor.
+ tg.processGroup = pg
+ tg.processGroup.ancestors++
+ }
+
+ // Ensure a translation is added to all namespaces.
+ for ns := tg.pidns; ns != nil; ns = ns.parent {
+ local := ns.tgids[tg]
+ ns.sids[s] = SessionID(local)
+ ns.sessions[SessionID(local)] = s
+ ns.pgids[pg] = ProcessGroupID(local)
+ ns.processGroups[ProcessGroupID(local)] = pg
+ }
+
+ return nil
+}
+
+// CreateProcessGroup creates a new process group.
+//
+// An EPERM error will be returned if the ThreadGroup belongs to a different
+// Session, is a Session leader or the group already exists.
+func (tg *ThreadGroup) CreateProcessGroup() error {
+ tg.pidns.owner.mu.Lock()
+ defer tg.pidns.owner.mu.Unlock()
+
+ // Get the ID for this thread in the current namespace.
+ id := tg.pidns.tgids[tg]
+
+ // Per above, check for a Session leader or existing group.
+ for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
+ if s.leader.pidns != tg.pidns {
+ continue
+ }
+ if s.leader == tg {
+ return syserror.EPERM
+ }
+ for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
+ if pg.id == ProcessGroupID(id) {
+ return syserror.EPERM
+ }
+ }
+ }
+
+ // Create a new ProcessGroup, belonging to the current Session.
+ //
+ // We manually adjust the ancestors if the parent is in the same
+ // session.
+ tg.processGroup.session.incRef()
+ pg := &ProcessGroup{
+ id: ProcessGroupID(id),
+ originator: tg,
+ session: tg.processGroup.session,
+ }
+ if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
+ pg.ancestors++
+ }
+
+ // Assign the new process group; adjust children.
+ oldParentPG := tg.parentPG()
+ tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+ childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.decRefWithParent(oldParentPG)
+ })
+ tg.processGroup.decRefWithParent(oldParentPG)
+ tg.processGroup = pg
+
+ // Add the new process group to the session.
+ pg.session.processGroups.PushBack(pg)
+
+ // Ensure this translation is added to all namespaces.
+ for ns := tg.pidns; ns != nil; ns = ns.parent {
+ local := ns.tgids[tg]
+ ns.pgids[pg] = ProcessGroupID(local)
+ ns.processGroups[ProcessGroupID(local)] = pg
+ }
+
+ return nil
+}
+
+// JoinProcessGroup joins an existing process group.
+//
+// This function will return EACCES if an exec has been performed since fork
+// by the given ThreadGroup, and EPERM if the Sessions are not the same or the
+// group does not exist.
+//
+// If checkExec is set, then the join is not permitted after the process has
+// executed exec at least once.
+func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error {
+ pidns.owner.mu.Lock()
+ defer pidns.owner.mu.Unlock()
+
+ // Lookup the ProcessGroup.
+ pg := pidns.processGroups[pgid]
+ if pg == nil {
+ return syserror.EPERM
+ }
+
+ // Disallow the join if an execve has been performed, per POSIX.
+ if checkExec && tg.execed {
+ return syserror.EACCES
+ }
+
+ // See if it's in the same session as ours.
+ if pg.session != tg.processGroup.session {
+ return syserror.EPERM
+ }
+
+ // Join the group; adjust children.
+ parentPG := tg.parentPG()
+ pg.incRefWithParent(parentPG)
+ tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+ childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.decRefWithParent(tg.processGroup)
+ })
+ tg.processGroup.decRefWithParent(parentPG)
+ tg.processGroup = pg
+
+ return nil
+}
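+
+// Illustrative sketch of how a syscall layer might map setsid(2) and
+// setpgid(2) onto these helpers (error handling elided; names assumed):
+//
+//	// setsid(2): become a session (and process group) leader.
+//	err := tg.CreateSession()
+//
+//	// setpgid(2) with pgid == pid: start a new process group.
+//	err = tg.CreateProcessGroup()
+//
+//	// setpgid(2) with an existing pgid: join that group, refusing the move
+//	// after an execve when checkExec is set.
+//	err = tg.JoinProcessGroup(pidns, pgid, true /* checkExec */)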
+
+// Session returns the ThreadGroup's Session.
+//
+// A reference is not taken on the session.
+func (tg *ThreadGroup) Session() *Session {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ return tg.processGroup.session
+}
+
+// IDOfSession returns the SessionID assigned to s in PID namespace ns.
+//
+// If the session isn't visible in this namespace, zero will be returned. It is
+// the caller's responsibility to check that before using this function.
+func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
+ pidns.owner.mu.RLock()
+ defer pidns.owner.mu.RUnlock()
+ return pidns.sids[s]
+}
+
+// SessionWithID returns the Session with the given ID in the PID namespace ns,
+// or nil if that given ID is not defined in this namespace.
+//
+// A reference is not taken on the session.
+func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
+ pidns.owner.mu.RLock()
+ defer pidns.owner.mu.RUnlock()
+ return pidns.sessions[id]
+}
+
+// ProcessGroup returns the ThreadGroup's ProcessGroup.
+//
+// A reference is not taken on the process group.
+func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ return tg.processGroup
+}
+
+// IDOfProcessGroup returns the ID assigned to process group pg in PID
+// namespace ns.
+//
+// The same constraints apply as IDOfSession.
+func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+ pidns.owner.mu.RLock()
+ defer pidns.owner.mu.RUnlock()
+ return pidns.pgids[pg]
+}
+
+// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
+// namespace ns, or nil if that given ID is not defined in this namespace.
+//
+// A reference is not taken on the process group.
+func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+ pidns.owner.mu.RLock()
+ defer pidns.owner.mu.RUnlock()
+ return pidns.processGroups[id]
+}
diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go
new file mode 100644
index 000000000..3cb759072
--- /dev/null
+++ b/pkg/sentry/kernel/shm/device.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package shm
+
+import "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+
+// shmDevice is the kernel shm device.
+var shmDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
new file mode 100644
index 000000000..00393b5f0
--- /dev/null
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -0,0 +1,671 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package shm implements sysv shared memory segments.
+//
+// Known missing features:
+//
+// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement
+// memory locking in general.
+//
+// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy
+// way to implement hugetlb support on a per-map basis, and it has no impact
+// on correctness.
+//
+// - SHM_NORESERVE for shmget(2) is ignored; the sentry doesn't implement swap,
+// so reserving space for swap is meaningless.
+//
+// - No per-process segment size enforcement. This feature probably isn't used
+// much anyway, since Linux sets the per-process limits to the system-wide
+// limits by default.
+//
+// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock
+package shm
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Key represents a shm segment key. Analogous to a file name.
+type Key int32
+
+// ID represents the opaque handle for a shm segment. Analogous to an fd.
+type ID int32
+
+// Registry tracks all shared memory segments in an IPC namespace. The registry
+// provides the mechanisms for creating and finding segments, and reporting
+// global shm parameters.
+//
+// +stateify savable
+type Registry struct {
+ // userNS owns the IPC namespace this registry belongs to. Immutable.
+ userNS *auth.UserNamespace
+
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // shms maps segment ids to segments.
+ shms map[ID]*Shm
+
+ // keysToShms maps segment keys to segments.
+ keysToShms map[Key]*Shm
+
+ // Sum of the sizes of all existing segments rounded up to page size, in
+ // units of page size.
+ totalPages uint64
+
+ // ID assigned to the last created segment. Used to quickly find the next
+ // unused ID.
+ lastIDUsed ID
+}
+
+// NewRegistry creates a new shm registry.
+func NewRegistry(userNS *auth.UserNamespace) *Registry {
+ return &Registry{
+ userNS: userNS,
+ shms: make(map[ID]*Shm),
+ keysToShms: make(map[Key]*Shm),
+ }
+}
+
+// FindByID looks up a segment given an ID.
+func (r *Registry) FindByID(id ID) *Shm {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ return r.shms[id]
+}
+
+// dissociateKey removes the association between a segment and its key,
+// preventing it from being discovered in the registry. This doesn't necessarily
+// mean the segment is about to be destroyed. This is analogous to unlinking a
+// file; the segment can still be used by a process already referencing it, but
+// cannot be discovered by a new process.
+func (r *Registry) dissociateKey(s *Shm) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ if s.key != linux.IPC_PRIVATE {
+ delete(r.keysToShms, s.key)
+ s.key = linux.IPC_PRIVATE
+ }
+}
+
+// FindOrCreate looks up or creates a segment in the registry. It's functionally
+// analogous to open(2).
+func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
+ if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
+ // "A new segment was to be created and size is less than SHMMIN or
+ // greater than SHMMAX." - man shmget(2)
+ //
+ // Note that 'private' always implies the creation of a new segment
+ // whether IPC_CREAT is specified or not.
+ return nil, syserror.EINVAL
+ }
+
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if len(r.shms) >= linux.SHMMNI {
+ // "All possible shared memory IDs have been taken (SHMMNI) ..."
+ // - man shmget(2)
+ return nil, syserror.ENOSPC
+ }
+
+ if !private {
+ // Look up an existing segment.
+ if shm := r.keysToShms[key]; shm != nil {
+ shm.mu.Lock()
+ defer shm.mu.Unlock()
+
+ // Check that caller can access the segment.
+ if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) {
+ // "The user does not have permission to access the shared
+ // memory segment, and does not have the CAP_IPC_OWNER
+ // capability in the user namespace that governs its IPC
+ // namespace." - man shmget(2)
+ return nil, syserror.EACCES
+ }
+
+ if size > shm.size {
+ // "A segment for the given key exists, but size is greater than
+ // the size of that segment." - man shmget(2)
+ return nil, syserror.EINVAL
+ }
+
+ if create && exclusive {
+ // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a
+ // shared memory segment already exists for key."
+ // - man shmget(2)
+ return nil, syserror.EEXIST
+ }
+
+ return shm, nil
+ }
+
+ if !create {
+ // "No segment exists for the given key, and IPC_CREAT was not
+ // specified." - man shmget(2)
+ return nil, syserror.ENOENT
+ }
+ }
+
+ var sizeAligned uint64
+ if val, ok := usermem.Addr(size).RoundUp(); ok {
+ sizeAligned = uint64(val)
+ } else {
+ return nil, syserror.EINVAL
+ }
+
+ if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL {
+ // "... allocating a segment of the requested size would cause the
+ // system to exceed the system-wide limit on shared memory (SHMALL)."
+ // - man shmget(2)
+ return nil, syserror.ENOSPC
+ }
+
+ // Need to create a new segment.
+ creator := fs.FileOwnerFromContext(ctx)
+ perms := fs.FilePermsFromMode(mode)
+ return r.newShm(ctx, pid, key, creator, perms, size)
+}
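+
+// Illustrative sketch (assumed flag names from the abi/linux package): a
+// shmget(2) implementation would translate its flags into FindOrCreate's
+// arguments roughly as follows:
+//
+//	private := key == linux.IPC_PRIVATE
+//	create := flag&linux.IPC_CREAT != 0
+//	exclusive := flag&linux.IPC_EXCL != 0
+//	segment, err := registry.FindOrCreate(ctx, pid, Key(key), size, mode,
+//		private, create, exclusive)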
+
+// newShm creates a new segment in the registry.
+//
+// Precondition: Caller must hold r.mu.
+func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) {
+ mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+ if mfp == nil {
+ panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
+ }
+
+ effectiveSize := uint64(usermem.Addr(size).MustRoundUp())
+ fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous)
+ if err != nil {
+ return nil, err
+ }
+
+ shm := &Shm{
+ mfp: mfp,
+ registry: r,
+ creator: creator,
+ size: size,
+ effectiveSize: effectiveSize,
+ fr: fr,
+ key: key,
+ perms: perms,
+ owner: creator,
+ creatorPID: pid,
+ changeTime: ktime.NowFromContext(ctx),
+ }
+
+ // Find the next available ID.
+ for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
+ // Handle wrap around.
+ if id < 0 {
+ id = 0
+ continue
+ }
+ if r.shms[id] == nil {
+ r.lastIDUsed = id
+
+ shm.ID = id
+ r.shms[id] = shm
+ r.keysToShms[key] = shm
+
+ r.totalPages += effectiveSize / usermem.PageSize
+
+ return shm, nil
+ }
+ }
+
+ log.Warningf("Shm ids exhausted, they may be leaking")
+ return nil, syserror.ENOSPC
+}
+
+// IPCInfo reports global parameters for sysv shared memory segments on this
+// system. See shmctl(IPC_INFO).
+func (r *Registry) IPCInfo() *linux.ShmParams {
+ return &linux.ShmParams{
+ ShmMax: linux.SHMMAX,
+ ShmMin: linux.SHMMIN,
+ ShmMni: linux.SHMMNI,
+ ShmSeg: linux.SHMSEG,
+ ShmAll: linux.SHMALL,
+ }
+}
+
+// ShmInfo reports linux-specific global parameters for sysv shared memory
+// segments on this system. See shmctl(SHM_INFO).
+func (r *Registry) ShmInfo() *linux.ShmInfo {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ return &linux.ShmInfo{
+ UsedIDs: int32(r.lastIDUsed),
+ ShmTot: r.totalPages,
+ ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting.
+ ShmSwp: 0, // No reclaim at the moment.
+ }
+}
+
+// remove deletes a segment from this registry, deaccounting the memory used by
+// the segment.
+//
+// Precondition: Must follow a call to r.dissociateKey(s).
+func (r *Registry) remove(s *Shm) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ if s.key != linux.IPC_PRIVATE {
+ panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked()))
+ }
+
+ delete(r.shms, s.ID)
+ r.totalPages -= s.effectiveSize / usermem.PageSize
+}
+
+// Shm represents a single shared memory segment.
+//
+// Shm segments are backed directly by an allocation from platform memory.
+// Segments are always mapped as a whole, greatly simplifying how mappings are
+// tracked. However, note that mremap and munmap calls may cause the vma for a
+// segment to become fragmented, which requires special care when unmapping a
+// segment. See mm/shm.go.
+//
+// Segments persist until they are explicitly marked for destruction via
+// shmctl(SHM_RMID).
+//
+// Shm implements memmap.Mappable and memmap.MappingIdentity.
+//
+// +stateify savable
+type Shm struct {
+ // AtomicRefCount tracks the number of references to this segment from
+ // maps. A segment always holds a reference to itself, until it's marked for
+ // destruction.
+ refs.AtomicRefCount
+
+ mfp pgalloc.MemoryFileProvider
+
+ // registry points to the shm registry containing this segment. Immutable.
+ registry *Registry
+
+ // ID is the kernel identifier for this segment. Immutable.
+ ID ID
+
+ // creator is the user that created the segment. Immutable.
+ creator fs.FileOwner
+
+ // size is the requested size of the segment at creation, in
+ // bytes. Immutable.
+ size uint64
+
+ // effectiveSize of the segment, rounding up to the next page
+ // boundary. Immutable.
+ //
+ // Invariant: effectiveSize must be a multiple of usermem.PageSize.
+ effectiveSize uint64
+
+ // fr is the offset into mfp.MemoryFile() that backs the contents of this
+ // segment. Immutable.
+ fr platform.FileRange
+
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // key is the public identifier for this segment.
+ key Key
+
+ // perms is the access permissions for the segment.
+ perms fs.FilePermissions
+
+ // owner of this segment.
+ owner fs.FileOwner
+ // attachTime is updated on every successful shmat.
+ attachTime ktime.Time
+ // detachTime is updated on every successful shmdt.
+ detachTime ktime.Time
+ // changeTime is updated on every successful change to the segment via
+ // shmctl(IPC_SET).
+ changeTime ktime.Time
+
+ // creatorPID is the PID of the process that created the segment.
+ creatorPID int32
+ // lastAttachDetachPID is the pid of the process that issued the last shmat
+ // or shmdt syscall.
+ lastAttachDetachPID int32
+
+ // pendingDestruction indicates the segment was marked as destroyed through
+ // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found
+ // in the registry and can no longer be attached. When the last user
+ // detaches from the segment, it is destroyed.
+ pendingDestruction bool
+}
+
+// Precondition: Caller must hold s.mu.
+func (s *Shm) debugLocked() string {
+ return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}",
+ s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction)
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (s *Shm) MappedName(ctx context.Context) string {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ return fmt.Sprintf("SYSV%08d", s.key)
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (s *Shm) DeviceID() uint64 {
+ return shmDevice.DeviceID()
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (s *Shm) InodeID() uint64 {
+ // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use
+ // this. Changing this will break them." -- Linux, ipc/shm.c:newseg()
+ return uint64(s.ID)
+}
+
+// DecRef overrides refs.RefCount.DecRef with a destructor.
+//
+// Precondition: Caller must not hold s.mu.
+func (s *Shm) DecRef() {
+ s.DecRefWithDestructor(s.destroy)
+}
+
+// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
+// segments.
+func (s *Shm) Msync(context.Context, memmap.MappableRange) error {
+ return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ s.attachTime = ktime.NowFromContext(ctx)
+ if pid, ok := context.ThreadGroupIDFromContext(ctx); ok {
+ s.lastAttachDetachPID = pid
+ } else {
+ // AddMapping is called during a syscall, so ctx should always be a task
+ // context.
+ log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked())
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
+ // is context.Background. Gracefully handle missing clocks. Failing to
+ // update the detach time in these cases is ok, since no one can observe the
+ // omission.
+ if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
+ s.detachTime = clock.Now()
+ }
+
+ // If called from a non-task context we also won't have a threadgroup
+ // id. Silently skip updating the lastAttachDetachPid in that case.
+ if pid, ok := context.ThreadGroupIDFromContext(ctx); ok {
+ s.lastAttachDetachPID = pid
+ } else {
+ log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked())
+ }
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error {
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ var err error
+ if required.End > s.fr.Length() {
+ err = &memmap.BusError{syserror.EFAULT}
+ }
+ if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 {
+ return []memmap.Translation{
+ {
+ Source: source,
+ File: s.mfp.MemoryFile(),
+ Offset: s.fr.Start + source.Start,
+ Perms: usermem.AnyAccess,
+ },
+ }, err
+ }
+ return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (s *Shm) InvalidateUnsavable(ctx context.Context) error {
+ return nil
+}
+
+// AttachOpts describes various flags passed to shmat(2).
+type AttachOpts struct {
+ Execute bool
+ Readonly bool
+ Remap bool
+}
+
+// ConfigureAttach creates an mmap configuration for the segment with the
+// requested attach options.
+//
+// ConfigureAttach returns with a ref on s on success. The caller should drop
+// this once the map is installed. This reference prevents s from being
+// destroyed before the returned configuration is used.
+func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ if s.pendingDestruction && s.ReadRefs() == 0 {
+ return memmap.MMapOpts{}, syserror.EIDRM
+ }
+
+ if !s.checkPermissions(ctx, fs.PermMask{
+ Read: true,
+ Write: !opts.Readonly,
+ Execute: opts.Execute,
+ }) {
+ // "The calling process does not have the required permissions for the
+ // requested attach type, and does not have the CAP_IPC_OWNER capability
+ // in the user namespace that governs its IPC namespace." - man shmat(2)
+ return memmap.MMapOpts{}, syserror.EACCES
+ }
+ s.IncRef()
+ return memmap.MMapOpts{
+ Length: s.size,
+ Offset: 0,
+ Addr: addr,
+ Fixed: opts.Remap,
+ Perms: usermem.AccessType{
+ Read: true,
+ Write: !opts.Readonly,
+ Execute: opts.Execute,
+ },
+ MaxPerms: usermem.AnyAccess,
+ Mappable: s,
+ MappingIdentity: s,
+ }, nil
+}
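+
+// Illustrative sketch: a shmat(2) implementation builds the MMapOpts,
+// installs the mapping, and then drops the reference taken here. "mm" names
+// the caller's memory manager and is assumed for the example.
+//
+//	opts, err := segment.ConfigureAttach(ctx, addr, AttachOpts{Readonly: readOnly})
+//	if err != nil {
+//		return 0, err
+//	}
+//	defer segment.DecRef() // Drop the ref once the map is installed.
+//	addr, err = mm.MMap(ctx, opts)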
+
+// EffectiveSize returns the size of the underlying shared memory segment. This
+// may be larger than the requested size at creation, due to rounding to page
+// boundaries.
+func (s *Shm) EffectiveSize() uint64 {
+ return s.effectiveSize
+}
+
+// IPCStat returns information about a shm. See shmctl(IPC_STAT).
+func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The caller must have read permission on the shared memory segment."
+ // - man shmctl(2)
+ if !s.checkPermissions(ctx, fs.PermMask{Read: true}) {
+ // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow
+ // read access for shmid, and the calling process does not have the
+ // CAP_IPC_OWNER capability in the user namespace that governs its IPC
+ // namespace." - man shmctl(2)
+ return nil, syserror.EACCES
+ }
+
+ var mode uint16
+ if s.pendingDestruction {
+ mode |= linux.SHM_DEST
+ }
+ creds := auth.CredentialsFromContext(ctx)
+
+ nattach := uint64(s.ReadRefs())
+ // Don't report the self-reference we keep prior to being marked for
+ // destruction. However, also don't report a count of -1 for segments marked
+ // as destroyed, with no mappings.
+ if !s.pendingDestruction {
+ nattach--
+ }
+
+ ds := &linux.ShmidDS{
+ ShmPerm: linux.IPCPerm{
+ Key: uint32(s.key),
+ UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
+ GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
+ CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
+ CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
+ Mode: mode | uint16(s.perms.LinuxMode()),
+ Seq: 0, // IPC sequences not supported.
+ },
+ ShmSegsz: s.size,
+ ShmAtime: s.attachTime.TimeT(),
+ ShmDtime: s.detachTime.TimeT(),
+ ShmCtime: s.changeTime.TimeT(),
+ ShmCpid: s.creatorPID,
+ ShmLpid: s.lastAttachDetachPID,
+ ShmNattach: nattach,
+ }
+
+ return ds, nil
+}
+
+// Set modifies attributes for a segment. See shmctl(IPC_SET).
+func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ if !s.checkOwnership(ctx) {
+ return syserror.EPERM
+ }
+
+ creds := auth.CredentialsFromContext(ctx)
+ uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID))
+ gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID))
+ if !uid.Ok() || !gid.Ok() {
+ return syserror.EINVAL
+ }
+
+ // User may only modify the lower 9 bits of the mode. All the other bits are
+ // always 0 for the underlying inode.
+ mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff)
+ s.perms = fs.FilePermsFromMode(mode)
+
+ s.owner.UID = uid
+ s.owner.GID = gid
+
+ s.changeTime = ktime.NowFromContext(ctx)
+ return nil
+}
+
+func (s *Shm) destroy() {
+ s.mfp.MemoryFile().DecRef(s.fr)
+ s.registry.remove(s)
+}
+
+// MarkDestroyed marks a segment for destruction. The segment is actually
+// destroyed once it has no references. MarkDestroyed may be called multiple
+// times, and is safe to call after a segment has already been destroyed. See
+// shmctl(IPC_RMID).
+func (s *Shm) MarkDestroyed() {
+ s.registry.dissociateKey(s)
+
+ s.mu.Lock()
+ // Only drop the segment's self-reference once, when destruction is
+ // requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a
+ // segment to be destroyed prematurely, potentially with active maps to the
+ // segment's address range. Remaining references are dropped when the
+ // segment is detached or unmapped.
+ if !s.pendingDestruction {
+ s.pendingDestruction = true
+ s.mu.Unlock() // Must release s.mu before calling s.DecRef.
+ s.DecRef()
+ return
+ }
+ s.mu.Unlock()
+}
+
+// checkOwnership verifies whether a segment may be accessed by ctx as an
+// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux.
+//
+// Precondition: Caller must hold s.mu.
+func (s *Shm) checkOwnership(ctx context.Context) bool {
+ creds := auth.CredentialsFromContext(ctx)
+ if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID {
+ return true
+ }
+
+ // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux
+ // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented
+ // for use to "override IPC ownership checks".
+ return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS)
+}
+
+// checkPermissions verifies whether a segment is accessible by ctx for access
+// described by req. See ipc/util.c:ipcperms() in Linux.
+//
+// Precondition: Caller must hold s.mu.
+func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool {
+ creds := auth.CredentialsFromContext(ctx)
+
+ p := s.perms.Other
+ if s.owner.UID == creds.EffectiveKUID {
+ p = s.perms.User
+ } else if creds.InGroup(s.owner.GID) {
+ p = s.perms.Group
+ }
+ if p.SupersetOf(req) {
+ return true
+ }
+
+ // Tasks with CAP_IPC_OWNER may bypass permission checks.
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS)
+}
diff --git a/pkg/sentry/kernel/shm/shm_state_autogen.go b/pkg/sentry/kernel/shm/shm_state_autogen.go
new file mode 100755
index 000000000..d94d01fce
--- /dev/null
+++ b/pkg/sentry/kernel/shm/shm_state_autogen.go
@@ -0,0 +1,74 @@
+// automatically generated by stateify.
+
+package shm
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *Registry) beforeSave() {}
+func (x *Registry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("userNS", &x.userNS)
+ m.Save("shms", &x.shms)
+ m.Save("keysToShms", &x.keysToShms)
+ m.Save("totalPages", &x.totalPages)
+ m.Save("lastIDUsed", &x.lastIDUsed)
+}
+
+func (x *Registry) afterLoad() {}
+func (x *Registry) load(m state.Map) {
+ m.Load("userNS", &x.userNS)
+ m.Load("shms", &x.shms)
+ m.Load("keysToShms", &x.keysToShms)
+ m.Load("totalPages", &x.totalPages)
+ m.Load("lastIDUsed", &x.lastIDUsed)
+}
+
+func (x *Shm) beforeSave() {}
+func (x *Shm) save(m state.Map) {
+ x.beforeSave()
+ m.Save("AtomicRefCount", &x.AtomicRefCount)
+ m.Save("mfp", &x.mfp)
+ m.Save("registry", &x.registry)
+ m.Save("ID", &x.ID)
+ m.Save("creator", &x.creator)
+ m.Save("size", &x.size)
+ m.Save("effectiveSize", &x.effectiveSize)
+ m.Save("fr", &x.fr)
+ m.Save("key", &x.key)
+ m.Save("perms", &x.perms)
+ m.Save("owner", &x.owner)
+ m.Save("attachTime", &x.attachTime)
+ m.Save("detachTime", &x.detachTime)
+ m.Save("changeTime", &x.changeTime)
+ m.Save("creatorPID", &x.creatorPID)
+ m.Save("lastAttachDetachPID", &x.lastAttachDetachPID)
+ m.Save("pendingDestruction", &x.pendingDestruction)
+}
+
+func (x *Shm) afterLoad() {}
+func (x *Shm) load(m state.Map) {
+ m.Load("AtomicRefCount", &x.AtomicRefCount)
+ m.Load("mfp", &x.mfp)
+ m.Load("registry", &x.registry)
+ m.Load("ID", &x.ID)
+ m.Load("creator", &x.creator)
+ m.Load("size", &x.size)
+ m.Load("effectiveSize", &x.effectiveSize)
+ m.Load("fr", &x.fr)
+ m.Load("key", &x.key)
+ m.Load("perms", &x.perms)
+ m.Load("owner", &x.owner)
+ m.Load("attachTime", &x.attachTime)
+ m.Load("detachTime", &x.detachTime)
+ m.Load("changeTime", &x.changeTime)
+ m.Load("creatorPID", &x.creatorPID)
+ m.Load("lastAttachDetachPID", &x.lastAttachDetachPID)
+ m.Load("pendingDestruction", &x.pendingDestruction)
+}
+
+func init() {
+ state.Register("shm.Registry", (*Registry)(nil), state.Fns{Save: (*Registry).save, Load: (*Registry).load})
+ state.Register("shm.Shm", (*Shm)(nil), state.Fns{Save: (*Shm).save, Load: (*Shm).load})
+}
diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
new file mode 100644
index 000000000..b528ec0dc
--- /dev/null
+++ b/pkg/sentry/kernel/signal.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+)
+
+// SignalPanic is used to panic the running threads. It is a signal which
+// cannot be used by the application: it must be caught and ignored by the
+// runtime (in order to catch possible races).
+const SignalPanic = linux.SIGUSR2
+
+// sendExternalSignal is called when an asynchronous signal is sent to the
+// sentry ("in sentry context"). On some platforms, it may also be called when
+// an asynchronous signal is sent to sandboxed application threads ("in
+// application context").
+//
+// context is used only for debugging to differentiate these cases.
+//
+// Preconditions: Kernel must have an init process.
+func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) {
+ switch linux.Signal(info.Signo) {
+ case platform.SignalInterrupt:
+ // Assume that a call to platform.Context.Interrupt() misfired.
+
+ case SignalPanic:
+ // SignalPanic is also specially handled in sentry setup to ensure that
+ // it causes a panic even after tasks exit, but SignalPanic may also
+ // be sent here if it is received while in app context.
+ panic("Signal-induced panic")
+
+ default:
+ log.Infof("Received external signal %d in %s context", info.Signo, context)
+ if k.globalInit == nil {
+ panic(fmt.Sprintf("Received external signal %d before init created", info.Signo))
+ }
+ k.globalInit.SendSignal(info)
+ }
+}
+
+// SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV.
+func SignalInfoPriv(sig linux.Signal) *arch.SignalInfo {
+ return &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoKernel,
+ }
+}
+
+// SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO.
+func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo {
+ info := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoUser,
+ }
+ info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg)))
+ info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+ return info
+}
diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go
new file mode 100644
index 000000000..ce8bcb5e5
--- /dev/null
+++ b/pkg/sentry/kernel/signal_handlers.go
@@ -0,0 +1,89 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+// SignalHandlers holds information about signal actions.
+//
+// +stateify savable
+type SignalHandlers struct {
+ // mu protects actions, as well as the signal state of all tasks and thread
+ // groups using this SignalHandlers object. (See comment on
+ // ThreadGroup.signalHandlers.)
+ mu sync.Mutex `state:"nosave"`
+
+ // actions is the action to be taken upon receiving each signal.
+ actions map[linux.Signal]arch.SignalAct
+}
+
+// NewSignalHandlers returns a new SignalHandlers specifying all default
+// actions.
+func NewSignalHandlers() *SignalHandlers {
+ return &SignalHandlers{
+ actions: make(map[linux.Signal]arch.SignalAct),
+ }
+}
+
+// Fork returns a copy of sh for a new thread group.
+func (sh *SignalHandlers) Fork() *SignalHandlers {
+ sh2 := NewSignalHandlers()
+ sh.mu.Lock()
+ defer sh.mu.Unlock()
+ for sig, act := range sh.actions {
+ sh2.actions[sig] = act
+ }
+ return sh2
+}
+
+// CopyForExec returns a copy of sh for a thread group that is undergoing an
+// execve. (See comments in Task.finishExec.)
+func (sh *SignalHandlers) CopyForExec() *SignalHandlers {
+ sh2 := NewSignalHandlers()
+ sh.mu.Lock()
+ defer sh.mu.Unlock()
+ for sig, act := range sh.actions {
+ if act.Handler == arch.SignalActIgnore {
+ sh2.actions[sig] = arch.SignalAct{
+ Handler: arch.SignalActIgnore,
+ }
+ }
+ }
+ return sh2
+}
+
+// IsIgnored returns true if the signal is ignored.
+func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool {
+ sh.mu.Lock()
+ defer sh.mu.Unlock()
+ sa, ok := sh.actions[sig]
+ return ok && sa.Handler == arch.SignalActIgnore
+}
+
+// dequeueAction returns the SignalAct that should be used to handle sig.
+//
+// Preconditions: sh.mu must be locked.
+func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct {
+ act := sh.actions[sig]
+ if act.IsResetHandler() {
+ delete(sh.actions, sig)
+ }
+ return act
+}
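
As a rough editorial illustration (not part of this change): Fork copies the entire action table, while CopyForExec keeps only ignored entries, matching Linux's execve semantics (custom handlers are reset, ignored signals stay ignored). The standalone sketch below reproduces that difference with a plain map keyed by signal number and a made-up action constant, independent of gVisor's arch.SignalAct type.

package main

import "fmt"

const (
	actDefault = 0
	actIgnore  = 1
	actHandler = 2
)

// copyForFork keeps every installed action.
func copyForFork(actions map[int]int) map[int]int {
	out := make(map[int]int, len(actions))
	for sig, act := range actions {
		out[sig] = act
	}
	return out
}

// copyForExec keeps only ignored signals; custom handlers revert to default.
func copyForExec(actions map[int]int) map[int]int {
	out := make(map[int]int)
	for sig, act := range actions {
		if act == actIgnore {
			out[sig] = actIgnore
		}
	}
	return out
}

func main() {
	actions := map[int]int{1: actIgnore, 2: actHandler} // SIGHUP ignored, SIGINT handled
	fmt.Println(copyForFork(actions))                   // map[1:1 2:2]
	fmt.Println(copyForExec(actions))                   // map[1:1]
}
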
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
new file mode 100644
index 000000000..0572053db
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls.go
@@ -0,0 +1,307 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// maxSyscallNum is the highest supported syscall number.
+//
+// The types below create fast lookup slices for all syscalls. This maximum
+// serves as a sanity check that we don't allocate huge slices for a very large
+// syscall number.
+const maxSyscallNum = 2000
+
+// SyscallFn is a syscall implementation.
+type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
+
+// MissingFn is a syscall to be called when an implementation is missing.
+type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
+
+// Possible flags for SyscallFlagsTable.enable.
+const (
+ // syscallPresent indicates that this is not a missing syscall.
+ //
+ // This flag is used internally in SyscallFlagsTable.
+ syscallPresent = 1 << iota
+
+ // StraceEnableLog enables syscall log tracing.
+ StraceEnableLog
+
+ // StraceEnableEvent enables syscall event tracing.
+ StraceEnableEvent
+
+ // ExternalBeforeEnable enables the external hook before syscall execution.
+ ExternalBeforeEnable
+
+ // ExternalAfterEnable enables the external hook after syscall execution.
+ ExternalAfterEnable
+)
+
+// StraceEnableBits combines both strace log and event flags.
+const StraceEnableBits = StraceEnableLog | StraceEnableEvent
+
+// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
+// basis.
+type SyscallFlagsTable struct {
+ // mu protects writes to the fields below.
+ //
+ // Atomic loads are always allowed. Atomic stores are allowed only
+ // while mu is held.
+ mu sync.Mutex
+
+ // enable contains the enable bits for each syscall.
+ //
+	// Missing syscalls have the same value in enable as missingEnable to
+ // avoid an extra branch in Word.
+ enable []uint32
+
+ // missingEnable contains the enable bits for missing syscalls.
+ missingEnable uint32
+}
+
+// init initializes the struct, marking all syscalls in table as present.
+//
+// max is the largest syscall number in table.
+func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) {
+ e.enable = make([]uint32, max+1)
+ for num := range table {
+ e.enable[num] = syscallPresent
+ }
+}
+
+// Word returns the enable bitfield for sysno.
+func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 {
+ if sysno < uintptr(len(e.enable)) {
+ return atomic.LoadUint32(&e.enable[sysno])
+ }
+
+ return atomic.LoadUint32(&e.missingEnable)
+}
+
+// Enable sets the given enable bit for all syscalls based on s.
+//
+// Syscalls missing from s are disabled.
+//
+// Syscalls missing from the initial table passed to Init cannot be added as
+// individual syscalls. If present in s they will be ignored.
+//
+// Callers to Word may see either the old or new value while this function
+// is executing.
+func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ missingVal := atomic.LoadUint32(&e.missingEnable)
+ if missingEnable {
+ missingVal |= bit
+ } else {
+ missingVal &^= bit
+ }
+ atomic.StoreUint32(&e.missingEnable, missingVal)
+
+ for num := range e.enable {
+ val := atomic.LoadUint32(&e.enable[num])
+ if !bits.IsOn32(val, syscallPresent) {
+ // Missing.
+ atomic.StoreUint32(&e.enable[num], missingVal)
+ continue
+ }
+
+ if s[uintptr(num)] {
+ val |= bit
+ } else {
+ val &^= bit
+ }
+ atomic.StoreUint32(&e.enable[num], val)
+ }
+}
+
+// EnableAll sets the given enable bit for all syscalls, present and missing.
+func (e *SyscallFlagsTable) EnableAll(bit uint32) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ missingVal := atomic.LoadUint32(&e.missingEnable)
+ missingVal |= bit
+ atomic.StoreUint32(&e.missingEnable, missingVal)
+
+ for num := range e.enable {
+ val := atomic.LoadUint32(&e.enable[num])
+ if !bits.IsOn32(val, syscallPresent) {
+ // Missing.
+ atomic.StoreUint32(&e.enable[num], missingVal)
+ continue
+ }
+
+ val |= bit
+ atomic.StoreUint32(&e.enable[num], val)
+ }
+}
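
As a rough editorial illustration (not part of this change): SyscallFlagsTable lets readers call Word with a single atomic load while writers serialize on a mutex and publish updates with atomic stores, so readers may observe either the old or the new value but never a torn one. The standalone sketch below shows that "mutex for writers, atomics for readers" pattern over a []uint32; it is illustrative only and not gVisor's actual type.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type flagsTable struct {
	mu     sync.Mutex // serializes writers only
	enable []uint32
}

// word is the lock-free read path.
func (t *flagsTable) word(i int) uint32 {
	return atomic.LoadUint32(&t.enable[i])
}

// setBit is the locked write path; concurrent readers may observe the old or
// the new value.
func (t *flagsTable) setBit(i int, bit uint32) {
	t.mu.Lock()
	defer t.mu.Unlock()
	atomic.StoreUint32(&t.enable[i], t.word(i)|bit)
}

func main() {
	t := &flagsTable{enable: make([]uint32, 4)}
	const straceLog = 1 << 1
	t.setBit(2, straceLog)
	fmt.Println(t.word(2)&straceLog != 0) // true
}
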
+
+// Stracer traces syscall execution.
+type Stracer interface {
+ // SyscallEnter is called on syscall entry.
+ //
+ // The returned private data is passed to SyscallExit.
+ //
+ // TODO(gvisor.dev/issue/155): remove kernel imports from the strace
+ // package so that the type can be used directly.
+ SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
+
+ // SyscallExit is called on syscall exit.
+ SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
+}
+
+// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
+// is *immutable*. In order to make supporting suspend and resume sane, they
+// must be uniquely registered and may not change during operation.
+//
+// +stateify savable
+type SyscallTable struct {
+ // OS is the operating system that this syscall table implements.
+ OS abi.OS `state:"wait"`
+
+ // Arch is the architecture that this syscall table targets.
+ Arch arch.Arch `state:"wait"`
+
+ // The OS version that this syscall table implements.
+ Version Version `state:"manual"`
+
+ // AuditNumber is a numeric constant that represents the syscall table. If
+	// non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
+ // linux/audit.h.
+ AuditNumber uint32 `state:"manual"`
+
+ // Table is the collection of functions.
+ Table map[uintptr]SyscallFn `state:"manual"`
+
+ // lookup is a fixed-size array that holds the syscalls (indexed by
+ // their numbers). It is used for fast look ups.
+ lookup []SyscallFn `state:"manual"`
+
+ // Emulate is a collection of instruction addresses to emulate. The
+ // keys are addresses, and the values are system call numbers.
+ Emulate map[usermem.Addr]uintptr `state:"manual"`
+
+ // The function to call in case of a missing system call.
+ Missing MissingFn `state:"manual"`
+
+ // Stracer traces this syscall table.
+ Stracer Stracer `state:"manual"`
+
+ // External is used to handle an external callback.
+ External func(*Kernel) `state:"manual"`
+
+	// ExternalFilterBefore is called before External, prior to syscall execution.
+ // External is not called if it returns false.
+ ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// ExternalFilterAfter is called before External, after the syscall has executed.
+ // External is not called if it returns false.
+ ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+ // FeatureEnable stores the strace and one-shot enable bits.
+ FeatureEnable SyscallFlagsTable `state:"manual"`
+}
+
+// allSyscallTables contains all known tables.
+var allSyscallTables []*SyscallTable
+
+// SyscallTables returns a read-only slice of registered SyscallTables.
+func SyscallTables() []*SyscallTable {
+ return allSyscallTables
+}
+
+// LookupSyscallTable returns the SyscallTable for the OS/Arch combination.
+func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
+ for _, s := range allSyscallTables {
+ if s.OS == os && s.Arch == a {
+ return s, true
+ }
+ }
+ return nil, false
+}
+
+// RegisterSyscallTable registers a new syscall table for use by a Kernel.
+func RegisterSyscallTable(s *SyscallTable) {
+ if s.Table == nil {
+ // Ensure non-nil lookup table.
+ s.Table = make(map[uintptr]SyscallFn)
+ }
+ if s.Emulate == nil {
+ // Ensure non-nil emulate table.
+ s.Emulate = make(map[usermem.Addr]uintptr)
+ }
+
+ var max uintptr
+ for num := range s.Table {
+ if num > max {
+ max = num
+ }
+ }
+
+ if max > maxSyscallNum {
+ panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
+ }
+
+ s.lookup = make([]SyscallFn, max+1)
+
+ // Initialize the fast-lookup table.
+ for num, fn := range s.Table {
+ s.lookup[num] = fn
+ }
+
+ s.FeatureEnable.init(s.Table, max)
+
+ if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
+ panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
+ }
+
+ // Save a reference to this table.
+ //
+ // This is required for a Kernel to find the table and for save/restore
+ // operations below.
+ allSyscallTables = append(allSyscallTables, s)
+}
+
+// Lookup returns the syscall implementation, if one exists.
+func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
+ if sysno < uintptr(len(s.lookup)) {
+ return s.lookup[sysno]
+ }
+
+ return nil
+}
+
+// LookupEmulate looks up an emulation syscall number.
+func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
+ sysno, ok := s.Emulate[addr]
+ return sysno, ok
+}
+
+// mapLookup is similar to Lookup, except that it only uses the syscall table,
+// that is, it skips the fast lookup array. This is available for benchmarking.
+func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
+ return s.Table[sysno]
+}
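
As a rough editorial illustration (not part of this change): RegisterSyscallTable densifies the sparse Table map into an index-addressed slice sized by the largest syscall number, bounded by maxSyscallNum, so dispatch is a single slice index instead of a map lookup. The standalone sketch below shows that construction with a made-up handler type rather than gVisor's SyscallFn.

package main

import "fmt"

type handler func() string

// buildLookup densifies a sparse syscall map into an index-addressed slice.
func buildLookup(table map[uintptr]handler, maxAllowed uintptr) []handler {
	var max uintptr
	for num := range table {
		if num > max {
			max = num
		}
	}
	if max > maxAllowed {
		panic(fmt.Sprintf("syscall number %d exceeds limit %d", max, maxAllowed))
	}
	lookup := make([]handler, max+1)
	for num, fn := range table {
		lookup[num] = fn
	}
	return lookup
}

func main() {
	table := map[uintptr]handler{
		0: func() string { return "read" },
		1: func() string { return "write" },
	}
	lookup := buildLookup(table, 2000)
	if fn := lookup[1]; fn != nil {
		fmt.Println(fn()) // write
	}
}
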
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
new file mode 100644
index 000000000..00358326b
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "fmt"
+
+// afterLoad is invoked by stateify.
+func (s *SyscallTable) afterLoad() {
+ otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
+ if !ok {
+ // Couldn't find a reference?
+ panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+ }
+
+ // Copy the table.
+ *s = *otherTable
+}
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
new file mode 100644
index 000000000..175d1b247
--- /dev/null
+++ b/pkg/sentry/kernel/syslog.go
@@ -0,0 +1,106 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "math/rand"
+ "sync"
+)
+
+// syslog represents a sentry-global kernel log.
+//
+// Currently, it contains only fun messages for a dmesg easter egg.
+//
+// +stateify savable
+type syslog struct {
+ // mu protects the below.
+ mu sync.Mutex `state:"nosave"`
+
+ // msg is the syslog message buffer. It is lazily initialized.
+ msg []byte
+}
+
+// Log returns a copy of the syslog.
+func (s *syslog) Log() []byte {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ if s.msg != nil {
+ // Already initialized, just return a copy.
+ o := make([]byte, len(s.msg))
+ copy(o, s.msg)
+ return o
+ }
+
+ // Not initialized, create message.
+ allMessages := []string{
+ "Synthesizing system calls...",
+ "Mounting deweydecimalfs...",
+ "Moving files to filing cabinet...",
+ "Digging up root...",
+ "Constructing home...",
+ "Segmenting fault lines...",
+ "Creating bureaucratic processes...",
+ "Searching for needles in stacks...",
+ "Preparing for the zombie uprising...",
+ "Feeding the init monster...",
+ "Creating cloned children...",
+ "Daemonizing children...",
+ "Waiting for children...",
+ "Gathering forks...",
+ "Committing treasure map to memory...",
+ "Reading process obituaries...",
+ "Searching for socket adapter...",
+ "Creating process schedule...",
+ "Generating random numbers by fair dice roll...",
+ "Rewriting operating system in Javascript...",
+ "Consulting tar man page...",
+ "Forking spaghetti code...",
+ "Checking naughty and nice process list...",
+ "Checking naughty and nice process list...", // Check it up to twice.
+ "Granting licence to kill(2)...", // British spelling for British movie.
+ "Letting the watchdogs out...",
+ }
+
+ selectMessage := func() string {
+ i := rand.Intn(len(allMessages))
+ m := allMessages[i]
+
+ // Delete the selected message.
+ allMessages[i] = allMessages[len(allMessages)-1]
+ allMessages = allMessages[:len(allMessages)-1]
+
+ return m
+ }
+
+ const format = "<6>[%11.6f] %s\n"
+
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, 0.0, "Starting gVisor..."))...)
+
+ time := 0.1
+ for i := 0; i < 10; i++ {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...)
+ }
+
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...)
+
+ // Return a copy.
+ o := make([]byte, len(s.msg))
+ copy(o, s.msg)
+ return o
+}
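
As a rough editorial illustration (not part of this change): selectMessage draws each message at most once by swapping the chosen element with the last one and shrinking the slice, i.e. sampling without replacement in O(1) per draw. The standalone sketch below isolates that technique; the strings are placeholders, not the messages above.

package main

import (
	"fmt"
	"math/rand"
)

// drawWithoutReplacement removes and returns a random element by swapping it
// with the last element and truncating the slice.
func drawWithoutReplacement(items []string) (string, []string) {
	i := rand.Intn(len(items))
	m := items[i]
	items[i] = items[len(items)-1]
	return m, items[:len(items)-1]
}

func main() {
	msgs := []string{"a", "b", "c"}
	for len(msgs) > 0 {
		var m string
		m, msgs = drawWithoutReplacement(msgs)
		fmt.Println(m) // each of a, b, c exactly once, in random order
	}
}
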
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
new file mode 100644
index 000000000..f9378c2de
--- /dev/null
+++ b/pkg/sentry/kernel/task.go
@@ -0,0 +1,723 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/third_party/gvsync"
+)
+
+// Task represents a thread of execution in the untrusted app. It
+// includes registers and any thread-specific state that you would
+// normally expect.
+//
+// Each task is associated with a goroutine, called the task goroutine, that
+// executes code (application code, system calls, etc.) on behalf of that task.
+// See Task.run (task_run.go).
+//
+// All fields that are "owned by the task goroutine" can only be mutated by the
+// task goroutine while it is running. The task goroutine does not require
+// synchronization to read these fields, although it still requires
+// synchronization as described for those fields to mutate them.
+//
+// All fields that are "exclusive to the task goroutine" can only be accessed
+// by the task goroutine while it is running. The task goroutine does not
+// require synchronization to read or write these fields.
+//
+// +stateify savable
+type Task struct {
+ taskNode
+
+ // runState is what the task goroutine is executing if it is not stopped.
+ // If runState is nil, the task goroutine should exit or has exited.
+ // runState is exclusive to the task goroutine.
+ runState taskRunState
+
+ // haveSyscallReturn is true if tc.Arch().Return() represents a value
+ // returned by a syscall (or set by ptrace after a syscall).
+ //
+ // haveSyscallReturn is exclusive to the task goroutine.
+ haveSyscallReturn bool
+
+ // interruptChan is notified whenever the task goroutine is interrupted
+ // (usually by a pending signal). interruptChan is effectively a condition
+ // variable that can be used in select statements.
+ //
+ // interruptChan is not saved; because saving interrupts all tasks,
+ // interruptChan is always notified after restore (see Task.run).
+ interruptChan chan struct{} `state:"nosave"`
+
+ // gosched contains the current scheduling state of the task goroutine.
+ //
+ // gosched is protected by goschedSeq. gosched is owned by the task
+ // goroutine.
+ goschedSeq gvsync.SeqCount `state:"nosave"`
+ gosched TaskGoroutineSchedInfo
+
+ // yieldCount is the number of times the task goroutine has called
+ // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
+ // Task.Yield(), voluntarily ceasing execution.
+ //
+ // yieldCount is accessed using atomic memory operations. yieldCount is
+ // owned by the task goroutine.
+ yieldCount uint64
+
+ // pendingSignals is the set of pending signals that may be handled only by
+ // this task.
+ //
+ // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
+ // (hereafter "the signal mutex"); see comment on
+ // ThreadGroup.signalHandlers.
+ pendingSignals pendingSignals
+
+ // signalMask is the set of signals whose delivery is currently blocked.
+ //
+ // signalMask is accessed using atomic memory operations, and is protected
+ // by the signal mutex (such that reading signalMask is safe if either the
+ // signal mutex is locked or if atomic memory operations are used, while
+ // writing signalMask requires both). signalMask is owned by the task
+ // goroutine.
+ signalMask linux.SignalSet
+
+ // If the task goroutine is currently executing Task.sigtimedwait,
+ // realSignalMask is the previous value of signalMask, which has temporarily
+ // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
+ //
+ // realSignalMask is exclusive to the task goroutine.
+ realSignalMask linux.SignalSet
+
+ // If haveSavedSignalMask is true, savedSignalMask is the signal mask that
+ // should be applied after the task has either delivered one signal to a
+ // user handler or is about to resume execution in the untrusted
+ // application.
+ //
+ // Both haveSavedSignalMask and savedSignalMask are exclusive to the task
+ // goroutine.
+ haveSavedSignalMask bool
+ savedSignalMask linux.SignalSet
+
+ // signalStack is the alternate signal stack used by signal handlers for
+ // which the SA_ONSTACK flag is set.
+ //
+ // signalStack is exclusive to the task goroutine.
+ signalStack arch.SignalStack
+
+ // If groupStopPending is true, the task should participate in a group
+ // stop in the interrupt path.
+ //
+ // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
+ //
+ // groupStopPending is protected by the signal mutex.
+ groupStopPending bool
+
+ // If groupStopAcknowledged is true, the task has already acknowledged that
+ // it is entering the most recent group stop that has been initiated on its
+ // thread group.
+ //
+ // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
+ //
+ // groupStopAcknowledged is protected by the signal mutex.
+ groupStopAcknowledged bool
+
+ // If trapStopPending is true, the task goroutine should enter a
+ // PTRACE_INTERRUPT-induced stop from the interrupt path.
+ //
+ // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
+ // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
+ // JOBCTL_STOP_PENDING.
+ //
+ // trapStopPending is protected by the signal mutex.
+ trapStopPending bool
+
+ // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
+ // stop has begun or ended since the last time the task entered a
+ // ptrace-stop from the group-stop path.
+ //
+ // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
+ //
+ // trapNotifyPending is protected by the signal mutex.
+ trapNotifyPending bool
+
+ // If stop is not nil, it is the internally-initiated condition that
+ // currently prevents the task goroutine from running.
+ //
+ // stop is protected by the signal mutex.
+ stop TaskStop
+
+ // stopCount is the number of active external stops (calls to
+ // Task.BeginExternalStop that have not been paired with a call to
+ // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
+ // non-zero if the task goroutine should stop.
+ //
+ // Mutating stopCount requires both locking the signal mutex and using
+ // atomic memory operations. Reading stopCount requires either locking the
+ // signal mutex or using atomic memory operations. This allows Task.doStop
+ // to require only a single atomic read in the common case where stopCount
+ // is 0.
+ //
+ // stopCount is not saved, because external stops cannot be retained across
+ // a save/restore cycle. (Suppose a sentryctl command issues an external
+ // stop; after a save/restore cycle, the restored sentry has no knowledge
+ // of the pre-save sentryctl command, and the stopped task would remain
+ // stopped forever.)
+ stopCount int32 `state:"nosave"`
+
+ // endStopCond is signaled when stopCount transitions to 0. The combination
+ // of stopCount and endStopCond effectively form a sync.WaitGroup, but
+ // WaitGroup provides no way to read its counter value.
+ //
+ // Invariant: endStopCond.L is the signal mutex. (This is not racy because
+ // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
+ // calls sync.Cond.Wait; and only the task goroutine can change the
+ // identity of the signal mutex, in Task.finishExec.)
+ endStopCond sync.Cond `state:"nosave"`
+
+ // exitStatus is the task's exit status.
+ //
+ // exitStatus is protected by the signal mutex.
+ exitStatus ExitStatus
+
+ // syscallRestartBlock represents a custom restart function to run in
+ // restart_syscall(2) to resume an interrupted syscall.
+ //
+ // syscallRestartBlock is exclusive to the task goroutine.
+ syscallRestartBlock SyscallRestartBlock
+
+ // p provides the mechanism by which the task runs code in userspace. The p
+ // interface object is immutable.
+ p platform.Context `state:"nosave"`
+
+ // k is the Kernel that this task belongs to. The k pointer is immutable.
+ k *Kernel
+
+ // containerID has no equivalent in Linux; it's used by runsc to track all
+	// tasks that belong to a given container, since cgroups aren't implemented.
+ // It's inherited by the children, is immutable, and may be empty.
+ //
+ // NOTE: cgroups can be used to track this when implemented.
+ containerID string
+
+ // mu protects some of the following fields.
+ mu sync.Mutex `state:"nosave"`
+
+ // tc holds task data provided by the ELF loader.
+ //
+ // tc is protected by mu, and is owned by the task goroutine.
+ tc TaskContext
+
+ // fsc is the task's filesystem context.
+ //
+ // fsc is protected by mu, and is owned by the task goroutine.
+ fsc *FSContext
+
+ // fds is the task's file descriptor table.
+ //
+ // fds is protected by mu, and is owned by the task goroutine.
+ fds *FDMap
+
+ // If vforkParent is not nil, it is the task that created this task with
+ // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
+ // this TaskContext is released.
+ //
+ // vforkParent is protected by the TaskSet mutex.
+ vforkParent *Task
+
+ // exitState is the task's progress through the exit path.
+ //
+ // exitState is protected by the TaskSet mutex. exitState is owned by the
+ // task goroutine.
+ exitState TaskExitState
+
+ // exitTracerNotified is true if the exit path has either signaled the
+ // task's tracer to indicate the exit, or determined that no such signal is
+ // needed. exitTracerNotified can only be true if exitState is
+ // TaskExitZombie or TaskExitDead.
+ //
+ // exitTracerNotified is protected by the TaskSet mutex.
+ exitTracerNotified bool
+
+ // exitTracerAcked is true if exitTracerNotified is true and either the
+ // task's tracer has acknowledged the exit notification, or the exit path
+ // has determined that no such notification is needed.
+ //
+ // exitTracerAcked is protected by the TaskSet mutex.
+ exitTracerAcked bool
+
+ // exitParentNotified is true if the exit path has either signaled the
+ // task's parent to indicate the exit, or determined that no such signal is
+ // needed. exitParentNotified can only be true if exitState is
+ // TaskExitZombie or TaskExitDead.
+ //
+ // exitParentNotified is protected by the TaskSet mutex.
+ exitParentNotified bool
+
+ // exitParentAcked is true if exitParentNotified is true and either the
+ // task's parent has acknowledged the exit notification, or the exit path
+ // has determined that no such acknowledgment is needed.
+ //
+ // exitParentAcked is protected by the TaskSet mutex.
+ exitParentAcked bool
+
+ // goroutineStopped is a WaitGroup whose counter value is 1 when the task
+ // goroutine is running and 0 when the task goroutine is stopped or has
+ // exited.
+ goroutineStopped sync.WaitGroup `state:"nosave"`
+
+ // ptraceTracer is the task that is ptrace-attached to this one. If
+ // ptraceTracer is nil, this task is not being traced. Note that due to
+ // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil
+ // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)).
+ //
+ // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
+ // operations. This allows paths that wouldn't otherwise lock the TaskSet
+ // mutex, notably the syscall path, to check if ptraceTracer is nil without
+ // additional synchronization.
+ ptraceTracer atomic.Value `state:".(*Task)"`
+
+ // ptraceTracees is the set of tasks that this task is ptrace-attached to.
+ //
+ // ptraceTracees is protected by the TaskSet mutex.
+ ptraceTracees map[*Task]struct{}
+
+ // ptraceSeized is true if ptraceTracer attached to this task with
+ // PTRACE_SEIZE.
+ //
+ // ptraceSeized is protected by the TaskSet mutex.
+ ptraceSeized bool
+
+ // ptraceOpts contains ptrace options explicitly set by the tracer. If
+ // ptraceTracer is nil, ptraceOpts is expected to be the zero value.
+ //
+ // ptraceOpts is protected by the TaskSet mutex.
+ ptraceOpts ptraceOptions
+
+ // ptraceSyscallMode controls ptrace behavior around syscall entry and
+ // exit.
+ //
+ // ptraceSyscallMode is protected by the TaskSet mutex.
+ ptraceSyscallMode ptraceSyscallMode
+
+ // If ptraceSinglestep is true, the next time the task executes application
+ // code, single-stepping should be enabled. ptraceSinglestep is stored
+ // independently of the architecture-specific trap flag because tracer
+ // detaching (which can happen concurrently with the tracee's execution if
+ // the tracer exits) must disable single-stepping, and the task's
+ // architectural state is implicitly exclusive to the task goroutine (no
+ // synchronization occurs before passing registers to SwitchToApp).
+ //
+ // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
+ //
+ // ptraceSinglestep is protected by the TaskSet mutex.
+ ptraceSinglestep bool
+
+ // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
+ // time that t entered the ptrace stop, reset to 0 when the tracer
+ // acknowledges the stop with a wait*() syscall. Otherwise, it is the
+ // signal number passed to the ptrace operation that ended the last ptrace
+ // stop on this task. In the latter case, the effect of ptraceCode depends
+ // on the nature of the ptrace stop; signal-delivery-stop uses it to
+ // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
+ // signal to the task after leaving the stop, and PTRACE_EVENT stops and
+ // traced group stops ignore it entirely.
+ //
+ // Linux contextually stores the equivalent of ptraceCode in
+ // task_struct::exit_code.
+ //
+ // ptraceCode is protected by the TaskSet mutex.
+ ptraceCode int32
+
+ // ptraceSiginfo is the value returned to the tracer by
+ // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
+ // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
+ // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
+ // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
+ // is in turn required to distinguish group stops from other ptrace stops,
+ // per subsection "Group-stop" in ptrace(2)).
+ //
+ // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
+ //
+ // ptraceSiginfo is protected by the TaskSet mutex.
+ ptraceSiginfo *arch.SignalInfo
+
+ // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
+ // the tracer by ptrace(PTRACE_GETEVENTMSG).
+ //
+ // ptraceEventMsg is protected by the TaskSet mutex.
+ ptraceEventMsg uint64
+
+ // The struct that holds the IO-related usage. The ioUsage pointer is
+ // immutable.
+ ioUsage *usage.IO
+
+ // logPrefix is a string containing the task's thread ID in the root PID
+ // namespace, and is prepended to log messages emitted by Task.Infof etc.
+ logPrefix atomic.Value `state:".(string)"`
+
+ // creds is the task's credentials.
+ //
+ // creds is protected by mu, however the value itself is immutable and can
+ // only be changed by a copy. After reading the pointer, access will
+ // proceed outside the scope of mu. creds is owned by the task goroutine.
+ creds *auth.Credentials
+
+ // utsns is the task's UTS namespace.
+ //
+ // utsns is protected by mu. utsns is owned by the task goroutine.
+ utsns *UTSNamespace
+
+ // ipcns is the task's IPC namespace.
+ //
+ // ipcns is protected by mu. ipcns is owned by the task goroutine.
+ ipcns *IPCNamespace
+
+ // abstractSockets tracks abstract sockets that are in use.
+ //
+ // abstractSockets is protected by mu.
+ abstractSockets *AbstractSocketNamespace
+
+ // parentDeathSignal is sent to this task's thread group when its parent exits.
+ //
+ // parentDeathSignal is protected by mu.
+ parentDeathSignal linux.Signal
+
+ // syscallFilters is all seccomp-bpf syscall filters applicable to the
+ // task, in the order in which they were installed. The type of the atomic
+ // is []bpf.Program. Writing needs to be protected by the signal mutex.
+ //
+ // syscallFilters is owned by the task goroutine.
+ syscallFilters atomic.Value `state:".([]bpf.Program)"`
+
+ // If cleartid is non-zero, treat it as a pointer to a ThreadID in the
+ // task's virtual address space; when the task exits, set the pointed-to
+ // ThreadID to 0, and wake any futex waiters.
+ //
+ // cleartid is exclusive to the task goroutine.
+ cleartid usermem.Addr
+
+ // This is mostly a fake cpumask just for sched_set/getaffinity as we
+ // don't really control the affinity.
+ //
+ // Invariant: allowedCPUMask.Size() ==
+ // sched.CPUMaskSize(Kernel.applicationCores).
+ //
+ // allowedCPUMask is protected by mu.
+ allowedCPUMask sched.CPUSet
+
+ // cpu is the fake cpu number returned by getcpu(2). cpu is ignored
+ // entirely if Kernel.useHostCores is true.
+ //
+ // cpu is accessed using atomic memory operations.
+ cpu int32
+
+ // This is used to keep track of changes made to a process' priority/niceness.
+ // It is mostly used to provide some reasonable return value from
+ // getpriority(2) after a call to setpriority(2) has been made.
+ // We currently do not actually modify a process' scheduling priority.
+ // NOTE: This represents the userspace view of priority (nice).
+ // This means that the value should be in the range [-20, 19].
+ //
+ // niceness is protected by mu.
+ niceness int
+
+ // This is used to track the numa policy for the current thread. This can be
+ // modified through a set_mempolicy(2) syscall. Since we always report a
+ // single numa node, all policies are no-ops. We only track this information
+ // so that we can return reasonable values if the application calls
+ // get_mempolicy(2) after setting a non-default policy. Note that in the
+ // real syscall, nodemask can be longer than 4 bytes, but we always report a
+ // single node so never need to save more than a single bit.
+ //
+ // numaPolicy and numaNodeMask are protected by mu.
+ numaPolicy int32
+ numaNodeMask uint32
+
+ // If netns is true, the task is in a non-root network namespace. Network
+ // namespaces aren't currently implemented in full; being in a network
+ // namespace simply prevents the task from observing any network devices
+ // (including loopback) or using abstract socket addresses (see unix(7)).
+ //
+ // netns is protected by mu. netns is owned by the task goroutine.
+ netns bool
+
+ // If rseqPreempted is true, before the next call to p.Switch(), interrupt
+ // RSEQ critical regions as defined by tg.rseq and write the task
+ // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number
+ // written to rseqCPUAddr.
+ //
+ // If rseqCPUAddr is 0, rseqCPU is -1.
+ //
+ // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task
+ // goroutine.
+ rseqPreempted bool `state:"nosave"`
+ rseqCPUAddr usermem.Addr
+ rseqCPU int32
+
+ // copyScratchBuffer is a buffer available to CopyIn/CopyOut
+ // implementations that require an intermediate buffer to copy data
+ // into/out of. It prevents these buffers from being allocated/zeroed in
+ // each syscall and eventually garbage collected.
+ //
+ // copyScratchBuffer is exclusive to the task goroutine.
+ copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`
+
+ // blockingTimer is used for blocking timeouts. blockingTimerChan is the
+ // channel that is sent to when blockingTimer fires.
+ //
+ // blockingTimer is exclusive to the task goroutine.
+ blockingTimer *ktime.Timer `state:"nosave"`
+ blockingTimerChan <-chan struct{} `state:"nosave"`
+
+ // futexWaiter is used for futex(FUTEX_WAIT) syscalls.
+ //
+ // futexWaiter is exclusive to the task goroutine.
+ futexWaiter *futex.Waiter `state:"nosave"`
+
+ // startTime is the real time at which the task started. It is set when
+ // a Task is created or invokes execve(2).
+ //
+ // startTime is protected by mu.
+ startTime ktime.Time
+}
+
+func (t *Task) savePtraceTracer() *Task {
+ return t.ptraceTracer.Load().(*Task)
+}
+
+func (t *Task) loadPtraceTracer(tracer *Task) {
+ t.ptraceTracer.Store(tracer)
+}
+
+func (t *Task) saveLogPrefix() string {
+ return t.logPrefix.Load().(string)
+}
+
+func (t *Task) loadLogPrefix(prefix string) {
+ t.logPrefix.Store(prefix)
+}
+
+func (t *Task) saveSyscallFilters() []bpf.Program {
+ if f := t.syscallFilters.Load(); f != nil {
+ return f.([]bpf.Program)
+ }
+ return nil
+}
+
+func (t *Task) loadSyscallFilters(filters []bpf.Program) {
+ t.syscallFilters.Store(filters)
+}
+
+// afterLoad is invoked by stateify.
+func (t *Task) afterLoad() {
+ t.interruptChan = make(chan struct{}, 1)
+ t.gosched.State = TaskGoroutineNonexistent
+ if t.stop != nil {
+ t.stopCount = 1
+ }
+ t.endStopCond.L = &t.tg.signalHandlers.mu
+ t.p = t.k.Platform.NewContext()
+ t.rseqPreempted = true
+ t.futexWaiter = futex.NewWaiter()
+}
+
+// copyScratchBufferLen is the length of Task.copyScratchBuffer.
+const copyScratchBufferLen = 144 // sizeof(struct stat)
+
+// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut
+// functions. It must only be used within those functions and can only be used
+// by the task goroutine; it exists to improve performance and thus
+// intentionally lacks any synchronization.
+//
+// Callers should pass a constant value as an argument if possible, which will
+// allow the compiler to inline and optimize out the if statement below.
+func (t *Task) CopyScratchBuffer(size int) []byte {
+ if size > copyScratchBufferLen {
+ return make([]byte, size)
+ }
+ return t.copyScratchBuffer[:size]
+}
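
As a rough editorial illustration (not part of this change): CopyScratchBuffer reuses a fixed per-task array for small copies and only allocates when the request exceeds it, trading a little per-task memory for fewer garbage-collected allocations on the syscall path. The standalone sketch below shows the pattern with arbitrary sizes, not gVisor's constants.

package main

import "fmt"

const scratchLen = 144

type worker struct {
	scratch [scratchLen]byte // reused across calls; single goroutine only, no synchronization
}

// buffer returns the reusable scratch array when it is large enough,
// and falls back to a fresh allocation otherwise.
func (w *worker) buffer(size int) []byte {
	if size > scratchLen {
		return make([]byte, size)
	}
	return w.scratch[:size]
}

func main() {
	var w worker
	fmt.Println(len(w.buffer(64)), cap(w.buffer(64)))     // 64 144: backed by the array
	fmt.Println(len(w.buffer(4096)), cap(w.buffer(4096))) // 4096 4096: heap allocated
}
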
+
+// FutexWaiter returns the Task's futex.Waiter.
+func (t *Task) FutexWaiter() *futex.Waiter {
+ return t.futexWaiter
+}
+
+// Kernel returns the Kernel containing t.
+func (t *Task) Kernel() *Kernel {
+ return t.k
+}
+
+// Value implements context.Context.Value.
+//
+// Preconditions: The caller must be running on the task goroutine (as implied
+// by the requirements of context.Context).
+func (t *Task) Value(key interface{}) interface{} {
+ switch key {
+ case CtxCanTrace:
+ return t.CanTrace
+ case CtxKernel:
+ return t.k
+ case CtxPIDNamespace:
+ return t.tg.pidns
+ case CtxUTSNamespace:
+ return t.utsns
+ case CtxIPCNamespace:
+ return t.ipcns
+ case CtxTask:
+ return t
+ case auth.CtxCredentials:
+ return t.creds
+ case context.CtxThreadGroupID:
+ return int32(t.ThreadGroup().ID())
+ case fs.CtxRoot:
+ return t.fsc.RootDirectory()
+ case fs.CtxDirentCacheLimiter:
+ return t.k.DirentCacheLimiter
+ case inet.CtxStack:
+ return t.NetworkContext()
+ case ktime.CtxRealtimeClock:
+ return t.k.RealtimeClock()
+ case limits.CtxLimits:
+ return t.tg.limits
+ case pgalloc.CtxMemoryFile:
+ return t.k.mf
+ case pgalloc.CtxMemoryFileProvider:
+ return t.k
+ case platform.CtxPlatform:
+ return t.k
+ case uniqueid.CtxGlobalUniqueID:
+ return t.k.UniqueID()
+ case uniqueid.CtxGlobalUniqueIDProvider:
+ return t.k
+ case uniqueid.CtxInotifyCookie:
+ return t.k.GenerateInotifyCookie()
+ case unimpl.CtxEvents:
+ return t.k
+ default:
+ return nil
+ }
+}
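
As a rough editorial illustration (not part of this change): Task.Value dispatches on well-known key values so a Task can serve directly as a context.Context without a chain of WithValue wrappers. The standalone sketch below shows that key-switch approach with hypothetical key constants, not gVisor's.

package main

import "fmt"

// contextID is a hypothetical key type; distinct constants avoid collisions.
type contextID int

const (
	ctxKernel contextID = iota
	ctxPID
)

type task struct {
	kernelName string
	pid        int
}

// Value resolves well-known keys directly from task state.
func (t *task) Value(key interface{}) interface{} {
	switch key {
	case ctxKernel:
		return t.kernelName
	case ctxPID:
		return t.pid
	default:
		return nil
	}
}

func main() {
	t := &task{kernelName: "sentry", pid: 42}
	fmt.Println(t.Value(ctxKernel)) // sentry
	fmt.Println(t.Value(ctxPID))    // 42
	fmt.Println(t.Value("other"))   // <nil>
}
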
+
+// SetClearTID sets t's cleartid.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) SetClearTID(addr usermem.Addr) {
+ t.cleartid = addr
+}
+
+// SetSyscallRestartBlock sets the restart block for use in
+// restart_syscall(2). After registering a restart block, a syscall should
+// return ERESTART_RESTARTBLOCK to request a restart using the block.
+//
+// Precondition: The caller must be running on the task goroutine.
+func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) {
+ t.syscallRestartBlock = r
+}
+
+// SyscallRestartBlock returns the currently registered restart block for use in
+// restart_syscall(2). This function is *not* idempotent and may be called at most once
+// per syscall. This function must not be called if a restart block has not been
+// registered for the current syscall.
+//
+// Precondition: The caller must be running on the task goroutine.
+func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
+ r := t.syscallRestartBlock
+ // Explicitly set the restart block to nil so that a future syscall can't
+ // accidentally reuse it.
+ t.syscallRestartBlock = nil
+ return r
+}
+
+// IsChrooted returns true if the root directory of t's FSContext is not the
+// root directory of t's MountNamespace.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) IsChrooted() bool {
+ realRoot := t.k.mounts.Root()
+ defer realRoot.DecRef()
+ root := t.fsc.RootDirectory()
+ if root != nil {
+ defer root.DecRef()
+ }
+ return root != realRoot
+}
+
+// TaskContext returns t's TaskContext.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) TaskContext() *TaskContext {
+ return &t.tc
+}
+
+// FSContext returns t's FSContext. FSContext does not take an additional
+// reference on the returned FSContext.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) FSContext() *FSContext {
+ return t.fsc
+}
+
+// FDMap returns t's FDMap. FDMap does not take an additional reference on the
+// returned FDMap.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) FDMap() *FDMap {
+ return t.fds
+}
+
+// WithMuLocked executes f with t.mu locked.
+func (t *Task) WithMuLocked(f func(*Task)) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ f(t)
+}
+
+// MountNamespace returns t's MountNamespace. MountNamespace does not take an
+// additional reference on the returned MountNamespace.
+func (t *Task) MountNamespace() *fs.MountNamespace {
+ return t.k.mounts
+}
+
+// AbstractSockets returns t's AbstractSocketNamespace.
+func (t *Task) AbstractSockets() *AbstractSocketNamespace {
+ return t.abstractSockets
+}
+
+// ContainerID returns t's container ID.
+func (t *Task) ContainerID() string {
+ return t.containerID
+}
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
new file mode 100644
index 000000000..1ca2a82eb
--- /dev/null
+++ b/pkg/sentry/kernel/task_acct.go
@@ -0,0 +1,196 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Accounting, limits, timers.
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Getitimer implements getitimer(2).
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) {
+ var tm ktime.Time
+ var s ktime.Setting
+ switch id {
+ case linux.ITIMER_REAL:
+ tm, s = t.tg.itimerRealTimer.Get()
+ case linux.ITIMER_VIRTUAL:
+ tm = t.tg.UserCPUClock().Now()
+ t.tg.signalHandlers.mu.Lock()
+ s, _ = t.tg.itimerVirtSetting.At(tm)
+ t.tg.signalHandlers.mu.Unlock()
+ case linux.ITIMER_PROF:
+ tm = t.tg.CPUClock().Now()
+ t.tg.signalHandlers.mu.Lock()
+ s, _ = t.tg.itimerProfSetting.At(tm)
+ t.tg.signalHandlers.mu.Unlock()
+ default:
+ return linux.ItimerVal{}, syserror.EINVAL
+ }
+ val, iv := ktime.SpecFromSetting(tm, s)
+ return linux.ItimerVal{
+ Value: linux.DurationToTimeval(val),
+ Interval: linux.DurationToTimeval(iv),
+ }, nil
+}
+
+// Setitimer implements setitimer(2).
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) {
+ var tm ktime.Time
+ var olds ktime.Setting
+ switch id {
+ case linux.ITIMER_REAL:
+ news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock())
+ if err != nil {
+ return linux.ItimerVal{}, err
+ }
+ tm, olds = t.tg.itimerRealTimer.Swap(news)
+ case linux.ITIMER_VIRTUAL:
+ c := t.tg.UserCPUClock()
+ var err error
+ t.k.cpuClockTicker.Atomically(func() {
+ tm = c.Now()
+ var news ktime.Setting
+ news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm)
+ if err != nil {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ olds = t.tg.itimerVirtSetting
+ t.tg.itimerVirtSetting = news
+ t.tg.updateCPUTimersEnabledLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ })
+ if err != nil {
+ return linux.ItimerVal{}, err
+ }
+ case linux.ITIMER_PROF:
+ c := t.tg.CPUClock()
+ var err error
+ t.k.cpuClockTicker.Atomically(func() {
+ tm = c.Now()
+ var news ktime.Setting
+ news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm)
+ if err != nil {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ olds = t.tg.itimerProfSetting
+ t.tg.itimerProfSetting = news
+ t.tg.updateCPUTimersEnabledLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ })
+ if err != nil {
+ return linux.ItimerVal{}, err
+ }
+ default:
+ return linux.ItimerVal{}, syserror.EINVAL
+ }
+ oldval, oldiv := ktime.SpecFromSetting(tm, olds)
+ return linux.ItimerVal{
+ Value: linux.DurationToTimeval(oldval),
+ Interval: linux.DurationToTimeval(oldiv),
+ }, nil
+}
+
+// IOUsage returns the io usage of the thread.
+func (t *Task) IOUsage() *usage.IO {
+ return t.ioUsage
+}
+
+// IOUsage returns the total io usage of all dead and live threads in the group.
+func (tg *ThreadGroup) IOUsage() *usage.IO {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+
+ io := *tg.ioUsage
+ // Account for active tasks.
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ io.Accumulate(t.IOUsage())
+ }
+ return &io
+}
+
+// Name returns t's name.
+func (t *Task) Name() string {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.tc.Name
+}
+
+// SetName changes t's name.
+func (t *Task) SetName(name string) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.tc.Name = name
+ t.Debugf("Set thread name to %q", name)
+}
+
+// Limits implements context.Context.Limits.
+func (t *Task) Limits() *limits.LimitSet {
+ return t.ThreadGroup().Limits()
+}
+
+// StartTime returns t's start time.
+func (t *Task) StartTime() ktime.Time {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.startTime
+}
+
+// MaxRSS returns the maximum resident set size of the task in bytes. The which
+// argument should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or
+// RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these
+// flags.
+func (t *Task) MaxRSS(which int32) uint64 {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+
+ switch which {
+ case linux.RUSAGE_SELF, linux.RUSAGE_THREAD:
+ // If there's an active mm we can use its value.
+ if mm := t.MemoryManager(); mm != nil {
+ if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS {
+ return mmMaxRSS
+ }
+ }
+ return t.tg.maxRSS
+ case linux.RUSAGE_CHILDREN:
+ return t.tg.childMaxRSS
+ case linux.RUSAGE_BOTH:
+ maxRSS := t.tg.maxRSS
+ if maxRSS < t.tg.childMaxRSS {
+ maxRSS = t.tg.childMaxRSS
+ }
+ if mm := t.MemoryManager(); mm != nil {
+ if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS {
+ return mmMaxRSS
+ }
+ }
+ return maxRSS
+ default:
+ // We'll only get here if which is invalid.
+ return 0
+ }
+}
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
new file mode 100644
index 000000000..1c76c4d84
--- /dev/null
+++ b/pkg/sentry/kernel/task_block.go
@@ -0,0 +1,212 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "time"
+
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// BlockWithTimeout blocks t until an event is received from C, the application
+// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true),
+// or t is interrupted. It returns:
+//
+// - The remaining timeout, which is guaranteed to be 0 if the timeout expired,
+// and is unspecified if haveTimeout is false.
+//
+// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout
+// expired, and syserror.ErrInterrupted if t is interrupted.
+func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) {
+ if !haveTimeout {
+ return timeout, t.block(C, nil)
+ }
+
+ start := t.Kernel().MonotonicClock().Now()
+ deadline := start.Add(timeout)
+ err := t.BlockWithDeadline(C, true, deadline)
+
+ // If the deadline expired, explicitly return a remaining duration of 0.
+ if err == syserror.ETIMEDOUT {
+ return 0, err
+ }
+
+ // Compute the remaining timeout. Note that even if block() above didn't
+ // return due to a timeout, we may have used up some or all of the
+ // remaining time since then. We clamp the remaining timeout to a minimum
+ // of 0 to make the returned duration easier to use directly.
+ end := t.Kernel().MonotonicClock().Now()
+ remainingTimeout := timeout - end.Sub(start)
+ if remainingTimeout < 0 {
+ remainingTimeout = 0
+ }
+
+ return remainingTimeout, err
+}
+
+// BlockWithDeadline blocks t until an event is received from C, the
+// application monotonic clock indicates a time of deadline (only if
+// haveDeadline is true), or t is interrupted. It returns nil if an event is
+// received from C, ETIMEDOUT if the deadline expired, and
+// syserror.ErrInterrupted if t is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error {
+ if !haveDeadline {
+ return t.block(C, nil)
+ }
+
+ // Start the timeout timer.
+ t.blockingTimer.Swap(ktime.Setting{
+ Enabled: true,
+ Next: deadline,
+ })
+
+ err := t.block(C, t.blockingTimerChan)
+
+ // Stop the timeout timer and drain the channel.
+ t.blockingTimer.Swap(ktime.Setting{})
+ select {
+ case <-t.blockingTimerChan:
+ default:
+ }
+
+ return err
+}
+
+// BlockWithTimer blocks t until an event is received from C or tchan, or t is
+// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an
+// event is received from tchan, and syserror.ErrInterrupted if t is
+// interrupted.
+//
+// Most clients should use BlockWithDeadline or BlockWithTimeout instead.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error {
+ return t.block(C, tchan)
+}
+
+// Block blocks t until an event is received from C or t is interrupted. It
+// returns nil if an event is received from C and syserror.ErrInterrupted if t
+// is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Block(C <-chan struct{}) error {
+ return t.block(C, nil)
+}
+
+// block blocks a task on one of many events.
+// N.B. defer is too expensive to be used here.
+func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
+ // Fast path if the request is already done.
+ select {
+ case <-C:
+ return nil
+ default:
+ }
+
+ // Deactivate our address space; we don't need it while blocked.
+ interrupt := t.SleepStart()
+
+ select {
+ case <-C:
+ t.SleepFinish(true)
+ return nil
+
+ case <-interrupt:
+ t.SleepFinish(false)
+ // Return the indicated error on interrupt.
+ return syserror.ErrInterrupted
+
+ case <-timerChan:
+ // We've timed out.
+ t.SleepFinish(true)
+ return syserror.ETIMEDOUT
+ }
+}
+
+// SleepStart implements amutex.Sleeper.SleepStart.
+func (t *Task) SleepStart() <-chan struct{} {
+ t.Deactivate()
+ t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
+ return t.interruptChan
+}
+
+// SleepFinish implements amutex.Sleeper.SleepFinish.
+func (t *Task) SleepFinish(success bool) {
+ if !success {
+ // The interrupted notification is consumed only at the top-level
+ // (Run). Therefore we attempt to reset the pending notification.
+ // This will also elide our next entry back into the task, so we
+ // will process signals, state changes, etc.
+ t.interruptSelf()
+ }
+ t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
+ t.Activate()
+}
+
+// Interrupted implements amutex.Sleeper.Interrupted.
+func (t *Task) Interrupted() bool {
+ return len(t.interruptChan) != 0
+}
+
+// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
+func (t *Task) UninterruptibleSleepStart(deactivate bool) {
+ if deactivate {
+ t.Deactivate()
+ }
+ t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible)
+}
+
+// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish.
+func (t *Task) UninterruptibleSleepFinish(activate bool) {
+ t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible)
+ if activate {
+ t.Activate()
+ }
+}
+
+// interrupted returns true if interrupt or interruptSelf has been called at
+// least once since the last call to interrupted.
+func (t *Task) interrupted() bool {
+ select {
+ case <-t.interruptChan:
+ return true
+ default:
+ return false
+ }
+}
+
+// interrupt unblocks the task and interrupts it if it's currently running in
+// userspace.
+func (t *Task) interrupt() {
+ t.interruptSelf()
+ t.p.Interrupt()
+}
+
+// interruptSelf is like interrupt, but can only be called by the task
+// goroutine.
+func (t *Task) interruptSelf() {
+ select {
+ case t.interruptChan <- struct{}{}:
+ t.Debugf("Interrupt queued")
+ default:
+ t.Debugf("Dropping duplicate interrupt")
+ }
+ // platform.Context.Interrupt() is unnecessary since a task goroutine
+ // calling interruptSelf() cannot also be blocked in
+ // platform.Context.Switch().
+}
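The select-based blocking pattern in task_block.go (event channel, optional timer channel, interrupt channel) can be exercised outside the sentry. The standalone sketch below uses invented names (block, errTimedOut, errInterrupted) and time.Sleep in place of the kernel's timer, so it illustrates the shape of Task.block rather than the real implementation.

package main

import (
    "errors"
    "fmt"
    "time"
)

// Invented sentinels standing in for syserror.ErrInterrupted and ETIMEDOUT.
var (
    errInterrupted = errors.New("interrupted")
    errTimedOut    = errors.New("timed out")
)

// block mirrors the three-way select in Task.block: a fast path if the event
// already fired, then a wait on the event, interrupt, or timer channel.
func block(event, timer, interrupt <-chan struct{}) error {
    select {
    case <-event:
        return nil
    default:
    }
    select {
    case <-event:
        return nil
    case <-interrupt:
        return errInterrupted
    case <-timer:
        return errTimedOut
    }
}

func main() {
    event := make(chan struct{})     // never fires in this example
    interrupt := make(chan struct{}) // never fires either
    timer := make(chan struct{})
    go func() { time.Sleep(50 * time.Millisecond); close(timer) }()

    fmt.Println(block(event, timer, interrupt)) // prints "timed out"
}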
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
new file mode 100644
index 000000000..bba8ddd39
--- /dev/null
+++ b/pkg/sentry/kernel/task_clone.go
@@ -0,0 +1,516 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SharingOptions controls what resources are shared by a new task created by
+// Task.Clone, or an existing task affected by Task.Unshare.
+type SharingOptions struct {
+ // If NewAddressSpace is true, the task should have an independent virtual
+ // address space.
+ NewAddressSpace bool
+
+ // If NewSignalHandlers is true, the task should use an independent set of
+ // signal handlers.
+ NewSignalHandlers bool
+
+ // If NewThreadGroup is true, the task should be the leader of its own
+ // thread group. TerminationSignal is the signal that the thread group
+ // will send to its parent when it exits. If NewThreadGroup is false,
+ // TerminationSignal is ignored.
+ NewThreadGroup bool
+ TerminationSignal linux.Signal
+
+ // If NewPIDNamespace is true:
+ //
+ // - In the context of Task.Clone, the new task should be the init task
+ // (TID 1) in a new PID namespace.
+ //
+ // - In the context of Task.Unshare, the task should create a new PID
+ // namespace, and all subsequent clones of the task should be members of
+ // the new PID namespace.
+ NewPIDNamespace bool
+
+ // If NewUserNamespace is true, the task should have an independent user
+ // namespace.
+ NewUserNamespace bool
+
+ // If NewNetworkNamespace is true, the task should have an independent
+ // network namespace. (Note that network namespaces are not really
+ // implemented; see comment on Task.netns for details.)
+ NewNetworkNamespace bool
+
+ // If NewFiles is true, the task should use an independent file descriptor
+ // table.
+ NewFiles bool
+
+ // If NewFSContext is true, the task should have an independent FSContext.
+ NewFSContext bool
+
+ // If NewUTSNamespace is true, the task should have an independent UTS
+ // namespace.
+ NewUTSNamespace bool
+
+ // If NewIPCNamespace is true, the task should have an independent IPC
+ // namespace.
+ NewIPCNamespace bool
+}
+
+// CloneOptions controls the behavior of Task.Clone.
+type CloneOptions struct {
+ // SharingOptions defines the set of resources that the new task will share
+ // with its parent.
+ SharingOptions
+
+ // Stack is the initial stack pointer of the new task. If Stack is 0, the
+ // new task will start with the same stack pointer as its parent.
+ Stack usermem.Addr
+
+ // If SetTLS is true, set the new task's TLS (thread-local storage)
+ // descriptor to TLS. If SetTLS is false, TLS is ignored.
+ SetTLS bool
+ TLS usermem.Addr
+
+ // If ChildClearTID is true, when the child exits, 0 is written to the
+ // address ChildTID in the child's memory, and if the write is successful a
+ // futex wake on the same address is performed.
+ //
+ // If ChildSetTID is true, the child's thread ID (in the child's PID
+ // namespace) is written to address ChildTID in the child's memory. (As in
+ // Linux, failed writes are silently ignored.)
+ ChildClearTID bool
+ ChildSetTID bool
+ ChildTID usermem.Addr
+
+ // If ParentSetTID is true, the child's thread ID (in the parent's PID
+ // namespace) is written to address ParentTID in the parent's memory. (As
+ // in Linux, failed writes are silently ignored.)
+ //
+ // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID
+ // causes the child's thread ID to be written to ptid in both the parent
+ // and child's memory, but this is a documentation error fixed by
+ // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID").
+ ParentSetTID bool
+ ParentTID usermem.Addr
+
+ // If Vfork is true, place the parent in vforkStop until the cloned task
+ // releases its TaskContext.
+ Vfork bool
+
+ // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for
+ // this clone(), and do not ptrace-attach the caller's tracer to the new
+ // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate).
+ Untraced bool
+
+ // If InheritTracer is true, ptrace-attach the caller's tracer to the new
+ // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported
+ // for it. If both Untraced and InheritTracer are true, no event will be
+ // reported, but tracer inheritance will still occur.
+ InheritTracer bool
+}
+
+// Clone implements the clone(2) syscall and returns the thread ID of the new
+// task in t's PID namespace. Clone may return both a non-zero thread ID and a
+// non-nil error.
+//
+// Preconditions: The caller must be running Task.doSyscallInvoke on the task
+// goroutine.
+func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
+ // Since signal actions may refer to application signal handlers by virtual
+ // address, any set of signal handlers must refer to the same address
+ // space.
+ if !opts.NewSignalHandlers && opts.NewAddressSpace {
+ return 0, nil, syserror.EINVAL
+ }
+ // In order for the behavior of thread-group-directed signals to be sane,
+ // all tasks in a thread group must share signal handlers.
+ if !opts.NewThreadGroup && opts.NewSignalHandlers {
+ return 0, nil, syserror.EINVAL
+ }
+ // All tasks in a thread group must be in the same PID namespace.
+ if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) {
+ return 0, nil, syserror.EINVAL
+ }
+ // The two different ways of specifying a new PID namespace are
+ // incompatible.
+ if opts.NewPIDNamespace && t.childPIDNamespace != nil {
+ return 0, nil, syserror.EINVAL
+ }
+ // Thread groups and FS contexts cannot span user namespaces.
+ if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
+ // single clone(2) or unshare(2) call, the user namespace is guaranteed to
+ // be created first, giving the child (clone(2)) or caller (unshare(2))
+ // privileges over the remaining namespaces created by the call." -
+ // user_namespaces(7)
+ creds := t.Credentials()
+ userns := creds.UserNamespace
+ if opts.NewUserNamespace {
+ var err error
+ // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
+ // the caller is in a chroot environment (i.e., the caller's root
+ // directory does not match the root directory of the mount namespace
+ // in which it resides)." - clone(2). Neither chroot(2) nor
+ // user_namespaces(7) document this.
+ if t.IsChrooted() {
+ return 0, nil, syserror.EPERM
+ }
+ userns, err = creds.NewChildUserNamespace()
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+ if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
+ return 0, nil, syserror.EPERM
+ }
+
+ utsns := t.UTSNamespace()
+ if opts.NewUTSNamespace {
+ // Note that this must happen after NewUserNamespace so we get
+ // the new userns if there is one.
+ utsns = t.UTSNamespace().Clone(userns)
+ }
+
+ ipcns := t.IPCNamespace()
+ if opts.NewIPCNamespace {
+ // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
+ // namespace"
+ ipcns = NewIPCNamespace(userns)
+ }
+
+ tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
+ if err != nil {
+ return 0, nil, err
+ }
+ // clone() returns 0 in the child.
+ tc.Arch.SetReturn(0)
+ if opts.Stack != 0 {
+ tc.Arch.SetStack(uintptr(opts.Stack))
+ }
+ if opts.SetTLS {
+ if !tc.Arch.SetTLS(uintptr(opts.TLS)) {
+ return 0, nil, syserror.EPERM
+ }
+ }
+
+ var fsc *FSContext
+ if opts.NewFSContext {
+ fsc = t.fsc.Fork()
+ } else {
+ fsc = t.fsc
+ fsc.IncRef()
+ }
+
+ var fds *FDMap
+ if opts.NewFiles {
+ fds = t.fds.Fork()
+ } else {
+ fds = t.fds
+ fds.IncRef()
+ }
+
+ pidns := t.tg.pidns
+ if t.childPIDNamespace != nil {
+ pidns = t.childPIDNamespace
+ } else if opts.NewPIDNamespace {
+ pidns = pidns.NewChild(userns)
+ }
+ tg := t.tg
+ if opts.NewThreadGroup {
+ sh := t.tg.signalHandlers
+ if opts.NewSignalHandlers {
+ sh = sh.Fork()
+ }
+ tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+ }
+
+ cfg := &TaskConfig{
+ Kernel: t.k,
+ ThreadGroup: tg,
+ SignalMask: t.SignalMask(),
+ TaskContext: tc,
+ FSContext: fsc,
+ FDMap: fds,
+ Credentials: creds,
+ Niceness: t.Niceness(),
+ NetworkNamespaced: t.netns,
+ AllowedCPUMask: t.CPUMask(),
+ UTSNamespace: utsns,
+ IPCNamespace: ipcns,
+ AbstractSocketNamespace: t.abstractSockets,
+ ContainerID: t.ContainerID(),
+ }
+ if opts.NewThreadGroup {
+ cfg.Parent = t
+ } else {
+ cfg.InheritParent = t
+ }
+ if opts.NewNetworkNamespace {
+ cfg.NetworkNamespaced = true
+ }
+ nt, err := t.tg.pidns.owner.NewTask(cfg)
+ if err != nil {
+ if opts.NewThreadGroup {
+ tg.release()
+ }
+ return 0, nil, err
+ }
+
+ // "A child process created via fork(2) inherits a copy of its parent's
+ // alternate signal stack settings" - sigaltstack(2).
+ //
+ // However kernel/fork.c:copy_process() adds a limitation to this:
+ // "sigaltstack should be cleared when sharing the same VM".
+ if opts.NewAddressSpace || opts.Vfork {
+ nt.SetSignalStack(t.SignalStack())
+ }
+
+ if userns != creds.UserNamespace {
+ if err := nt.SetUserNamespace(userns); err != nil {
+ // This shouldn't be possible: userns was created from nt.creds, so
+ // nt should have CAP_SYS_ADMIN in userns.
+ panic("Task.Clone: SetUserNamespace failed: " + err.Error())
+ }
+ }
+
+ // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
+ // nt that it must receive before its task goroutine starts running.
+ tid := nt.k.tasks.Root.IDOfTask(nt)
+ defer nt.Start(tid)
+
+ // "If fork/clone and execve are allowed by @prog, any child processes will
+ // be constrained to the same filters and system call ABI as the parent." -
+ // Documentation/prctl/seccomp_filter.txt
+ if f := t.syscallFilters.Load(); f != nil {
+ copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
+ nt.syscallFilters.Store(copiedFilters)
+ }
+ if opts.Vfork {
+ nt.vforkParent = t
+ }
+
+ if opts.ChildClearTID {
+ nt.SetClearTID(opts.ChildTID)
+ }
+ if opts.ChildSetTID {
+ // Can't use Task.CopyOut, which assumes AddressSpaceActive.
+ usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+ }
+ ntid := t.tg.pidns.IDOfTask(nt)
+ if opts.ParentSetTID {
+ t.CopyOut(opts.ParentTID, ntid)
+ }
+
+ kind := ptraceCloneKindClone
+ if opts.Vfork {
+ kind = ptraceCloneKindVfork
+ } else if opts.TerminationSignal == linux.SIGCHLD {
+ kind = ptraceCloneKindFork
+ }
+ if t.ptraceClone(kind, nt, opts) {
+ if opts.Vfork {
+ return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
+ }
+ return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
+ }
+ if opts.Vfork {
+ t.maybeBeginVforkStop(nt)
+ return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
+ }
+ return ntid, nil, nil
+}
+
+// maybeBeginVforkStop checks if a previously-started vfork child is still
+// running and has not yet released its MM, such that its parent t should enter
+// a vforkStop.
+//
+// Preconditions: The caller must be running on t's task goroutine.
+func (t *Task) maybeBeginVforkStop(child *Task) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.killedLocked() {
+ child.vforkParent = nil
+ return
+ }
+ if child.vforkParent == t {
+ t.beginInternalStopLocked((*vforkStop)(nil))
+ }
+}
+
+func (t *Task) unstopVforkParent() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if p := t.vforkParent; p != nil {
+ p.tg.signalHandlers.mu.Lock()
+ defer p.tg.signalHandlers.mu.Unlock()
+ if _, ok := p.stop.(*vforkStop); ok {
+ p.endInternalStopLocked()
+ }
+ // Parent no longer needs to be unstopped.
+ t.vforkParent = nil
+ }
+}
+
+// +stateify savable
+type runSyscallAfterPtraceEventClone struct {
+ vforkChild *Task
+
+ // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
+ // PID namespace. vforkChildTID must be stored since the child may exit and
+ // release its TID before the PTRACE_EVENT stop ends.
+ vforkChildTID ThreadID
+}
+
+func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
+ if r.vforkChild != nil {
+ t.maybeBeginVforkStop(r.vforkChild)
+ return &runSyscallAfterVforkStop{r.vforkChildTID}
+ }
+ return (*runSyscallExit)(nil)
+}
+
+// +stateify savable
+type runSyscallAfterVforkStop struct {
+ // childTID has the same meaning as
+ // runSyscallAfterPtraceEventClone.vforkChildTID.
+ childTID ThreadID
+}
+
+func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
+ t.ptraceVforkDone(r.childTID)
+ return (*runSyscallExit)(nil)
+}
+
+// Unshare changes the set of resources t shares with other tasks, as specified
+// by opts.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Unshare(opts *SharingOptions) error {
+ // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and
+ // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if
+ // t is the only task using its MM, which due to clone(2)'s rules implies
+ // that it is also the only task using its signal handlers and the only
+ // task in its thread group; otherwise they cause EINVAL to be returned.
+ //
+ // Since we don't count the number of tasks using each address space or set
+ // of signal handlers, we reject NewSignalHandlers and NewAddressSpace
+ // altogether, and interpret NewThreadGroup as requiring that t be the only
+ // member of its thread group. This seems to be logically coherent, in the
+ // sense that clone(2) allows a task to share signal handlers and address
+ // spaces with tasks in other thread groups.
+ if opts.NewAddressSpace || opts.NewSignalHandlers {
+ return syserror.EINVAL
+ }
+ if opts.NewThreadGroup {
+ t.tg.signalHandlers.mu.Lock()
+ if t.tg.tasksCount != 1 {
+ t.tg.signalHandlers.mu.Unlock()
+ return syserror.EINVAL
+ }
+ t.tg.signalHandlers.mu.Unlock()
+ // This isn't racy because we're the only living task, and therefore
+ // the only task capable of creating new ones, in our thread group.
+ }
+ if opts.NewUserNamespace {
+ if t.IsChrooted() {
+ return syserror.EPERM
+ }
+ // This temporary is needed because Go.
+ creds := t.Credentials()
+ newUserNS, err := creds.NewChildUserNamespace()
+ if err != nil {
+ return err
+ }
+ err = t.SetUserNamespace(newUserNS)
+ if err != nil {
+ return err
+ }
+ }
+ haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
+ if opts.NewPIDNamespace {
+ if !haveCapSysAdmin {
+ return syserror.EPERM
+ }
+ t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
+ }
+ t.mu.Lock()
+ // Can't defer unlock: DecRefs must occur without holding t.mu.
+ if opts.NewNetworkNamespace {
+ if !haveCapSysAdmin {
+ t.mu.Unlock()
+ return syserror.EPERM
+ }
+ t.netns = true
+ }
+ if opts.NewUTSNamespace {
+ if !haveCapSysAdmin {
+ t.mu.Unlock()
+ return syserror.EPERM
+ }
+ // Note that this must happen after NewUserNamespace, so the
+ // new user namespace is used if there is one.
+ t.utsns = t.utsns.Clone(t.creds.UserNamespace)
+ }
+ if opts.NewIPCNamespace {
+ if !haveCapSysAdmin {
+ t.mu.Unlock()
+ return syserror.EPERM
+ }
+ // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
+ // namespace"
+ t.ipcns = NewIPCNamespace(t.creds.UserNamespace)
+ }
+ var oldfds *FDMap
+ if opts.NewFiles {
+ oldfds = t.fds
+ t.fds = oldfds.Fork()
+ }
+ var oldfsc *FSContext
+ if opts.NewFSContext {
+ oldfsc = t.fsc
+ t.fsc = oldfsc.Fork()
+ }
+ t.mu.Unlock()
+ if oldfds != nil {
+ oldfds.DecRef()
+ }
+ if oldfsc != nil {
+ oldfsc.DecRef()
+ }
+ return nil
+}
+
+// vforkStop is a TaskStop imposed on a task that creates a child with
+// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
+// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
+// that the child and parent share mappings until the child execve()s into a
+// new process image or exits.)
+//
+// +stateify savable
+type vforkStop struct{}
+
+// StopIgnoresKill implements TaskStop.Killable.
+func (*vforkStop) Killable() bool { return true }
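The EINVAL checks at the top of Task.Clone encode a small set of compatibility rules between the sharing flags. The sketch below restates just those rules over a simplified options struct; the struct, validate function, and errInvalid value are invented for this sketch, and task-state-dependent checks (such as t.childPIDNamespace) are omitted.

package main

import (
    "errors"
    "fmt"
)

var errInvalid = errors.New("EINVAL")

// opts is a cut-down stand-in for SharingOptions.
type opts struct {
    NewAddressSpace   bool
    NewSignalHandlers bool
    NewThreadGroup    bool
    NewPIDNamespace   bool
    NewUserNamespace  bool
    NewFSContext      bool
}

// validate restates the flag rules: shared signal handlers require a shared
// address space, a shared thread group requires shared signal handlers and a
// shared PID namespace, and a new user namespace requires a new thread group
// and a new FS context.
func validate(o opts) error {
    if !o.NewSignalHandlers && o.NewAddressSpace {
        return errInvalid
    }
    if !o.NewThreadGroup && o.NewSignalHandlers {
        return errInvalid
    }
    if !o.NewThreadGroup && o.NewPIDNamespace {
        return errInvalid
    }
    if o.NewUserNamespace && (!o.NewThreadGroup || !o.NewFSContext) {
        return errInvalid
    }
    return nil
}

func main() {
    // An independent address space with shared signal handlers is rejected.
    fmt.Println(validate(opts{NewAddressSpace: true})) // EINVAL
    // A fork-like clone that makes everything independent is accepted.
    fmt.Println(validate(opts{
        NewAddressSpace:   true,
        NewSignalHandlers: true,
        NewThreadGroup:    true,
        NewFSContext:      true,
    })) // <nil>
}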
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
new file mode 100644
index 000000000..bbd294141
--- /dev/null
+++ b/pkg/sentry/kernel/task_context.go
@@ -0,0 +1,174 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserr"
+)
+
+var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
+
+// Auxmap contains miscellaneous data for the task.
+type Auxmap map[string]interface{}
+
+// TaskContext is the subset of a task's data that is provided by the loader.
+//
+// +stateify savable
+type TaskContext struct {
+ // Name is the thread name set by the prctl(PR_SET_NAME) system call.
+ Name string
+
+ // Arch is the architecture-specific context (registers, etc.)
+ Arch arch.Context
+
+ // MemoryManager is the task's address space.
+ MemoryManager *mm.MemoryManager
+
+ // fu implements futexes in the address space.
+ fu *futex.Manager
+
+ // st is the task's syscall table.
+ st *SyscallTable
+}
+
+// release releases all resources held by the TaskContext. release is called by
+// the task when it execs into a new TaskContext or exits.
+func (tc *TaskContext) release() {
+ // Nil out pointers so that if the task is saved after release, it doesn't
+ // follow the pointers to possibly now-invalid objects.
+ if tc.MemoryManager != nil {
+ // TODO(b/38173783)
+ tc.MemoryManager.DecUsers(context.Background())
+ tc.MemoryManager = nil
+ }
+ tc.fu = nil
+}
+
+// Fork returns a duplicate of tc. The copied TaskContext always has an
+// independent arch.Context. If shareAddressSpace is true, the copied
+// TaskContext shares an address space with the original; otherwise, the copied
+// TaskContext has an independent address space that is initially a duplicate
+// of the original's.
+func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) {
+ newTC := &TaskContext{
+ Name: tc.Name,
+ Arch: tc.Arch.Fork(),
+ st: tc.st,
+ }
+ if shareAddressSpace {
+ newTC.MemoryManager = tc.MemoryManager
+ if newTC.MemoryManager != nil {
+ if !newTC.MemoryManager.IncUsers() {
+ // Shouldn't be possible since tc.MemoryManager should be a
+ // counted user.
+ panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager"))
+ }
+ }
+ newTC.fu = tc.fu
+ } else {
+ newMM, err := tc.MemoryManager.Fork(ctx)
+ if err != nil {
+ return nil, err
+ }
+ newTC.MemoryManager = newMM
+ newTC.fu = k.futexes.Fork()
+ }
+ return newTC, nil
+}
+
+// Arch returns t's arch.Context.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Arch() arch.Context {
+ return t.tc.Arch
+}
+
+// MemoryManager returns t's MemoryManager. MemoryManager does not take an
+// additional reference on the returned MM.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) MemoryManager() *mm.MemoryManager {
+ return t.tc.MemoryManager
+}
+
+// SyscallTable returns t's syscall table.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) SyscallTable() *SyscallTable {
+ return t.tc.st
+}
+
+// Stack returns the userspace stack.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Stack() *arch.Stack {
+ return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
+}
+
+// LoadTaskImage loads filename into a new TaskContext.
+//
+// It takes several arguments:
+// * mounts: MountNamespace to lookup filename in
+// * root: Root to lookup filename under
+// * wd: Working directory to lookup filename under
+// * maxTraversals: maximum number of symlinks to follow
+// * filename: path to binary to load
+// * argv: Binary argv
+// * envv: Binary envv
+// * fs: Binary FeatureSet
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+ // Prepare a new user address space to load into.
+ m := mm.NewMemoryManager(k, k)
+ defer m.DecUsers(ctx)
+
+ os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso)
+ if err != nil {
+ return nil, err
+ }
+
+ // Lookup our new syscall table.
+ st, ok := LookupSyscallTable(os, ac.Arch())
+ if !ok {
+ // No syscall table found. This means that the ELF binary does not match
+ // the architecture.
+ return nil, errNoSyscalls
+ }
+
+ if !m.IncUsers() {
+ panic("Failed to increment users count on new MM")
+ }
+ return &TaskContext{
+ Name: name,
+ Arch: ac,
+ MemoryManager: m,
+ fu: k.futexes.Fork(),
+ st: st,
+ }, nil
+}
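TaskContext.Fork either takes an additional reference on the shared MemoryManager (IncUsers) or builds an independent copy. The toy below shows that share-versus-copy decision with an invented refCounted type; it is a sketch of the pattern, not of mm.MemoryManager's actual reference counting.

package main

import (
    "fmt"
    "sync/atomic"
)

type refCounted struct {
    users int64
    data  []byte
}

// incUsers mirrors the shape of mm.IncUsers: it fails once the count has
// already dropped to zero.
func (r *refCounted) incUsers() bool {
    for {
        n := atomic.LoadInt64(&r.users)
        if n == 0 {
            return false
        }
        if atomic.CompareAndSwapInt64(&r.users, n, n+1) {
            return true
        }
    }
}

// fork either shares r (taking a reference) or returns an independent copy.
func fork(r *refCounted, share bool) *refCounted {
    if share {
        if !r.incUsers() {
            panic("fork called with userless resource")
        }
        return r
    }
    dup := make([]byte, len(r.data))
    copy(dup, r.data)
    return &refCounted{users: 1, data: dup}
}

func main() {
    r := &refCounted{users: 1, data: []byte("state")}
    shared := fork(r, true)
    copied := fork(r, false)
    fmt.Println(shared == r, copied == r, atomic.LoadInt64(&r.users)) // true false 2
}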
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
new file mode 100644
index 000000000..5d1425d5c
--- /dev/null
+++ b/pkg/sentry/kernel/task_exec.go
@@ -0,0 +1,262 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements the machinery behind the execve() syscall. In brief, a
+// thread executes an execve() by killing all other threads in its thread
+// group, assuming the leader's identity, and then switching process images.
+//
+// This design is effectively mandated by Linux. From ptrace(2):
+//
+// """
+// execve(2) under ptrace
+// When one thread in a multithreaded process calls execve(2), the
+// kernel destroys all other threads in the process, and resets the
+// thread ID of the execing thread to the thread group ID (process ID).
+// (Or, to put things another way, when a multithreaded process does an
+// execve(2), at completion of the call, it appears as though the
+// execve(2) occurred in the thread group leader, regardless of which
+// thread did the execve(2).) This resetting of the thread ID looks
+// very confusing to tracers:
+//
+// * All other threads stop in PTRACE_EVENT_EXIT stop, if the
+// PTRACE_O_TRACEEXIT option was turned on. Then all other threads
+// except the thread group leader report death as if they exited via
+// _exit(2) with exit code 0.
+//
+// * The execing tracee changes its thread ID while it is in the
+// execve(2). (Remember, under ptrace, the "pid" returned from
+// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
+// That is, the tracee's thread ID is reset to be the same as its
+// process ID, which is the same as the thread group leader's thread
+// ID.
+//
+// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
+// option was turned on.
+//
+// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop
+// by this time, it appears to the tracer that the dead thread leader
+// "reappears from nowhere". (Note: the thread group leader does not
+// report death via WIFEXITED(status) until there is at least one
+// other live thread. This eliminates the possibility that the
+// tracer will see it dying and then reappearing.) If the thread
+// group leader was still alive, for the tracer this may look as if
+// thread group leader returns from a different system call than it
+// entered, or even "returned from a system call even though it was
+// not in any system call". If the thread group leader was not
+// traced (or was traced by a different tracer), then during
+// execve(2) it will appear as if it has become a tracee of the
+// tracer of the execing tracee.
+//
+// All of the above effects are the artifacts of the thread ID change in
+// the tracee.
+// """
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// execStop is a TaskStop that a task sets on itself when it wants to execve
+// and is waiting for the other tasks in its thread group to exit first.
+//
+// +stateify savable
+type execStop struct{}
+
+// Killable implements TaskStop.Killable.
+func (*execStop) Killable() bool { return true }
+
+// Execve implements the execve(2) syscall by killing all other tasks in its
+// thread group and switching to newTC. Execve always takes ownership of newTC.
+//
+// Preconditions: The caller must be running Task.doSyscallInvoke on the task
+// goroutine.
+func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+
+ if t.tg.exiting || t.tg.execing != nil {
+ // We lost to a racing group-exit, kill, or exec from another thread
+ // and should just exit.
+ newTC.release()
+ return nil, syserror.EINTR
+ }
+
+ // Cancel any racing group stops.
+ t.tg.endGroupStopLocked(false)
+
+ // If the task has any siblings, they have to exit before the exec can
+ // continue.
+ t.tg.execing = t
+ if t.tg.tasks.Front() != t.tg.tasks.Back() {
+ // "[All] other threads except the thread group leader report death as
+ // if they exited via _exit(2) with exit code 0." - ptrace(2)
+ for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
+ if t != sibling {
+ sibling.killLocked()
+ }
+ }
+ // The last sibling to exit will wake t.
+ t.beginInternalStopLocked((*execStop)(nil))
+ }
+
+ return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil
+}
+
+// The runSyscallAfterExecStop state continues execve(2) after all siblings of
+// a thread in the execve syscall have exited.
+//
+// +stateify savable
+type runSyscallAfterExecStop struct {
+ tc *TaskContext
+}
+
+func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
+ t.tg.pidns.owner.mu.Lock()
+ t.tg.execing = nil
+ if t.killed() {
+ t.tg.pidns.owner.mu.Unlock()
+ r.tc.release()
+ return (*runInterrupt)(nil)
+ }
+ // We are the thread group leader now. Save our old thread ID for
+ // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
+ // point it will get a PID of 0, but this is consistent with Linux.
+ oldTID := ThreadID(0)
+ if tracer := t.Tracer(); tracer != nil {
+ oldTID = tracer.tg.pidns.tids[t]
+ }
+ t.promoteLocked()
+ // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle
+ // this first since POSIX timers are protected by the signal mutex, which
+ // we're about to change. Note that we have to stop and destroy timers
+ // without holding any mutexes to avoid circular lock ordering.
+ var its []*IntervalTimer
+ t.tg.signalHandlers.mu.Lock()
+ for _, it := range t.tg.timers {
+ its = append(its, it)
+ }
+ t.tg.timers = make(map[linux.TimerID]*IntervalTimer)
+ t.tg.signalHandlers.mu.Unlock()
+ t.tg.pidns.owner.mu.Unlock()
+ for _, it := range its {
+ it.DestroyTimer()
+ }
+ t.tg.pidns.owner.mu.Lock()
+ // "During an execve(2), the dispositions of handled signals are reset to
+ // the default; the dispositions of ignored signals are left unchanged. ...
+ // [The] signal mask is preserved across execve(2). ... [The] pending
+ // signal set is preserved across an execve(2)." - signal(7)
+ //
+ // Details:
+ //
+ // - If the thread group is sharing its signal handlers with another thread
+ // group via CLONE_SIGHAND, execve forces the signal handlers to be copied
+ // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
+ // handlers, so we always make a copy.
+ //
+ // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
+ // restorer (if present), and mask are always reset. (See Linux's
+ // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
+ t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
+ t.endStopCond.L = &t.tg.signalHandlers.mu
+ // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
+ t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable}
+ // "The termination signal is reset to SIGCHLD (see clone(2))."
+ t.tg.terminationSignal = linux.SIGCHLD
+ // execed indicates that the process can no longer join a process group
+ // in some scenarios (namely, when the parent calls setpgid(2) on the child).
+ // See the JoinProcessGroup function in sessions.go for more context.
+ t.tg.execed = true
+ // Maximum RSS is preserved across execve(2).
+ t.updateRSSLocked()
+ // Restartable sequence state is discarded.
+ t.rseqPreempted = false
+ t.rseqCPUAddr = 0
+ t.rseqCPU = -1
+ t.tg.rscr.Store(&RSEQCriticalRegion{})
+ t.tg.pidns.owner.mu.Unlock()
+
+ // Remove FDs with the CloseOnExec flag set.
+ t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+ return flags.CloseOnExec
+ })
+
+ // Switch to the new process.
+ t.MemoryManager().Deactivate()
+ t.mu.Lock()
+ // Update credentials to reflect the execve. This should precede switching
+ // MMs to ensure that dumpability has been reset first, if needed.
+ t.updateCredsForExecLocked()
+ t.tc.release()
+ t.tc = *r.tc
+ t.mu.Unlock()
+ t.unstopVforkParent()
+ // NOTE(b/30316266): All locks must be dropped prior to calling Activate.
+ t.MemoryManager().Activate()
+
+ t.ptraceExec(oldTID)
+ return (*runSyscallExit)(nil)
+}
+
+// promoteLocked makes t the leader of its thread group. If t is already the
+// thread group leader, promoteLocked is a no-op.
+//
+// Preconditions: All other tasks in t's thread group, including the existing
+// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
+// be locked for writing.
+func (t *Task) promoteLocked() {
+ oldLeader := t.tg.leader
+ if t == oldLeader {
+ return
+ }
+ // Swap the leader's TIDs with the execing task's. The latter will be
+ // released when the old leader is reaped below.
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent {
+ oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
+ ns.tids[oldLeader] = oldTID
+ ns.tids[t] = leaderTID
+ ns.tasks[oldTID] = oldLeader
+ ns.tasks[leaderTID] = t
+ // Neither the ThreadGroup nor TGID change, so no need to
+ // update ns.tgids.
+ }
+
+ // Inherit the old leader's start time.
+ oldStartTime := oldLeader.StartTime()
+ t.mu.Lock()
+ t.startTime = oldStartTime
+ t.mu.Unlock()
+
+ t.tg.leader = t
+ t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
+ t.updateLogPrefixLocked()
+ // Reap the original leader. If it has a tracer, detach it instead of
+ // waiting for it to acknowledge the original leader's death.
+ oldLeader.exitParentNotified = true
+ oldLeader.exitParentAcked = true
+ if tracer := oldLeader.Tracer(); tracer != nil {
+ delete(tracer.ptraceTracees, oldLeader)
+ oldLeader.forgetTracerLocked()
+ // Notify the tracer that it will no longer be receiving these events
+ // from the tracee.
+ tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
+ }
+ oldLeader.exitNotifyLocked(false)
+}
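promoteLocked's core step is swapping two tasks' thread IDs in every PID namespace while keeping the tids and tasks maps consistent with each other. The plain-map sketch below shows that swap in isolation; the task, tid, and pidNS types are invented for the example.

package main

import "fmt"

type tid int

type task struct{ name string }

type pidNS struct {
    tids  map[*task]tid
    tasks map[tid]*task
}

// swapTIDs exchanges the thread IDs of t and leader within ns, updating both
// the task-to-TID and TID-to-task maps, as promoteLocked does per namespace.
func (ns *pidNS) swapTIDs(t, leader *task) {
    oldTID, leaderTID := ns.tids[t], ns.tids[leader]
    ns.tids[leader] = oldTID
    ns.tids[t] = leaderTID
    ns.tasks[oldTID] = leader
    ns.tasks[leaderTID] = t
}

func main() {
    leader := &task{name: "leader"}
    execer := &task{name: "execer"}
    ns := &pidNS{
        tids:  map[*task]tid{leader: 100, execer: 101},
        tasks: map[tid]*task{100: leader, 101: execer},
    }
    ns.swapTIDs(execer, leader)
    fmt.Println(ns.tids[execer], ns.tids[leader]) // 100 101
    fmt.Println(ns.tasks[100].name)               // execer
}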
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
new file mode 100644
index 000000000..158e665d3
--- /dev/null
+++ b/pkg/sentry/kernel/task_exit.go
@@ -0,0 +1,1159 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements the task exit cycle:
+//
+// - Tasks are asynchronously requested to exit with Task.Kill.
+//
+// - When able, the task goroutine enters the exit path starting from state
+// runExit.
+//
+// - Other tasks observe completed exits with Task.Wait (which implements the
+// wait*() family of syscalls).
+
+import (
+ "errors"
+ "fmt"
+ "strconv"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// An ExitStatus is a value communicated from an exiting task or thread group
+// to the party that reaps it.
+//
+// +stateify savable
+type ExitStatus struct {
+ // Code is the numeric value passed to the call to exit or exit_group that
+ // caused the exit. If the exit was not caused by such a call, Code is 0.
+ Code int
+
+ // Signo is the signal that caused the exit. If the exit was not caused by
+ // a signal, Signo is 0.
+ Signo int
+}
+
+// Signaled returns true if the ExitStatus indicates that the exiting task or
+// thread group was killed by a signal.
+func (es ExitStatus) Signaled() bool {
+ return es.Signo != 0
+}
+
+// Status returns the numeric representation of the ExitStatus returned by e.g.
+// the wait4() system call.
+func (es ExitStatus) Status() uint32 {
+ return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff)
+}
+
+// ShellExitCode returns the numeric exit code that Bash would return for an
+// exit status of es.
+func (es ExitStatus) ShellExitCode() int {
+ if es.Signaled() {
+ return 128 + es.Signo
+ }
+ return es.Code
+}
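ExitStatus.Status() packs the exit code into bits 8-15 and the signal number into bits 0-7, matching the classic wait status layout, and ShellExitCode follows the shell convention of 128 plus the signal number for signal deaths. The helpers below (encode, wexitstatus, wtermsig) are written only to work through that layout and are not part of the package.

package main

import "fmt"

// encode reproduces the bit layout used by ExitStatus.Status().
func encode(code, signo int) uint32 {
    return ((uint32(code) & 0xff) << 8) | (uint32(signo) & 0xff)
}

// wexitstatus and wtermsig mirror the traditional decode macros.
func wexitstatus(status uint32) int { return int((status >> 8) & 0xff) }
func wtermsig(status uint32) int    { return int(status & 0xff) }

func main() {
    // exit_group(2) with code 3: status 0x300; a shell would report 3.
    fmt.Printf("%#x %d\n", encode(3, 0), wexitstatus(encode(3, 0)))
    // Killed by SIGKILL (9): status 0x9; a shell would report 128+9 = 137.
    fmt.Printf("%#x %d\n", encode(0, 9), wtermsig(encode(0, 9)))
}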
+
+// TaskExitState represents a step in the task exit path.
+//
+// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
+type TaskExitState int
+
+const (
+ // TaskExitNone indicates that the task has not begun exiting.
+ TaskExitNone TaskExitState = iota
+
+ // TaskExitInitiated indicates that the task goroutine has entered the exit
+ // path, and the task is no longer eligible to participate in group stops
+ // or group signal handling. TaskExitInitiated is analogous to Linux's
+ // PF_EXITING.
+ TaskExitInitiated
+
+ // TaskExitZombie indicates that the task has released its resources, and
+ // the task no longer prevents a sibling thread from completing execve.
+ TaskExitZombie
+
+ // TaskExitDead indicates that the task's thread IDs have been released,
+ // and the task no longer prevents its thread group leader from being
+ // reaped. ("Reaping" refers to the transitioning of a task from
+ // TaskExitZombie to TaskExitDead.)
+ TaskExitDead
+)
+
+// String implements fmt.Stringer.
+func (t TaskExitState) String() string {
+ switch t {
+ case TaskExitNone:
+ return "TaskExitNone"
+ case TaskExitInitiated:
+ return "TaskExitInitiated"
+ case TaskExitZombie:
+ return "TaskExitZombie"
+ case TaskExitDead:
+ return "TaskExitDead"
+ default:
+ return strconv.Itoa(int(t))
+ }
+}
+
+// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
+// thread-group-affecting side effects SIGKILL usually has.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) killLocked() {
+ // Clear killable stops.
+ if t.stop != nil && t.stop.Killable() {
+ t.endInternalStopLocked()
+ }
+ t.pendingSignals.enqueue(&arch.SignalInfo{
+ Signo: int32(linux.SIGKILL),
+ // Linux just sets SIGKILL in the pending signal bitmask without
+ // enqueueing an actual siginfo, such that
+ // kernel/signal.c:collect_signal() initializes si_code to SI_USER.
+ Code: arch.SignalInfoUser,
+ }, nil)
+ t.interrupt()
+}
+
+// killed returns true if t has a SIGKILL pending. killed is analogous to
+// Linux's fatal_signal_pending().
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) killed() bool {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.killedLocked()
+}
+
+func (t *Task) killedLocked() bool {
+ return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
+}
+
+// PrepareExit indicates an exit with status es.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareExit(es ExitStatus) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.exitStatus = es
+}
+
+// PrepareGroupExit indicates a group exit with status es to t's thread group.
+//
+// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
+// does not tail-call do_exit() and that it *does* set Task.exitStatus.
+// (Linux does not do so until within do_exit(), since it reuses exit_code
+// for ptrace.)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareGroupExit(es ExitStatus) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.tg.exiting || t.tg.execing != nil {
+ // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
+ // this "group exit" is being executed by the killed sibling of an
+ // execing task, then Task.Execve never set t.tg.exitStatus, so it's
+ // still the zero value. This is consistent with Linux, both in intent
+ // ("all other threads ... report death as if they exited via _exit(2)
+ // with exit code 0" - ptrace(2), "execve under ptrace") and in
+ // implementation (compare fs/exec.c:de_thread() =>
+ // kernel/signal.c:zap_other_threads() and
+ // kernel/exit.c:do_group_exit() =>
+ // include/linux/sched.h:signal_group_exit()).
+ t.exitStatus = t.tg.exitStatus
+ return
+ }
+ t.tg.exiting = true
+ t.tg.exitStatus = es
+ t.exitStatus = es
+ for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
+ if sibling != t {
+ sibling.killLocked()
+ }
+ }
+}
+
+// Kill requests that all tasks in ts exit as if group exiting with status es.
+// Kill does not wait for tasks to exit.
+//
+// Kill has no analogue in Linux; it's provided for save/restore only.
+func (ts *TaskSet) Kill(es ExitStatus) {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ ts.Root.exiting = true
+ for t := range ts.Root.tids {
+ t.tg.signalHandlers.mu.Lock()
+ if !t.tg.exiting {
+ t.tg.exiting = true
+ t.tg.exitStatus = es
+ }
+ t.killLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ }
+}
+
+// advanceExitStateLocked checks that t's current exit state is oldExit, then
+// sets it to newExit. If t's current exit state is not oldExit,
+// advanceExitStateLocked panics.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
+ if t.exitState != oldExit {
+ panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
+ }
+ t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
+ t.exitState = newExit
+}
+
+// runExit is the entry point into the task exit path.
+//
+// +stateify savable
+type runExit struct{}
+
+func (*runExit) execute(t *Task) taskRunState {
+ t.ptraceExit()
+ return (*runExitMain)(nil)
+}
+
+// +stateify savable
+type runExitMain struct{}
+
+func (*runExitMain) execute(t *Task) taskRunState {
+ lastExiter := t.exitThreadGroup()
+
+ // If the task has a cleartid, and the thread group wasn't killed by a
+ // signal, handle that before releasing the MM.
+ if t.cleartid != 0 {
+ t.tg.signalHandlers.mu.Lock()
+ signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
+ t.tg.signalHandlers.mu.Unlock()
+ if !signaled {
+ if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
+ t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
+ }
+ // If the CopyOut fails, there's nothing we can do.
+ }
+ }
+
+ // Deactivate the address space and update max RSS before releasing the
+ // task's MM.
+ t.Deactivate()
+ t.tg.pidns.owner.mu.Lock()
+ t.updateRSSLocked()
+ t.tg.pidns.owner.mu.Unlock()
+ t.mu.Lock()
+ t.tc.release()
+ t.mu.Unlock()
+
+ // Releasing the MM unblocks a blocked CLONE_VFORK parent.
+ t.unstopVforkParent()
+
+ t.fsc.DecRef()
+ t.fds.DecRef()
+
+ // If this is the last task to exit from the thread group, release the
+ // thread group's resources.
+ if lastExiter {
+ t.tg.release()
+ }
+
+ // Detach tracees.
+ t.exitPtrace()
+
+ // Reparent the task's children.
+ t.exitChildren()
+
+ // Don't tail-call runExitNotify, as exitChildren may have initiated a stop
+ // to wait for a PID namespace to die.
+ return (*runExitNotify)(nil)
+}
+
+// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
+// group that it is no longer eligible to participate in group activities. It
+// returns true if t is the last task in its thread group to call
+// exitThreadGroup.
+func (t *Task) exitThreadGroup() bool {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.tg.signalHandlers.mu.Lock()
+ // Can't defer unlock: see below.
+
+ t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
+ t.tg.activeTasks--
+ last := t.tg.activeTasks == 0
+
+ // Ensure that someone will handle the signals we can't.
+ t.setSignalMaskLocked(^linux.SignalSet(0))
+
+ // Check if this task's exit interacts with an initiated group stop.
+ if !t.groupStopPending {
+ t.tg.signalHandlers.mu.Unlock()
+ return last
+ }
+ t.groupStopPending = false
+ sig := t.tg.groupStopSignal
+ notifyParent := t.participateGroupStopLocked()
+ // signalStop must be called with t's signal mutex unlocked.
+ t.tg.signalHandlers.mu.Unlock()
+ if notifyParent && t.tg.leader.parent != nil {
+ t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
+ }
+ return last
+}
+
+func (t *Task) exitChildren() {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ newParent := t.findReparentTargetLocked()
+ if newParent == nil {
+ // "If the init process of a PID namespace terminates, the kernel
+ // terminates all of the processes in the namespace via a SIGKILL
+ // signal." - pid_namespaces(7)
+ t.Debugf("Init process terminating, killing namespace")
+ t.tg.pidns.exiting = true
+ for other := range t.tg.pidns.tgids {
+ if other == t.tg {
+ continue
+ }
+ other.signalHandlers.mu.Lock()
+ other.leader.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGKILL),
+ }, true /* group */)
+ other.signalHandlers.mu.Unlock()
+ }
+ // TODO(b/37722272): The init process waits for all processes in the
+ // namespace to exit before completing its own exit
+ // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
+ // other tasks in the namespace are dead, except possibly for this
+ // thread group's leader (which can't be reaped until this task exits).
+ }
+ // This is correct even if newParent is nil (it ensures that children don't
+ // wait for a parent to reap them).
+ for c := range t.children {
+ if sig := c.ParentDeathSignal(); sig != 0 {
+ siginfo := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoUser,
+ }
+ siginfo.SetPid(int32(c.tg.pidns.tids[t]))
+ siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
+ c.tg.signalHandlers.mu.Lock()
+ c.sendSignalLocked(siginfo, true /* group */)
+ c.tg.signalHandlers.mu.Unlock()
+ }
+ c.reparentLocked(newParent)
+ if newParent != nil {
+ newParent.children[c] = struct{}{}
+ }
+ }
+}
+
+// findReparentTargetLocked returns the task to which t's children should be
+// reparented. If no such task exists, findReparentTargetLocked returns nil.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) findReparentTargetLocked() *Task {
+ // Reparent to any sibling in the same thread group that hasn't begun
+ // exiting.
+ if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
+ return t2
+ }
+ // "A child process that is orphaned within the namespace will be
+ // reparented to [the init process for the namespace] ..." -
+ // pid_namespaces(7)
+ if init := t.tg.pidns.tasks[InitTID]; init != nil {
+ return init.tg.anyNonExitingTaskLocked()
+ }
+ return nil
+}
+
+func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if t.exitState == TaskExitNone {
+ return t
+ }
+ }
+ return nil
+}
+
+// reparentLocked changes t's parent. The new parent may be nil.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) reparentLocked(parent *Task) {
+ oldParent := t.parent
+ t.parent = parent
+ // If a thread group leader's parent changes, reset the thread group's
+ // termination signal to SIGCHLD and re-check exit notification. (Compare
+ // kernel/exit.c:reparent_leader().)
+ if t != t.tg.leader {
+ return
+ }
+ if oldParent == nil && parent == nil {
+ return
+ }
+ if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
+ return
+ }
+ t.tg.terminationSignal = linux.SIGCHLD
+ if t.exitParentNotified && !t.exitParentAcked {
+ t.exitParentNotified = false
+ t.exitNotifyLocked(false)
+ }
+}
+
+// When a task exits, other tasks in the system, notably the task's parent and
+// ptracer, may want to be notified. The exit notification system ensures that
+// interested tasks receive signals and/or are woken from blocking calls to
+// wait*() syscalls; these notifications must be resolved before exiting tasks
+// can be reaped and disappear from the system.
+//
+// Each task may have a parent task and/or a tracer task. If both a parent and
+// a tracer exist, they may be the same task, different tasks in the same
+// thread group, or tasks in different thread groups. (In the last case, Linux
+// refers to the task as being ptrace-reparented due to an implementation
+// detail; we avoid this terminology to avoid confusion.)
+//
+// A thread group is *empty* if all non-leader tasks in the thread group are
+// dead, and the leader is either a zombie or dead. The exit of a thread group
+// leader is never waitable - by either the parent or tracer - until the thread
+// group is empty.
+//
+// There are a few ways for an exit notification to be resolved:
+//
+// - The exit notification may be acknowledged by a call to Task.Wait with
+// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
+//
+// - If the notified party is the parent, and the parent thread group is not
+// also the tracer thread group, and the notification signal is SIGCHLD, the
+// parent may explicitly ignore the notification (see quote in exitNotify).
+// Note that it's possible for the notified party to ignore the signal in other
+// cases, but the notification is only resolved under the above conditions.
+// (Actually, there is one exception; see the last paragraph of the "leader,
+// has tracer, tracer thread group is parent thread group" case below.)
+//
+// - If the notified party is the parent, and the parent does not exist, the
+// notification is resolved as if ignored. (This is only possible in the
+// sentry. In Linux, the only task / thread group without a parent is global
+// init, and killing global init causes a kernel panic.)
+//
+// - If the notified party is a tracer, the tracer may detach the traced task.
+// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
+//
+// In addition, if the notified party is the parent, the parent may exit and
+// cause the notifying task to be reparented to another thread group. This does
+// not resolve the notification; instead, the notification must be resent to
+// the new parent.
+//
+// The series of notifications generated for a given task's exit depend on
+// whether it is a thread group leader; whether the task is ptraced; and, if
+// so, whether the tracer thread group is the same as the parent thread group.
+//
+// - Non-leader, no tracer: No notification is generated; the task is reaped
+// immediately.
+//
+// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
+// notification is resolved (by waiting or detaching), the task is reaped. (For
+// non-leaders, whether the tracer and parent thread groups are the same is
+// irrelevant.)
+//
+// - Leader, no tracer: The task remains a zombie, with no notification sent,
+// until all other tasks in the thread group are dead. (In Linux terms, this
+// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks
+// are removed from their thread_group list in kernel/exit.c:release_task() =>
+// __exit_signal() => __unhash_process().) Then the thread group's termination
+// signal is sent to the parent. When the parent notification is resolved (by
+// waiting or ignoring), the task is reaped.
+//
+// - Leader, has tracer, tracer thread group is not parent thread group:
+// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by
+// waiting or detaching), and all other tasks in the thread group are dead, the
+// thread group's termination signal is sent to the parent. (Note that the
+// tracer cannot resolve the exit notification by waiting until the thread
+// group is empty.) When the parent notification is resolved, the task is
+// reaped.
+//
+// - Leader, has tracer, tracer thread group is parent thread group:
+//
+// If all other tasks in the thread group are dead, the thread group's
+// termination signal is sent to the parent. At this point, the notification
+// can only be resolved by waiting. If the parent detaches from the task as a
+// tracer, the notification is not resolved, but the notification can now be
+// resolved by waiting or ignoring. When the parent notification is resolved,
+// the task is reaped.
+//
+// If at least one task in the thread group is not dead, SIGCHLD is sent to the
+// parent. At this point, the notification cannot be resolved at all; once the
+// thread group becomes empty, it can be resolved only by waiting. If the
+// parent detaches from the task as a tracer before all remaining tasks die,
+// then exit notification proceeds as in the case where the leader never had a
+// tracer. If the parent detaches from the task as a tracer after all remaining
+// tasks die, the notification is not resolved, but the notification can now be
+// resolved by waiting or ignoring. When the parent notification is resolved,
+// the task is reaped.
+//
+// In both of the above cases, when the parent detaches from the task as a
+// tracer while the thread group is empty, whether the parent resolves the
+// notification by ignoring it is determined by the parent's SIGCHLD signal
+// action, regardless of whether the thread group's termination signal is
+// SIGCHLD (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
+//
+// There is one final wrinkle: A leader can become a non-leader due to a
+// sibling execve. In this case, the execing thread detaches the leader's
+// tracer (if one exists) and reaps the leader immediately. In Linux, this is
+// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
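+//
+// As a concrete illustration of the rules above (derived from this comment,
+// not additional behavior): consider an untraced process consisting of a
+// leader L and one other task T, whose parent performs a consuming wait. When
+// T exits, no notification is generated and T is reaped immediately. When L
+// then exits, the thread group becomes empty, so its termination signal
+// (usually SIGCHLD) is sent to the parent; the parent's consuming wait
+// resolves the notification and L is reaped.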
+
+// +stateify savable
+type runExitNotify struct{}
+
+func (*runExitNotify) execute(t *Task) taskRunState {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
+ t.tg.liveTasks--
+ // Check if this completes a sibling's execve.
+ if t.tg.execing != nil && t.tg.liveTasks == 1 {
+ // execing blocks the addition of new tasks to the thread group, so
+ // the sole living task must be the execing one.
+ e := t.tg.execing
+ e.tg.signalHandlers.mu.Lock()
+ if _, ok := e.stop.(*execStop); ok {
+ e.endInternalStopLocked()
+ }
+ e.tg.signalHandlers.mu.Unlock()
+ }
+ t.exitNotifyLocked(false)
+ // The task goroutine will now exit.
+ return nil
+}
+
+// exitNotifyLocked is called after changes to t's state that affect exit
+// notification.
+//
+// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
+// thanks to Linux's haphazard implementation of this functionality, such cases
+// determine whether parent notifications are ignored based on the parent's
+// handling of SIGCHLD, regardless of what the exited task's thread group's
+// termination signal is.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
+ if t.exitState != TaskExitZombie {
+ return
+ }
+ if !t.exitTracerNotified {
+ t.exitTracerNotified = true
+ tracer := t.Tracer()
+ if tracer == nil {
+ t.exitTracerAcked = true
+ } else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
+ // Don't set exitParentNotified if t is non-leader, even if the
+ // tracer is in the parent thread group, so that if the parent
+ // detaches the following call to exitNotifyLocked passes through
+ // the !exitParentNotified case below and causes t to be reaped
+ // immediately.
+ //
+			// Tracer notification doesn't care about
+ // SIG_IGN/SA_NOCLDWAIT.
+ tracer.tg.signalHandlers.mu.Lock()
+ tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
+ tracer.tg.signalHandlers.mu.Unlock()
+ // Wake EventTraceeStop waiters as well since this task will never
+ // ptrace-stop again.
+ tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
+ } else {
+ // t is a leader and the tracer is in the parent thread group.
+ t.exitParentNotified = true
+ sig := linux.SIGCHLD
+ if t.tg.tasksCount == 1 {
+ sig = t.tg.terminationSignal
+ }
+ // This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
+ // (in Linux, the check in do_notify_parent() is gated by
+ // !tsk->ptrace.)
+ t.parent.tg.signalHandlers.mu.Lock()
+ t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
+ t.parent.tg.signalHandlers.mu.Unlock()
+ // See below for rationale for this event mask.
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
+ }
+ }
+ if t.exitTracerAcked && !t.exitParentNotified {
+ if t != t.tg.leader {
+ t.exitParentNotified = true
+ t.exitParentAcked = true
+ } else if t.tg.tasksCount == 1 {
+ t.exitParentNotified = true
+ if t.parent == nil {
+ t.exitParentAcked = true
+ } else {
+ // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
+ // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
+ // sigaction(2)), then children that terminate do not become
+ // zombies and a call to wait() or waitpid() will block until all
+ // children have terminated, and then fail with errno set to
+ // ECHILD. (The original POSIX standard left the behavior of
+ // setting SIGCHLD to SIG_IGN unspecified. Note that even though
+ // the default disposition of SIGCHLD is "ignore", explicitly
+ // setting the disposition to SIG_IGN results in different
+ // treatment of zombie process children.) Linux 2.6 conforms to
+ // this specification." - wait(2)
+ //
+ // Some undocumented Linux-specific details:
+ //
+ // - All of the above is ignored if the termination signal isn't
+ // SIGCHLD.
+ //
+ // - SA_NOCLDWAIT causes the leader to be immediately reaped, but
+ // does not suppress the SIGCHLD.
+ signalParent := t.tg.terminationSignal.IsValid()
+ t.parent.tg.signalHandlers.mu.Lock()
+ if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
+ if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
+ if act.Handler == arch.SignalActIgnore {
+ t.exitParentAcked = true
+ signalParent = false
+ } else if act.Flags&arch.SignalFlagNoCldWait != 0 {
+ t.exitParentAcked = true
+ }
+ }
+ }
+ if signalParent {
+ t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
+ }
+ t.parent.tg.signalHandlers.mu.Unlock()
+ // If a task in the parent was waiting for a child group stop
+ // or continue, it needs to be notified of the exit, because
+ // there may be no remaining eligible tasks (so that wait
+ // should return ECHILD).
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
+ }
+ }
+ }
+ if t.exitTracerAcked && t.exitParentAcked {
+ t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent {
+ tid := ns.tids[t]
+ delete(ns.tasks, tid)
+ delete(ns.tids, t)
+ if t == t.tg.leader {
+ delete(ns.tgids, t.tg)
+ }
+ }
+ t.tg.exitedCPUStats.Accumulate(t.CPUStats())
+ t.tg.ioUsage.Accumulate(t.ioUsage)
+ t.tg.signalHandlers.mu.Lock()
+ t.tg.tasks.Remove(t)
+ t.tg.tasksCount--
+ tc := t.tg.tasksCount
+ t.tg.signalHandlers.mu.Unlock()
+ if tc == 1 && t != t.tg.leader {
+ // Our fromPtraceDetach doesn't matter here (in Linux terms, this
+ // is via a call to release_task()).
+ t.tg.leader.exitNotifyLocked(false)
+ } else if tc == 0 {
+ t.tg.processGroup.decRefWithParent(t.tg.parentPG())
+ }
+ if t.parent != nil {
+ delete(t.parent.children, t)
+ t.parent = nil
+ }
+ }
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo {
+ info := &arch.SignalInfo{
+ Signo: int32(sig),
+ }
+ info.SetPid(int32(receiver.tg.pidns.tids[t]))
+ info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+ if t.exitStatus.Signaled() {
+ info.Code = arch.CLD_KILLED
+ info.SetStatus(int32(t.exitStatus.Signo))
+ } else {
+ info.Code = arch.CLD_EXITED
+ info.SetStatus(int32(t.exitStatus.Code))
+ }
+ // TODO(b/72102453): Set utime, stime.
+ return info
+}
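+
+// For example (illustrative, derived from exitNotificationSignal above): for
+// a child that exited via exit_group(1), the resulting SignalInfo has Signo
+// set to the notification signal (usually SIGCHLD), Code = CLD_EXITED,
+// Status = 1, and Pid/Uid set to the child's TID and real UID as seen by the
+// receiver; for a child killed by SIGKILL, Code = CLD_KILLED and Status = 9
+// (SIGKILL's signal number).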
+
+// ExitStatus returns t's exit status, which is only guaranteed to be
+// meaningful if t.ExitState() != TaskExitNone.
+func (t *Task) ExitStatus() ExitStatus {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.exitStatus
+}
+
+// ExitStatus returns the exit status that would be returned by a consuming
+// wait*() on tg.
+func (tg *ThreadGroup) ExitStatus() ExitStatus {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+ if tg.exiting {
+ return tg.exitStatus
+ }
+ return tg.leader.exitStatus
+}
+
+// TerminationSignal returns the thread group's termination signal.
+func (tg *ThreadGroup) TerminationSignal() linux.Signal {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ return tg.terminationSignal
+}
+
+// Task events that can be waited for.
+const (
+ // EventExit represents an exit notification generated for a child thread
+ // group leader or a tracee under the conditions specified in the comment
+ // above runExitNotify.
+ EventExit waiter.EventMask = 1 << iota
+
+ // EventChildGroupStop occurs when a child thread group completes a group
+ // stop (i.e. all tasks in the child thread group have entered a stopped
+ // state as a result of a group stop).
+ EventChildGroupStop
+
+ // EventTraceeStop occurs when a task that is ptraced by a task in the
+ // notified thread group enters a ptrace stop (see ptrace(2)).
+ EventTraceeStop
+
+ // EventGroupContinue occurs when a child thread group, or a thread group
+ // whose leader is ptraced by a task in the notified thread group, that had
+ // initiated or completed a group stop leaves the group stop, due to the
+ // child thread group or any task in the child thread group being sent
+ // SIGCONT.
+ EventGroupContinue
+)
+
+// WaitOptions controls the behavior of Task.Wait.
+type WaitOptions struct {
+ // If SpecificTID is non-zero, only events from the task with thread ID
+ // SpecificTID are eligible to be waited for. SpecificTID is resolved in
+ // the PID namespace of the waiter (the method receiver of Task.Wait). If
+ // no such task exists, or that task would not otherwise be eligible to be
+ // waited for by the waiting task, then there are no waitable tasks and
+ // Wait will return ECHILD.
+ SpecificTID ThreadID
+
+ // If SpecificPGID is non-zero, only events from ThreadGroups with a
+ // matching ProcessGroupID are eligible to be waited for. (Same
+ // constraints as SpecificTID apply.)
+ SpecificPGID ProcessGroupID
+
+ // Terminology note: Per waitpid(2), "a clone child is one which delivers
+ // no signal, or a signal other than SIGCHLD to its parent upon
+	// termination." In Linux, the termination signal is technically a per-task
+ // property rather than a per-thread-group property. However, clone()
+ // forces no termination signal for tasks created with CLONE_THREAD, and
+ // execve() resets the termination signal to SIGCHLD, so all
+ // non-group-leader threads have no termination signal and are therefore
+ // "clone tasks".
+
+ // If NonCloneTasks is true, events from non-clone tasks are eligible to be
+ // waited for.
+ NonCloneTasks bool
+
+ // If CloneTasks is true, events from clone tasks are eligible to be waited
+ // for.
+ CloneTasks bool
+
+	// If SiblingChildren is true, events from child tasks of any task in the
+ // in the thread group of the waiter are eligible to be waited for.
+ SiblingChildren bool
+
+ // Events is a bitwise combination of the events defined above that specify
+ // what events are of interest to the call to Wait.
+ Events waiter.EventMask
+
+ // If ConsumeEvent is true, the Wait should consume the event such that it
+ // cannot be returned by a future Wait. Note that if a task exit is
+ // consumed in this way, in most cases the task will be reaped.
+ ConsumeEvent bool
+
+ // If BlockInterruptErr is not nil, Wait will block until either an event
+ // is available or there are no tasks that could produce a waitable event;
+ // if that blocking is interrupted, Wait returns BlockInterruptErr. If
+ // BlockInterruptErr is nil, Wait will not block.
+ BlockInterruptErr error
+}
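+
+// Illustrative sketch (hypothetical mapping, not taken from the syscall
+// layer): a plain wait4(-1, &status, 0, NULL) by a task t would correspond
+// roughly to
+//
+//	t.Wait(&WaitOptions{
+//		Events:            EventExit,
+//		NonCloneTasks:     true,
+//		ConsumeEvent:      true,
+//		BlockInterruptErr: restartErr, // hypothetical ERESTARTSYS-like error
+//	})
+//
+// while WNOHANG would leave BlockInterruptErr nil, WNOWAIT would clear
+// ConsumeEvent, and WUNTRACED / WCONTINUED would add EventChildGroupStop /
+// EventGroupContinue to Events.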
+
+// Preconditions: The TaskSet mutex must be locked (for reading or writing).
+func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool {
+ if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
+ return false
+ }
+ if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
+ return false
+ }
+ // Tracees are always eligible.
+ if tracee {
+ return true
+ }
+ if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
+ return o.NonCloneTasks
+ }
+ return o.CloneTasks
+}
+
+// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
+// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
+// events may exist in the future. (In contrast, if a non-blocking or blocking
+// Wait determines that there are no tasks that can produce a waitable event,
+// Task.Wait returns ECHILD.)
+var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")
+
+// WaitResult contains information about a waited-for event.
+type WaitResult struct {
+ // Task is the task that reported the event.
+ Task *Task
+
+ // TID is the thread ID of Task in the PID namespace of the task that
+ // called Wait (that is, the method receiver of the call to Task.Wait). TID
+ // is provided because consuming exit waits cause the thread ID to be
+ // deallocated.
+ TID ThreadID
+
+ // UID is the real UID of Task in the user namespace of the task that
+ // called Wait.
+ UID auth.UID
+
+ // Event is exactly one of the events defined above.
+ Event waiter.EventMask
+
+ // Status is the numeric status associated with the event.
+ Status uint32
+}
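+
+// Note that Status mirrors Linux's wait status encoding (see the waitCollect*
+// functions below): an exit event carries the target's ExitStatus.Status()
+// value, a group stop is reported as (stop signal << 8) | 0x7f, a group
+// continue as 0xffff, and a ptrace stop as (ptrace code << 8) | 0x7f.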
+
+// Wait waits for an event from a thread group that is a child of t's thread
+// group, or a task in such a thread group, or a task that is ptraced by t,
+// subject to the options specified in opts.
+func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
+ if opts.BlockInterruptErr == nil {
+ return t.waitOnce(opts)
+ }
+ w, ch := waiter.NewChannelEntry(nil)
+ t.tg.eventQueue.EventRegister(&w, opts.Events)
+ defer t.tg.eventQueue.EventUnregister(&w)
+ for {
+ wr, err := t.waitOnce(opts)
+ if err != ErrNoWaitableEvent {
+ // This includes err == nil.
+ return wr, err
+ }
+ if err := t.Block(ch); err != nil {
+ return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr)
+ }
+ }
+}
+
+func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
+ anyWaitableTasks := false
+
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+
+ if opts.SiblingChildren {
+ // We can wait on the children and tracees of any task in the
+ // same thread group.
+ for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
+ wr, any := t.waitParentLocked(opts, parent)
+ if wr != nil {
+ return wr, nil
+ }
+ anyWaitableTasks = anyWaitableTasks || any
+ }
+ } else {
+ // We can only wait on this task.
+ var wr *WaitResult
+ wr, anyWaitableTasks = t.waitParentLocked(opts, t)
+ if wr != nil {
+ return wr, nil
+ }
+ }
+
+ if anyWaitableTasks {
+ return nil, ErrNoWaitableEvent
+ }
+ return nil, syserror.ECHILD
+}
+
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) {
+ anyWaitableTasks := false
+
+ for child := range parent.children {
+ if !opts.matchesTask(child, parent.tg.pidns, false) {
+ continue
+ }
+ // Non-leaders don't notify parents on exit and aren't eligible to
+ // be waited on.
+ if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
+ anyWaitableTasks = true
+ if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
+ return wr, anyWaitableTasks
+ }
+ }
+ // Check for group stops and continues. Tasks that have passed
+ // TaskExitInitiated can no longer participate in group stops.
+ if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
+ continue
+ }
+ if child.exitState >= TaskExitInitiated {
+ continue
+ }
+ // If the waiter is in the same thread group as the task's
+ // tracer, do not report its group stops; they will be reported
+ // as ptrace stops instead. This also skips checking for group
+ // continues, but they'll be checked for when scanning tracees
+ // below. (Per kernel/exit.c:wait_consider_task(): "If a
+ // ptracer wants to distinguish the two events for its own
+ // children, it should create a separate process which takes
+ // the role of real parent.")
+ if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
+ continue
+ }
+ anyWaitableTasks = true
+ if opts.Events&EventChildGroupStop != 0 {
+ if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
+ return wr, anyWaitableTasks
+ }
+ }
+ if opts.Events&EventGroupContinue != 0 {
+ if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
+ return wr, anyWaitableTasks
+ }
+ }
+ }
+ for tracee := range parent.ptraceTracees {
+ if !opts.matchesTask(tracee, parent.tg.pidns, true) {
+ continue
+ }
+ // Non-leaders do notify tracers on exit.
+ if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
+ anyWaitableTasks = true
+ if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
+ return wr, anyWaitableTasks
+ }
+ }
+ if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
+ continue
+ }
+ if tracee.exitState >= TaskExitInitiated {
+ continue
+ }
+ anyWaitableTasks = true
+ if opts.Events&EventTraceeStop != 0 {
+ if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
+ return wr, anyWaitableTasks
+ }
+ }
+ if opts.Events&EventGroupContinue != 0 {
+ if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
+ return wr, anyWaitableTasks
+ }
+ }
+ }
+
+ return nil, anyWaitableTasks
+}
+
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
+ if asPtracer && !target.exitTracerNotified {
+ return nil
+ }
+ if !asPtracer && !target.exitParentNotified {
+ return nil
+ }
+ // Zombied thread group leaders are never waitable until their thread group
+ // is otherwise empty. Usually this is caught by the
+ // target.exitParentNotified check above, but if t is both (in the thread
+ // group of) target's tracer and parent, asPtracer may be true.
+ if target == target.tg.leader && target.tg.tasksCount != 1 {
+ return nil
+ }
+ pid := t.tg.pidns.tids[target]
+ uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
+ status := target.exitStatus.Status()
+ if !opts.ConsumeEvent {
+ return &WaitResult{
+ Task: target,
+ TID: pid,
+ UID: uid,
+ Event: EventExit,
+ Status: status,
+ }
+ }
+ // Surprisingly, the exit status reported by a non-consuming wait can
+ // differ from that reported by a consuming wait; the latter will return
+ // the group exit code if one is available.
+ if target.tg.exiting {
+ status = target.tg.exitStatus.Status()
+ }
+ // t may be (in the thread group of) target's parent, tracer, or both. We
+ // don't need to check for !exitTracerAcked because tracees are detached
+ // here, and we don't need to check for !exitParentAcked because zombies
+ // will be reaped here.
+ if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
+ target.exitTracerAcked = true
+ target.ptraceTracer.Store((*Task)(nil))
+ delete(t.ptraceTracees, target)
+ }
+ if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
+ target.exitParentAcked = true
+ if target == target.tg.leader {
+ // target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
+ // and won't until after target.exitNotifyLocked() (maybe). Include
+ // target.CPUStats() explicitly. This is consistent with Linux,
+ // which accounts an exited task's cputime to its thread group in
+ // kernel/exit.c:release_task() => __exit_signal(), and uses
+ // thread_group_cputime_adjusted() in wait_task_zombie().
+ t.tg.childCPUStats.Accumulate(target.CPUStats())
+ t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
+ t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
+ // Update t's child max resident set size. The size will be the maximum
+			// of this thread's size and all its children's sizes.
+ if t.tg.childMaxRSS < target.tg.maxRSS {
+ t.tg.childMaxRSS = target.tg.maxRSS
+ }
+ if t.tg.childMaxRSS < target.tg.childMaxRSS {
+ t.tg.childMaxRSS = target.tg.childMaxRSS
+ }
+ }
+ }
+ target.exitNotifyLocked(false)
+ return &WaitResult{
+ Task: target,
+ TID: pid,
+ UID: uid,
+ Event: EventExit,
+ Status: status,
+ }
+}
+
+// updateRSSLocked updates t.tg.maxRSS.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) updateRSSLocked() {
+ if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS {
+ t.tg.maxRSS = mmMaxRSS
+ }
+}
+
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ if !target.tg.groupStopWaitable {
+ return nil
+ }
+ pid := t.tg.pidns.tids[target]
+ uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
+ sig := target.tg.groupStopSignal
+ if opts.ConsumeEvent {
+ target.tg.groupStopWaitable = false
+ }
+ return &WaitResult{
+ Task: target,
+ TID: pid,
+ UID: uid,
+ Event: EventChildGroupStop,
+ // There is no name for these status constants.
+ Status: (uint32(sig)&0xff)<<8 | 0x7f,
+ }
+}
+
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ if !target.tg.groupContWaitable {
+ return nil
+ }
+ pid := t.tg.pidns.tids[target]
+ uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
+ if opts.ConsumeEvent {
+ target.tg.groupContWaitable = false
+ }
+ return &WaitResult{
+ Task: target,
+ TID: pid,
+ UID: uid,
+ Event: EventGroupContinue,
+ Status: 0xffff,
+ }
+}
+
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ if target.stop == nil {
+ return nil
+ }
+ if _, ok := target.stop.(*ptraceStop); !ok {
+ return nil
+ }
+ if target.ptraceCode == 0 {
+ return nil
+ }
+ pid := t.tg.pidns.tids[target]
+ uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
+ code := target.ptraceCode
+ if opts.ConsumeEvent {
+ target.ptraceCode = 0
+ }
+ return &WaitResult{
+ Task: target,
+ TID: pid,
+ UID: uid,
+ Event: EventTraceeStop,
+ Status: uint32(code)<<8 | 0x7f,
+ }
+}
+
+// ExitState returns t's current progress through the exit path.
+func (t *Task) ExitState() TaskExitState {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ return t.exitState
+}
+
+// ParentDeathSignal returns t's parent death signal.
+func (t *Task) ParentDeathSignal() linux.Signal {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.parentDeathSignal
+}
+
+// SetParentDeathSignal sets t's parent death signal.
+func (t *Task) SetParentDeathSignal(sig linux.Signal) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.parentDeathSignal = sig
+}
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
new file mode 100644
index 000000000..f98097c2c
--- /dev/null
+++ b/pkg/sentry/kernel/task_futex.go
@@ -0,0 +1,54 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// Futex returns t's futex manager.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Futex() *futex.Manager {
+ return t.tc.fu
+}
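+
+// The methods below make Task implement futex.Target, so the futex manager
+// returned by Futex can load, swap, and compare-and-swap futex words directly
+// in t's address space (and derive shared keys for process-shared futexes)
+// when servicing futex operations on t's behalf.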
+
+// SwapUint32 implements futex.Target.SwapUint32.
+func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
+ return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32.
+func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
+ return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// LoadUint32 implements futex.Target.LoadUint32.
+func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
+ return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// GetSharedKey implements futex.Target.GetSharedKey.
+func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
+ return t.MemoryManager().GetSharedFutexKey(t, addr)
+}
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
new file mode 100644
index 000000000..17f08729a
--- /dev/null
+++ b/pkg/sentry/kernel/task_identity.go
@@ -0,0 +1,568 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Credentials returns t's credentials.
+//
+// This value must be considered immutable.
+func (t *Task) Credentials() *auth.Credentials {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.creds
+}
+
+// UserNamespace returns the user namespace associated with the task.
+func (t *Task) UserNamespace() *auth.UserNamespace {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.creds.UserNamespace
+}
+
+// HasCapabilityIn checks if the task has capability cp in user namespace ns.
+func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.creds.HasCapabilityIn(cp, ns)
+}
+
+// HasCapability checks if the task has capability cp in its user namespace.
+func (t *Task) HasCapability(cp linux.Capability) bool {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.creds.HasCapability(cp)
+}
+
+// SetUID implements the semantics of setuid(2).
+func (t *Task) SetUID(uid auth.UID) error {
+ // setuid considers -1 to be invalid.
+ if !uid.Ok() {
+ return syserror.EINVAL
+ }
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ kuid := t.creds.UserNamespace.MapToKUID(uid)
+ if !kuid.Ok() {
+ return syserror.EINVAL
+ }
+ // "setuid() sets the effective user ID of the calling process. If the
+ // effective UID of the caller is root (more precisely: if the caller has
+ // the CAP_SETUID capability), the real UID and saved set-user-ID are also
+ // set." - setuid(2)
+ if t.creds.HasCapability(linux.CAP_SETUID) {
+ t.setKUIDsUncheckedLocked(kuid, kuid, kuid)
+ return nil
+ }
+ // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID
+ // capability) and uid does not match the real UID or saved set-user-ID of
+ // the calling process."
+ if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID {
+ return syserror.EPERM
+ }
+ t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID)
+ return nil
+}
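+
+// For example (derived from the checks above): a task without CAP_SETUID may
+// switch its effective UID between its real UID and its saved set-user-ID,
+// but requesting any other UID fails with EPERM; a task with CAP_SETUID has
+// all three of its UIDs replaced by the requested UID.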
+
+// SetREUID implements the semantics of setreuid(2).
+func (t *Task) SetREUID(r, e auth.UID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ // "Supplying a value of -1 for either the real or effective user ID forces
+ // the system to leave that ID unchanged." - setreuid(2)
+ newR := t.creds.RealKUID
+ if r.Ok() {
+ newR = t.creds.UserNamespace.MapToKUID(r)
+ if !newR.Ok() {
+ return syserror.EINVAL
+ }
+ }
+ newE := t.creds.EffectiveKUID
+ if e.Ok() {
+ newE = t.creds.UserNamespace.MapToKUID(e)
+ if !newE.Ok() {
+ return syserror.EINVAL
+ }
+ }
+ if !t.creds.HasCapability(linux.CAP_SETUID) {
+ // "Unprivileged processes may only set the effective user ID to the
+ // real user ID, the effective user ID, or the saved set-user-ID."
+ if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID {
+ return syserror.EPERM
+ }
+ // "Unprivileged users may only set the real user ID to the real user
+ // ID or the effective user ID."
+ if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID {
+ return syserror.EPERM
+ }
+ }
+ // "If the real user ID is set (i.e., ruid is not -1) or the effective user
+ // ID is set to a value not equal to the previous real user ID, the saved
+ // set-user-ID will be set to the new effective user ID."
+ newS := t.creds.SavedKUID
+ if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) {
+ newS = newE
+ }
+ t.setKUIDsUncheckedLocked(newR, newE, newS)
+ return nil
+}
+
+// SetRESUID implements the semantics of the setresuid(2) syscall.
+func (t *Task) SetRESUID(r, e, s auth.UID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ // "Unprivileged user processes may change the real UID, effective UID, and
+ // saved set-user-ID, each to one of: the current real UID, the current
+ // effective UID or the current saved set-user-ID. Privileged processes (on
+ // Linux, those having the CAP_SETUID capability) may set the real UID,
+ // effective UID, and saved set-user-ID to arbitrary values. If one of the
+ // arguments equals -1, the corresponding value is not changed." -
+ // setresuid(2)
+ var err error
+ newR := t.creds.RealKUID
+ if r.Ok() {
+ newR, err = t.creds.UseUID(r)
+ if err != nil {
+ return err
+ }
+ }
+ newE := t.creds.EffectiveKUID
+ if e.Ok() {
+ newE, err = t.creds.UseUID(e)
+ if err != nil {
+ return err
+ }
+ }
+ newS := t.creds.SavedKUID
+ if s.Ok() {
+ newS, err = t.creds.UseUID(s)
+ if err != nil {
+ return err
+ }
+ }
+ t.setKUIDsUncheckedLocked(newR, newE, newS)
+ return nil
+}
+
+// Preconditions: t.mu must be locked.
+func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
+ root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
+ oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS
+
+ // "1. If one or more of the real, effective or saved set user IDs was
+ // previously 0, and as a result of the UID changes all of these IDs have a
+ // nonzero value, then all capabilities are cleared from the permitted and
+ // effective capability sets." - capabilities(7)
+ if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) {
+ // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's
+ // "keep capabilities" flag, which determines whether the thread's permitted
+ // capability set is cleared when a change is made to the
+ // thread's user IDs such that the thread's real UID, effective
+ // UID, and saved set-user-ID all become nonzero when at least
+ // one of them previously had the value 0. By default, the
+ // permitted capability set is cleared when such a change is
+ // made; setting the "keep capabilities" flag prevents it from
+ // being cleared." (A thread's effective capability set is always
+ // cleared when such a credential change is made,
+ // regardless of the setting of the "keep capabilities" flag.)
+ if !t.creds.KeepCaps {
+ t.creds.PermittedCaps = 0
+ t.creds.EffectiveCaps = 0
+ }
+ }
+ // """
+ // 2. If the effective user ID is changed from 0 to nonzero, then all
+ // capabilities are cleared from the effective set.
+ //
+ // 3. If the effective user ID is changed from nonzero to 0, then the
+ // permitted set is copied to the effective set.
+ // """
+ if oldE == root && newE != root {
+ t.creds.EffectiveCaps = 0
+ } else if oldE != root && newE == root {
+ t.creds.EffectiveCaps = t.creds.PermittedCaps
+ }
+ // "4. If the filesystem user ID is changed from 0 to nonzero (see
+ // setfsuid(2)), then the following capabilities are cleared from the
+ // effective set: ..."
+ // (filesystem UIDs aren't implemented, nor are any of the capabilities in
+ // question)
+
+ // Not documented, but compare Linux's kernel/cred.c:commit_creds().
+ if oldE != newE {
+ t.parentDeathSignal = 0
+ }
+}
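+
+// For example (derived from the capabilities(7) rules applied above): a task
+// whose real, effective, and saved UIDs are all 0 and that switches all three
+// to nonzero values without the "keep capabilities" flag loses both its
+// permitted and effective capabilities; if it instead changes only its
+// effective UID to a nonzero value, only the effective set is cleared, and
+// changing the effective UID back to 0 restores the effective set from the
+// permitted set.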
+
+// SetGID implements the semantics of setgid(2).
+func (t *Task) SetGID(gid auth.GID) error {
+ if !gid.Ok() {
+ return syserror.EINVAL
+ }
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ kgid := t.creds.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+ if t.creds.HasCapability(linux.CAP_SETGID) {
+ t.setKGIDsUncheckedLocked(kgid, kgid, kgid)
+ return nil
+ }
+ if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID {
+ return syserror.EPERM
+ }
+ t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID)
+ return nil
+}
+
+// SetREGID implements the semantics of setregid(2).
+func (t *Task) SetREGID(r, e auth.GID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ newR := t.creds.RealKGID
+ if r.Ok() {
+ newR = t.creds.UserNamespace.MapToKGID(r)
+ if !newR.Ok() {
+ return syserror.EINVAL
+ }
+ }
+ newE := t.creds.EffectiveKGID
+ if e.Ok() {
+ newE = t.creds.UserNamespace.MapToKGID(e)
+ if !newE.Ok() {
+ return syserror.EINVAL
+ }
+ }
+ if !t.creds.HasCapability(linux.CAP_SETGID) {
+ if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID {
+ return syserror.EPERM
+ }
+ if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID {
+ return syserror.EPERM
+ }
+ }
+ newS := t.creds.SavedKGID
+ if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) {
+ newS = newE
+ }
+ t.setKGIDsUncheckedLocked(newR, newE, newS)
+ return nil
+}
+
+// SetRESGID implements the semantics of the setresgid(2) syscall.
+func (t *Task) SetRESGID(r, e, s auth.GID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ var err error
+ newR := t.creds.RealKGID
+ if r.Ok() {
+ newR, err = t.creds.UseGID(r)
+ if err != nil {
+ return err
+ }
+ }
+ newE := t.creds.EffectiveKGID
+ if e.Ok() {
+ newE, err = t.creds.UseGID(e)
+ if err != nil {
+ return err
+ }
+ }
+ newS := t.creds.SavedKGID
+ if s.Ok() {
+ newS, err = t.creds.UseGID(s)
+ if err != nil {
+ return err
+ }
+ }
+ t.setKGIDsUncheckedLocked(newR, newE, newS)
+ return nil
+}
+
+func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
+ oldE := t.creds.EffectiveKGID
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS
+
+ // Not documented, but compare Linux's kernel/cred.c:commit_creds().
+ if oldE != newE {
+ t.parentDeathSignal = 0
+ }
+}
+
+// SetExtraGIDs attempts to change t's supplemental groups. All IDs are
+// interpreted as being in t's user namespace.
+func (t *Task) SetExtraGIDs(gids []auth.GID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if !t.creds.HasCapability(linux.CAP_SETGID) {
+ return syserror.EPERM
+ }
+ kgids := make([]auth.KGID, len(gids))
+ for i, gid := range gids {
+ kgid := t.creds.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+ kgids[i] = kgid
+ }
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.ExtraKGIDs = kgids
+ return nil
+}
+
+// SetCapabilitySets attempts to change t's permitted, inheritable, and
+// effective capability sets.
+func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ // "Permitted: This is a limiting superset for the effective capabilities
+ // that the thread may assume." - capabilities(7)
+ if effective & ^permitted != 0 {
+ return syserror.EPERM
+ }
+ // "It is also a limiting superset for the capabilities that may be added
+ // to the inheritable set by a thread that does not have the CAP_SETPCAP
+ // capability in its effective set."
+ if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) {
+ return syserror.EPERM
+ }
+ // "If a thread drops a capability from its permitted set, it can never
+ // reacquire that capability (unless it execve(2)s ..."
+ if permitted & ^t.creds.PermittedCaps != 0 {
+ return syserror.EPERM
+ }
+ // "... if a capability is not in the bounding set, then a thread can't add
+ // this capability to its inheritable set, even if it was in its permitted
+ // capabilities ..."
+ if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 {
+ return syserror.EPERM
+ }
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.PermittedCaps = permitted
+ t.creds.InheritableCaps = inheritable
+ t.creds.EffectiveCaps = effective
+ return nil
+}
+
+// DropBoundingCapability attempts to drop capability cp from t's capability
+// bounding set.
+func (t *Task) DropBoundingCapability(cp linux.Capability) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if !t.creds.HasCapability(linux.CAP_SETPCAP) {
+ return syserror.EPERM
+ }
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
+ return nil
+}
+
+// SetUserNamespace attempts to move t into ns.
+func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ // "A process reassociating itself with a user namespace must have the
+ // CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
+ //
+ // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
+ // in ns (by rule 3 in auth.Credentials.HasCapability).
+ if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
+ return syserror.EPERM
+ }
+
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.UserNamespace = ns
+ // "The child process created by clone(2) with the CLONE_NEWUSER flag
+ // starts out with a complete set of capabilities in the new user
+ // namespace. Likewise, a process that creates a new user namespace using
+ // unshare(2) or joins an existing user namespace using setns(2) gains a
+ // full set of capabilities in that namespace."
+ t.creds.PermittedCaps = auth.AllCapabilities
+ t.creds.InheritableCaps = 0
+ t.creds.EffectiveCaps = auth.AllCapabilities
+ t.creds.BoundingCaps = auth.AllCapabilities
+ // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
+ // flag sets the "securebits" flags (see capabilities(7)) to their default
+ // values (all flags disabled) in the child (for clone(2)) or caller (for
+ // unshare(2), or setns(2)." - user_namespaces(7)
+ t.creds.KeepCaps = false
+
+ return nil
+}
+
+// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS.
+func (t *Task) SetKeepCaps(k bool) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.creds = t.creds.Fork() // See doc for creds.
+ t.creds.KeepCaps = k
+}
+
+// updateCredsForExecLocked updates t.creds to reflect an execve().
+//
+// NOTE(b/30815691): We currently do not implement privileged executables
+// (set-user/group-ID bits and file capabilities). This allows us to make a lot
+// of simplifying assumptions:
+//
+// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which
+// disables the features we don't support anyway, is always set. This
+// drastically simplifies this function.
+//
+// - We don't implement AT_SECURE, because no_new_privs always being set means
+// that the conditions that require AT_SECURE never arise. (Compare Linux's
+// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().)
+//
+// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since
+// seccomp-bpf is also allowed if the task has no_new_privs set.
+//
+// - Task.ptraceAttach does not serialize with execve as it does in Linux,
+// since no_new_privs being set has the same effect as the presence of an
+// unprivileged tracer.
+//
+// Preconditions: t.mu must be locked.
+func (t *Task) updateCredsForExecLocked() {
+ // """
+ // During an execve(2), the kernel calculates the new capabilities of
+ // the process using the following algorithm:
+ //
+ // P'(permitted) = (P(inheritable) & F(inheritable)) |
+ // (F(permitted) & cap_bset)
+ //
+ // P'(effective) = F(effective) ? P'(permitted) : 0
+ //
+ // P'(inheritable) = P(inheritable) [i.e., unchanged]
+ //
+ // where:
+ //
+ // P denotes the value of a thread capability set before the
+ // execve(2)
+ //
+ // P' denotes the value of a thread capability set after the
+ // execve(2)
+ //
+ // F denotes a file capability set
+ //
+ // cap_bset is the value of the capability bounding set
+ //
+ // ...
+ //
+ // In order to provide an all-powerful root using capability sets, during
+ // an execve(2):
+ //
+ // 1. If a set-user-ID-root program is being executed, or the real user ID
+ // of the process is 0 (root) then the file inheritable and permitted sets
+ // are defined to be all ones (i.e. all capabilities enabled).
+ //
+ // 2. If a set-user-ID-root program is being executed, then the file
+ // effective bit is defined to be one (enabled).
+ //
+ // The upshot of the above rules, combined with the capabilities
+ // transformations described above, is that when a process execve(2)s a
+ // set-user-ID-root program, or when a process with an effective UID of 0
+ // execve(2)s a program, it gains all capabilities in its permitted and
+ // effective capability sets, except those masked out by the capability
+ // bounding set.
+ // """ - capabilities(7)
+ // (ambient capability sets omitted)
+ //
+ // As the last paragraph implies, the case of "a set-user-ID root program
+ // is being executed" also includes the case where (namespace) root is
+ // executing a non-set-user-ID program; the actual check is just based on
+ // the effective user ID.
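+	//
+	// Illustration (restating the code below, not additional behavior): a
+	// task whose real or effective KUID is (namespace) root ends up with
+	// permitted = old permitted & (inheritable | bounding) and, if its
+	// effective KUID is root, effective = permitted (otherwise effective =
+	// 0); any other task ends up with permitted = 0 and effective = 0, since
+	// file capabilities are unimplemented (F(permitted) == F(inheritable) ==
+	// 0).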
+ var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0
+ fileEffective := false
+ root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
+ if t.creds.EffectiveKUID == root || t.creds.RealKUID == root {
+ newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps
+ if t.creds.EffectiveKUID == root {
+ fileEffective = true
+ }
+ }
+
+ t.creds = t.creds.Fork() // See doc for creds.
+
+ // Now we enter poorly-documented, somewhat confusing territory. (The
+ // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds
+ // is not very helpful.) My reading of it is:
+ //
+ // If at least one of the following is true:
+ //
+ // A1. The execing task is ptraced, and the tracer did not have
+ // CAP_SYS_PTRACE in the execing task's user namespace at the time of
+ // PTRACE_ATTACH.
+ //
+ // A2. The execing task shares its FS context with at least one task in
+ // another thread group.
+ //
+ // A3. The execing task has no_new_privs set.
+ //
+ // AND at least one of the following is true:
+ //
+ // B1. The new effective user ID (which may come from set-user-ID, or be the
+ // execing task's existing effective user ID) is not equal to the task's
+ // real UID.
+ //
+ // B2. The new effective group ID (which may come from set-group-ID, or be
+ // the execing task's existing effective group ID) is not equal to the
+ // task's real GID.
+ //
+ // B3. The new permitted capability set contains capabilities not in the
+ // task's permitted capability set.
+ //
+ // Then:
+ //
+ // C1. Limit the new permitted capability set to the task's permitted
+ // capability set.
+ //
+ // C2. If either the task does not have CAP_SETUID in its user namespace, or
+ // the task has no_new_privs set, force the new effective UID and GID to
+ // the task's real UID and GID.
+ //
+ // But since no_new_privs is always set (A3 is always true), this becomes
+ // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1
+ // is a no-op. So we can just do C1 and C2 unconditionally.
+ if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
+ t.creds.EffectiveKUID = t.creds.RealKUID
+ t.creds.EffectiveKGID = t.creds.RealKGID
+ t.parentDeathSignal = 0
+ }
+ // (Saved set-user-ID is always set to the new effective user ID, and saved
+ // set-group-ID is always set to the new effective group ID, regardless of
+ // the above.)
+ t.creds.SavedKUID = t.creds.RealKUID
+ t.creds.SavedKGID = t.creds.RealKGID
+ t.creds.PermittedCaps &= newPermitted
+ if fileEffective {
+ t.creds.EffectiveCaps = t.creds.PermittedCaps
+ } else {
+ t.creds.EffectiveCaps = 0
+ }
+
+ // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
+ // calls to execve(2).
+ t.creds.KeepCaps = false
+
+ // "The bounding set is inherited at fork(2) from the thread's parent, and
+ // is preserved across an execve(2)". So we're done.
+}
diff --git a/pkg/sentry/kernel/task_list.go b/pkg/sentry/kernel/task_list.go
new file mode 100755
index 000000000..57d3f098d
--- /dev/null
+++ b/pkg/sentry/kernel/task_list.go
@@ -0,0 +1,173 @@
+package kernel
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type taskElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (taskElementMapper) linkerFor(elem *Task) *Task { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type taskList struct {
+ head *Task
+ tail *Task
+}
+
+// Reset resets list l to the empty state.
+func (l *taskList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *taskList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *taskList) Front() *Task {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *taskList) Back() *Task {
+ return l.tail
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *taskList) PushFront(e *Task) {
+ taskElementMapper{}.linkerFor(e).SetNext(l.head)
+ taskElementMapper{}.linkerFor(e).SetPrev(nil)
+
+ if l.head != nil {
+ taskElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *taskList) PushBack(e *Task) {
+ taskElementMapper{}.linkerFor(e).SetNext(nil)
+ taskElementMapper{}.linkerFor(e).SetPrev(l.tail)
+
+ if l.tail != nil {
+ taskElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *taskList) PushBackList(m *taskList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ taskElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ taskElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *taskList) InsertAfter(b, e *Task) {
+ a := taskElementMapper{}.linkerFor(b).Next()
+ taskElementMapper{}.linkerFor(e).SetNext(a)
+ taskElementMapper{}.linkerFor(e).SetPrev(b)
+ taskElementMapper{}.linkerFor(b).SetNext(e)
+
+ if a != nil {
+ taskElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *taskList) InsertBefore(a, e *Task) {
+ b := taskElementMapper{}.linkerFor(a).Prev()
+ taskElementMapper{}.linkerFor(e).SetNext(a)
+ taskElementMapper{}.linkerFor(e).SetPrev(b)
+ taskElementMapper{}.linkerFor(a).SetPrev(e)
+
+ if b != nil {
+ taskElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *taskList) Remove(e *Task) {
+ prev := taskElementMapper{}.linkerFor(e).Prev()
+ next := taskElementMapper{}.linkerFor(e).Next()
+
+ if prev != nil {
+ taskElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ taskElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type taskEntry struct {
+ next *Task
+ prev *Task
+}
+
+// Next returns the entry that follows e in the list.
+func (e *taskEntry) Next() *Task {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *taskEntry) Prev() *Task {
+ return e.prev
+}
+
+// SetNext assigns 'elem' as the entry that follows e in the list.
+func (e *taskEntry) SetNext(elem *Task) {
+ e.next = elem
+}
+
+// SetPrev assigns 'elem' as the entry that precedes e in the list.
+func (e *taskEntry) SetPrev(elem *Task) {
+ e.prev = elem
+}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
new file mode 100644
index 000000000..e0e57e8bd
--- /dev/null
+++ b/pkg/sentry/kernel/task_log.go
@@ -0,0 +1,137 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sort"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // maxStackDebugBytes is the maximum number of user stack bytes that may be
+ // printed by debugDumpStack.
+ maxStackDebugBytes = 1024
+)
+
+// Infof logs a formatted info message by calling log.Infof.
+func (t *Task) Infof(fmt string, v ...interface{}) {
+ if log.IsLogging(log.Info) {
+ log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+ }
+}
+
+// Warningf logs a warning string by calling log.Warningf.
+func (t *Task) Warningf(fmt string, v ...interface{}) {
+ if log.IsLogging(log.Warning) {
+ log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+ }
+}
+
+// Debugf logs a formatted debug message by calling log.Debugf.
+func (t *Task) Debugf(fmt string, v ...interface{}) {
+ if log.IsLogging(log.Debug) {
+ log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+ }
+}
+
+// IsLogging returns true iff this level is being logged.
+func (t *Task) IsLogging(level log.Level) bool {
+ return log.IsLogging(level)
+}
+
+// DebugDumpState logs task state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) DebugDumpState() {
+ t.debugDumpRegisters()
+ t.debugDumpStack()
+ if mm := t.MemoryManager(); mm != nil {
+ t.Debugf("Mappings:\n%s", mm)
+ }
+ t.Debugf("FDMap:\n%s", t.fds)
+}
+
+// debugDumpRegisters logs register state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpRegisters() {
+ if !t.IsLogging(log.Debug) {
+ return
+ }
+ regmap, err := t.Arch().RegisterMap()
+ if err != nil {
+ t.Debugf("Registers: %v", err)
+ } else {
+ t.Debugf("Registers:")
+ var regs []string
+ for reg := range regmap {
+ regs = append(regs, reg)
+ }
+ sort.Strings(regs)
+ for _, reg := range regs {
+ t.Debugf("%-8s = %016x", reg, regmap[reg])
+ }
+ }
+}
+
+// debugDumpStack logs user stack contents at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpStack() {
+ if !t.IsLogging(log.Debug) {
+ return
+ }
+ m := t.MemoryManager()
+ if m == nil {
+ t.Debugf("Memory manager for task is gone, skipping application stack dump.")
+ return
+ }
+ t.Debugf("Stack:")
+ start := usermem.Addr(t.Arch().Stack())
+ // Round addr down to a 16-byte boundary.
+ start &= ^usermem.Addr(15)
+ // Print 16 bytes per line, one byte at a time.
+ for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 {
+ addr, ok := start.AddLength(offset)
+ if !ok {
+ break
+ }
+ var data [16]byte
+ n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ // Print as much of the line as we can, even if an error was
+ // encountered.
+ if n > 0 {
+ t.Debugf("%x: % x", addr, data[:n])
+ }
+ if err != nil {
+ t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
+ break
+ }
+ }
+}
+
+// updateLogPrefixLocked updates the task's cached log prefix to reflect its
+// current thread ID.
+//
+// Preconditions: The task's owning TaskSet.mu must be locked.
+func (t *Task) updateLogPrefixLocked() {
+ // Use the task's TID in the root PID namespace for logging.
+ t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t]))
+}
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
new file mode 100644
index 000000000..04c684c1a
--- /dev/null
+++ b/pkg/sentry/kernel/task_net.go
@@ -0,0 +1,35 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+)
+
+// IsNetworkNamespaced returns true if t is in a non-root network namespace.
+func (t *Task) IsNetworkNamespaced() bool {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.netns
+}
+
+// NetworkContext returns the network stack used by the task. NetworkContext
+// may return nil if no network stack is available.
+func (t *Task) NetworkContext() inet.Stack {
+ if t.IsNetworkNamespaced() {
+ return nil
+ }
+ return t.k.networkStack
+}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
new file mode 100644
index 000000000..a79101a18
--- /dev/null
+++ b/pkg/sentry/kernel/task_run.go
@@ -0,0 +1,340 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "runtime"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils to avoid unnecessary allocation.
+type taskRunState interface {
+ // execute executes the code associated with this state over the given task
+ // and returns the following state. If execute returns nil, the task
+ // goroutine should exit.
+ //
+ // It is valid to tail-call a following state's execute to avoid the
+ // overhead of converting the following state to an interface object and
+ // checking for stops, provided that the tail-call cannot recurse.
+ execute(*Task) taskRunState
+}
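+
+// The concrete run states used in this file are runApp (defined below),
+// runInterrupt, and runExit, each of which is a data-free struct whose
+// execute method implements one node of the state machine.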
+
+// run runs the task goroutine.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace
+// to make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
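+// For example, in a goroutine dump, a frame such as
+// "kernel.(*Task).run(0x4)" would correspond to the task with root TID 4
+// (the exact formatting of argument values depends on the Go runtime).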
+func (t *Task) run(threadID uintptr) {
+ // Construct t.blockingTimer here. We do this here because we can't
+ // reconstruct t.blockingTimer during restore in Task.afterLoad(), because
+ // kernel.timekeeper.SetClocks() hasn't been called yet.
+ blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+ t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+ defer t.blockingTimer.Destroy()
+ t.blockingTimerChan = blockingTimerChan
+
+ // Activate our address space.
+ t.Activate()
+ // The corresponding t.Deactivate occurs in the exit path
+ // (runExitMain.execute) so that when
+ // Platform.CooperativelySharesAddressSpace() == true, we give up the
+ // AddressSpace before the task goroutine finishes executing.
+
+ // If this is a newly-started task, it should check for participation in
+ // group stops. If this is a task resuming after restore, it was
+ // interrupted by saving. In either case, the task is initially
+ // interrupted.
+ t.interruptSelf()
+
+ for {
+ // Explanation for this ordering:
+ //
+ // - A freshly-started task that is stopped should not do anything
+ // before it enters the stop.
+ //
+ // - If taskRunState.execute returns nil, the task goroutine should
+ // exit without checking for a stop.
+ //
+ // - Task.Start won't start Task.run if t.runState is nil, so this
+ // ordering is safe.
+ t.doStop()
+ t.runState = t.runState.execute(t)
+ if t.runState == nil {
+ t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
+ t.goroutineStopped.Done()
+ t.tg.liveGoroutines.Done()
+ t.tg.pidns.owner.liveGoroutines.Done()
+ t.tg.pidns.owner.runningGoroutines.Done()
+
+ // Keep the argument alive because stack traces for dead variables may not be correct.
+ runtime.KeepAlive(threadID)
+ return
+ }
+ }
+}
+
+// doStop is called by Task.run to block until the task is not stopped.
+func (t *Task) doStop() {
+ if atomic.LoadInt32(&t.stopCount) == 0 {
+ return
+ }
+ t.Deactivate()
+ // NOTE(b/30316266): t.Activate() must be called without any locks held, so
+ // this defer must precede the defer for unlocking the signal mutex.
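+ // (Deferred calls run in LIFO order, so registering this defer first
+ // ensures t.Activate() runs last, after the mutex-unlocking defers below.)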
+ defer t.Activate()
+ t.accountTaskGoroutineEnter(TaskGoroutineStopped)
+ defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.tg.pidns.owner.runningGoroutines.Add(-1)
+ defer t.tg.pidns.owner.runningGoroutines.Add(1)
+ t.goroutineStopped.Add(-1)
+ defer t.goroutineStopped.Add(1)
+ for t.stopCount > 0 {
+ t.endStopCond.Wait()
+ }
+}
+
+// The runApp state checks for interrupts before executing untrusted
+// application code.
+//
+// +stateify savable
+type runApp struct{}
+
+func (*runApp) execute(t *Task) taskRunState {
+ if t.interrupted() {
+ // Checkpointing instructs tasks to stop by sending an interrupt, so we
+ // must check for stops before entering runInterrupt (instead of
+ // tail-calling it).
+ return (*runInterrupt)(nil)
+ }
+
+ // We're about to switch to the application again. If there's still an
+ // unhandled SyscallRestartErrno that wasn't translated to an EINTR,
+ // restart the syscall that was interrupted. If there's a saved signal
+ // mask, restore it. (Note that restoring the saved signal mask may unblock
+ // a pending signal, causing another interruption, but that signal should
+ // not interact with the interrupted syscall.)
+ if t.haveSyscallReturn {
+ if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre == ERESTART_RESTARTBLOCK {
+ t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
+ t.Arch().RestartSyscallWithRestartBlock()
+ } else {
+ t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
+ t.Arch().RestartSyscall()
+ }
+ }
+ t.haveSyscallReturn = false
+ }
+ if t.haveSavedSignalMask {
+ t.SetSignalMask(t.savedSignalMask)
+ t.haveSavedSignalMask = false
+ if t.interrupted() {
+ return (*runInterrupt)(nil)
+ }
+ }
+
+ // Apply restartable sequences.
+ if t.rseqPreempted {
+ t.rseqPreempted = false
+ if t.rseqCPUAddr != 0 {
+ cpu := int32(hostcpu.GetCPU())
+ if t.rseqCPU != cpu {
+ t.rseqCPU = cpu
+ if err := t.rseqCopyOutCPU(); err != nil {
+ t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+ t.forceSignal(linux.SIGSEGV, false)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ // Re-enter the task run loop for signal delivery.
+ return (*runApp)(nil)
+ }
+ }
+ }
+ t.rseqInterrupt()
+ }
+
+ // Check if we need to enable single-stepping. Tracers expect that the
+ // kernel preserves the value of the single-step flag set by PTRACE_SETREGS
+ // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
+ // includes our ptrace platform, by the way), so we should only clear the
+ // single-step flag if we're responsible for setting it. (clearSinglestep
+ // is therefore analogous to Linux's TIF_FORCED_TF.)
+ //
+ // Strictly speaking, we should also not clear the single-step flag if we
+ // single-step through an instruction that sets the single-step flag
+ // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
+ // own TF. (Famous last words, I know.)
+ clearSinglestep := false
+ if t.hasTracer() {
+ t.tg.pidns.owner.mu.RLock()
+ if t.ptraceSinglestep {
+ clearSinglestep = !t.Arch().SingleStep()
+ t.Arch().SetSingleStep()
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+ }
+
+ t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
+ info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
+ t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+
+ if clearSinglestep {
+ t.Arch().ClearSingleStep()
+ }
+
+ switch err {
+ case nil:
+ // Handle application system call.
+ return t.doSyscall()
+
+ case platform.ErrContextInterrupt:
+ // Interrupted by platform.Context.Interrupt(). Re-enter the run
+ // loop to figure out why.
+ return (*runApp)(nil)
+
+ case platform.ErrContextSignalCPUID:
+ // Is this a CPUID instruction?
+ expected := arch.CPUIDInstruction[:]
+ found := make([]byte, len(expected))
+ _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+ if err == nil && bytes.Equal(expected, found) {
+ // Skip the cpuid instruction.
+ t.Arch().CPUIDEmulate(t)
+ t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+
+ // Resume execution.
+ return (*runApp)(nil)
+ }
+
+ // The instruction at the given RIP was not a CPUID, so we fall
+ // through to the default signal delivery behavior below.
+ fallthrough
+
+ case platform.ErrContextSignal:
+ // Looks like a signal has been delivered to us. If it's a synchronous
+ // signal (SEGV, SIGBUS, etc.), it should be sent to the application
+ // thread that received it.
+ sig := linux.Signal(info.Signo)
+
+ // Was it a fault that we should handle internally? If so, this wasn't
+ // an application-generated signal and we should continue execution
+ // normally.
+ if at.Any() {
+ addr := usermem.Addr(info.Addr())
+ err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+ if err == nil {
+ // The fault was handled appropriately.
+ // We can resume running the application.
+ return (*runApp)(nil)
+ }
+
+ // Is this a vsyscall that we need to emulate?
+ if at.Execute {
+ if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
+ return t.doVsyscall(addr, sysno)
+ }
+ }
+
+ // Faults are common, log only at debug level.
+ t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
+ t.DebugDumpState()
+
+ // Continue to signal handling.
+ //
+ // If the fault failed with a BusError, convert the SIGSEGV to a
+ // SIGBUS. All other info bits stay the same (address, etc.).
+ if _, ok := err.(*memmap.BusError); ok {
+ sig = linux.SIGBUS
+ info.Signo = int32(linux.SIGBUS)
+ }
+ }
+
+ switch sig {
+ case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
+ // Synchronous signal. Send it to ourselves. Assume the signal is
+ // legitimate and force it (work around the signal being ignored or
+ // blocked) like Linux does. Conveniently, this is even the correct
+ // behavior for SIGTRAP from single-stepping.
+ t.forceSignal(linux.Signal(sig), false /* unconditional */)
+ t.SendSignal(info)
+
+ case platform.SignalInterrupt:
+ // Assume that a call to platform.Context.Interrupt() misfired.
+
+ case linux.SIGPROF:
+ // It's a profiling interrupt: there's not much we can do. We've
+ // already paid a decent cost by intercepting the signal; at this
+ // point we simply ignore it.
+
+ default:
+ // Asynchronous signal. Let the system deal with it.
+ t.k.sendExternalSignal(info, "application")
+ }
+
+ return (*runApp)(nil)
+
+ case platform.ErrContextCPUPreempted:
+ // Ensure that RSEQ critical sections are interrupted and per-thread
+ // CPU values are updated before the next platform.Context.Switch().
+ t.rseqPreempted = true
+ return (*runApp)(nil)
+
+ default:
+ // What happened? Can't continue.
+ t.Warningf("Unexpected SwitchToApp error: %v", err)
+ t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
+ return (*runExit)(nil)
+ }
+}
+
+// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
+func (t *Task) waitGoroutineStoppedOrExited() {
+ t.goroutineStopped.Wait()
+}
+
+// WaitExited blocks until all task goroutines in tg have exited.
+//
+// WaitExited does not correspond to anything in Linux; it's provided so that
+// external callers of Kernel.CreateProcess can wait for the created thread
+// group to terminate.
+func (tg *ThreadGroup) WaitExited() {
+ tg.liveGoroutines.Wait()
+}
+
+// Yield yields the processor for the calling task.
+func (t *Task) Yield() {
+ atomic.AddUint64(&t.yieldCount, 1)
+ runtime.Gosched()
+}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
new file mode 100644
index 000000000..5455f6ea9
--- /dev/null
+++ b/pkg/sentry/kernel/task_sched.go
@@ -0,0 +1,637 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// CPU scheduling, real and fake.
+
+import (
+ "fmt"
+ "math/rand"
+ "sync/atomic"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TaskGoroutineState is a coarse representation of the current execution
+// status of a kernel.Task goroutine.
+type TaskGoroutineState int
+
+const (
+ // TaskGoroutineNonexistent indicates that the task goroutine has either
+ // not yet been created by Task.Start() or has returned from Task.run().
+ // This must be the zero value for TaskGoroutineState.
+ TaskGoroutineNonexistent TaskGoroutineState = iota
+
+ // TaskGoroutineRunningSys indicates that the task goroutine is executing
+ // sentry code.
+ TaskGoroutineRunningSys
+
+ // TaskGoroutineRunningApp indicates that the task goroutine is executing
+ // application code.
+ TaskGoroutineRunningApp
+
+ // TaskGoroutineBlockedInterruptible indicates that the task goroutine is
+ // blocked in Task.block(), and hence may be woken by Task.interrupt()
+ // (e.g. due to signal delivery).
+ TaskGoroutineBlockedInterruptible
+
+ // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
+ // stopped outside of Task.block() and Task.doStop(), and hence cannot be
+ // woken by Task.interrupt().
+ TaskGoroutineBlockedUninterruptible
+
+ // TaskGoroutineStopped indicates that the task goroutine is blocked in
+ // Task.doStop(). TaskGoroutineStopped is similar to
+ // TaskGoroutineBlockedUninterruptible, but is a separate state to make it
+ // possible to determine when Task.stop is meaningful.
+ TaskGoroutineStopped
+)
+
+// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
+// be read and updated atomically.
+//
+// +stateify savable
+type TaskGoroutineSchedInfo struct {
+ // Timestamp was the value of Kernel.cpuClock when this
+ // TaskGoroutineSchedInfo was last updated.
+ Timestamp uint64
+
+ // State is the current state of the task goroutine.
+ State TaskGoroutineState
+
+ // UserTicks is the amount of time the task goroutine has spent executing
+ // its associated Task's application code, in units of linux.ClockTick.
+ UserTicks uint64
+
+ // SysTicks is the amount of time the task goroutine has spent executing in
+ // the sentry, in units of linux.ClockTick.
+ SysTicks uint64
+}
+
+// userTicksAt returns the extrapolated value of ts.UserTicks after
+// Kernel.CPUClockNow() indicates a time of now.
+//
+// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
+// monotonic, this is satisfied if now is the result of a previous call to
+// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
+// change to t.gosched can cause userTicksAt to adjust stats by too much,
+// making the observed stats non-monotonic.
+func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 {
+ if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp {
+ // Update stats to reflect execution since the last update.
+ return ts.UserTicks + (now - ts.Timestamp)
+ }
+ return ts.UserTicks
+}
+
+// sysTicksAt returns the extrapolated value of ts.SysTicks after
+// Kernel.CPUClockNow() indicates a time of now.
+//
+// Preconditions: As for userTicksAt.
+func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 {
+ if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys {
+ return ts.SysTicks + (now - ts.Timestamp)
+ }
+ return ts.SysTicks
+}
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
+ now := t.k.CPUClockNow()
+ if t.gosched.State != TaskGoroutineRunningSys {
+ panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
+ }
+ t.goschedSeq.BeginWrite()
+ // This function is very hot; avoid defer.
+ t.gosched.SysTicks += now - t.gosched.Timestamp
+ t.gosched.Timestamp = now
+ t.gosched.State = state
+ t.goschedSeq.EndWrite()
+}
+
+// Preconditions: The caller must be running on the task goroutine, and leaving
+// a state indicated by a previous call to
+// t.accountTaskGoroutineEnter(state).
+func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
+ now := t.k.CPUClockNow()
+ if t.gosched.State != state {
+ panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
+ }
+ t.goschedSeq.BeginWrite()
+ // This function is very hot; avoid defer.
+ if state == TaskGoroutineRunningApp {
+ t.gosched.UserTicks += now - t.gosched.Timestamp
+ }
+ t.gosched.Timestamp = now
+ t.gosched.State = TaskGoroutineRunningSys
+ t.goschedSeq.EndWrite()
+}
+
+// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
+// Most clients should use t.CPUStats() instead.
+func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
+ return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
+}
+
+// CPUStats returns the CPU usage statistics of t.
+func (t *Task) CPUStats() usage.CPUStats {
+ return t.cpuStatsAt(t.k.CPUClockNow())
+}
+
+// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt.
+func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
+ tsched := t.TaskGoroutineSchedInfo()
+ return usage.CPUStats{
+ UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)),
+ SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)),
+ VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
+ }
+}
+
+// CPUStats returns the combined CPU usage statistics of all past and present
+// threads in tg.
+func (tg *ThreadGroup) CPUStats() usage.CPUStats {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ // Hack to get a pointer to the Kernel.
+ if tg.leader == nil {
+ // Per comment on tg.leader, this is only possible if nothing in the
+ // ThreadGroup has ever executed anyway.
+ return usage.CPUStats{}
+ }
+ return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
+}
+
+// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
+// must be locked.
+func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
+ stats := tg.exitedCPUStats
+ // Account for live tasks.
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ stats.Accumulate(t.cpuStatsAt(now))
+ }
+ return stats
+}
+
+// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
+// resource usage statistics for all children of [tg] that have terminated and
+// been waited for. These statistics will include the resources used by
+// grandchildren, and further removed descendants, if all of the intervening
+// descendants waited on their terminated children."
+func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ return tg.childCPUStats
+}
+
+// taskClock is a ktime.Clock that measures the time that a task has spent
+// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID.
+//
+// +stateify savable
+type taskClock struct {
+ t *Task
+
+ // If includeSys is true, the taskClock includes both time spent executing
+ // application code as well as time spent in the sentry. Otherwise, the
+ // taskClock includes only time spent executing application code.
+ includeSys bool
+
+ // Implements waiter.Waitable. TimeUntil wouldn't change its estimation
+ // based on either of the clock events, so there's no event to be
+ // notified for.
+ ktime.NoClockEvents `state:"nosave"`
+
+ // Implements ktime.Clock.WallTimeUntil.
+ //
+ // As an upper bound, a task's clock cannot advance faster than CPU
+ // time. It would have to execute at a rate of more than 1 task-second
+ // per 1 CPU-second, which isn't possible.
+ ktime.WallRateClock `state:"nosave"`
+}
+
+// UserCPUClock returns a clock measuring the CPU time the task has spent
+// executing application code.
+func (t *Task) UserCPUClock() ktime.Clock {
+ return &taskClock{t: t, includeSys: false}
+}
+
+// CPUClock returns a clock measuring the CPU time the task has spent executing
+// application and "kernel" code.
+func (t *Task) CPUClock() ktime.Clock {
+ return &taskClock{t: t, includeSys: true}
+}
+
+// Now implements ktime.Clock.Now.
+func (tc *taskClock) Now() ktime.Time {
+ stats := tc.t.CPUStats()
+ if tc.includeSys {
+ return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+ }
+ return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+// tgClock is a ktime.Clock that measures the time a thread group has spent
+// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID.
+//
+// +stateify savable
+type tgClock struct {
+ tg *ThreadGroup
+
+ // If includeSys is true, the tgClock includes both time spent executing
+ // application code as well as time spent in the sentry. Otherwise, the
+ // tgClock includes only time spent executing application code.
+ includeSys bool
+
+ // Implements waiter.Waitable.
+ ktime.ClockEventsQueue `state:"nosave"`
+}
+
+// Now implements ktime.Clock.Now.
+func (tgc *tgClock) Now() ktime.Time {
+ stats := tgc.tg.CPUStats()
+ if tgc.includeSys {
+ return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+ }
+ return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+// WallTimeUntil implements ktime.Clock.WallTimeUntil.
+func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
+ // Thread group CPU time should not exceed wall time * live tasks, since
+ // task goroutines exit after the transition to TaskExitZombie in
+ // runExitNotify.
+ tgc.tg.pidns.owner.mu.RLock()
+ n := tgc.tg.liveTasks
+ tgc.tg.pidns.owner.mu.RUnlock()
+ if n == 0 {
+ if t.Before(now) {
+ return 0
+ }
+ // The timer tick raced with thread group exit, after which no more
+ // tasks can enter the thread group. So tgc.Now() will never advance
+ // again. Return a large delay; the timer should be stopped long before
+ // it comes again anyway.
+ return time.Hour
+ }
+ // This is a lower bound on the amount of time that can elapse before an
+ // associated timer expires, so returning this value tends to result in a
+ // sequence of closely-spaced ticks just before timer expiry. To avoid
+ // this, round up to the nearest ClockTick; CPU usage measurements are
+ // limited to this resolution anyway.
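+ // For example, with a 10ms ClockTick and 2ms remaining, we report 10ms,
+ // so the timer fires once shortly after expiry rather than ticking
+ // repeatedly within the final tick.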
+ remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond
+ return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
+}
+
+// UserCPUClock returns a ktime.Clock that measures the time that a thread
+// group has spent executing application code.
+func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
+ return &tgClock{tg: tg, includeSys: false}
+}
+
+// CPUClock returns a ktime.Clock that measures the time that a thread group
+// has spent executing, including sentry time.
+func (tg *ThreadGroup) CPUClock() ktime.Clock {
+ return &tgClock{tg: tg, includeSys: true}
+}
+
+type kernelCPUClockTicker struct {
+ k *Kernel
+
+ // These are essentially kernelCPUClockTicker.Notify local variables that
+ // are cached between calls to reduce allocations.
+ rng *rand.Rand
+ tgs []*ThreadGroup
+}
+
+func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker {
+ return &kernelCPUClockTicker{
+ k: k,
+ rng: rand.New(rand.NewSource(rand.Int63())),
+ }
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (ticker *kernelCPUClockTicker) Notify(exp uint64) {
+ // Only increment cpuClock by 1 regardless of the number of expirations.
+ // This approximately compensates for cases where thread throttling or bad
+ // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and
+ // presumably task goroutines as well, from executing for a long period of
+ // time. It's also necessary to prevent CPU clocks from seeing large
+ // discontinuous jumps.
+ now := atomic.AddUint64(&ticker.k.cpuClock, 1)
+
+ // Check thread group CPU timers.
+ tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs)
+ for _, tg := range tgs {
+ if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 {
+ continue
+ }
+
+ ticker.k.tasks.mu.RLock()
+ if tg.leader == nil {
+ // No tasks have ever run in this thread group.
+ ticker.k.tasks.mu.RUnlock()
+ continue
+ }
+ // Accumulate thread group CPU stats, and randomly select running tasks
+ // using reservoir sampling to receive CPU timer signals.
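+ // (This is reservoir sampling with a reservoir of size one: the i-th
+ // candidate replaces the current selection with probability 1/i, so
+ // after the loop each of the N candidates was chosen with probability
+ // 1/N.)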
+ var virtReceiver *Task
+ nrVirtCandidates := 0
+ var profReceiver *Task
+ nrProfCandidates := 0
+ tgUserTime := tg.exitedCPUStats.UserTime
+ tgSysTime := tg.exitedCPUStats.SysTime
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ tsched := t.TaskGoroutineSchedInfo()
+ tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick))
+ tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick))
+ switch tsched.State {
+ case TaskGoroutineRunningApp:
+ // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU
+ // timers.
+ nrVirtCandidates++
+ if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 {
+ virtReceiver = t
+ }
+ fallthrough
+ case TaskGoroutineRunningSys:
+ // Considered by ITIMER_PROF and RLIMIT_CPU timers.
+ nrProfCandidates++
+ if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 {
+ profReceiver = t
+ }
+ }
+ }
+ tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds())
+ tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds())
+
+ // All of the following are standard (not real-time) signals, which are
+ // automatically deduplicated, so we ignore the number of expirations.
+ tg.signalHandlers.mu.Lock()
+ // It should only be possible for these timers to advance if we found
+ // at least one running task.
+ if virtReceiver != nil {
+ // ITIMER_VIRTUAL
+ newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow)
+ tg.itimerVirtSetting = newItimerVirtSetting
+ if exp != 0 {
+ virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true)
+ }
+ }
+ if profReceiver != nil {
+ // ITIMER_PROF
+ newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow)
+ tg.itimerProfSetting = newItimerProfSetting
+ if exp != 0 {
+ profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true)
+ }
+ // RLIMIT_CPU soft limit
+ newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow)
+ tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting
+ if exp != 0 {
+ profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true)
+ }
+ // RLIMIT_CPU hard limit
+ rlimitCPUMax := tg.limits.Get(limits.CPU).Max
+ if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) {
+ profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
+ }
+ }
+ tg.signalHandlers.mu.Unlock()
+
+ ticker.k.tasks.mu.RUnlock()
+ }
+
+ // Retain tgs between calls to Notify to reduce allocations.
+ for i := range tgs {
+ tgs[i] = nil
+ }
+ ticker.tgs = tgs[:0]
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (ticker *kernelCPUClockTicker) Destroy() {
+}
+
+// randInt31n returns a random integer in [0, n).
+//
+// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported.
+// See that function for details.
+func randInt31n(rng *rand.Rand, n int32) int32 {
+ v := rng.Uint32()
+ prod := uint64(v) * uint64(n)
+ low := uint32(prod)
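+ // prod is uniform over [0, n*2^32), so its high 32 bits are a value in
+ // [0, n). To remove bias, draws of v whose low 32 bits fall below
+ // 2^32 mod n (computed below as uint32(-n) % uint32(n)) are rejected,
+ // leaving exactly floor(2^32/n) accepted draws per result.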
+ if low < uint32(n) {
+ thresh := uint32(-n) % uint32(n)
+ for low < thresh {
+ v = rng.Uint32()
+ prod = uint64(v) * uint64(n)
+ low = uint32(prod)
+ }
+ }
+ return int32(prod >> 32)
+}
+
+// NotifyRlimitCPUUpdated is called by setrlimit.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) NotifyRlimitCPUUpdated() {
+ t.k.cpuClockTicker.Atomically(func() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ rlimitCPU := t.tg.limits.Get(limits.CPU)
+ t.tg.rlimitCPUSoftSetting = ktime.Setting{
+ Enabled: rlimitCPU.Cur != limits.Infinity,
+ Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()),
+ Period: time.Second,
+ }
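+ // When the soft limit is exceeded, Linux delivers SIGXCPU once per
+ // second until the hard limit is reached (see setrlimit(2)); the
+ // one-second period above reproduces that behavior.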
+ if rlimitCPU.Max != limits.Infinity {
+ // Check if tg is already over the hard limit.
+ tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow())
+ tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds())
+ if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) {
+ t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
+ }
+ }
+ t.tg.updateCPUTimersEnabledLocked()
+ })
+}
+
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) updateCPUTimersEnabledLocked() {
+ rlimitCPU := tg.limits.Get(limits.CPU)
+ if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity {
+ atomic.StoreUint32(&tg.cpuTimersEnabled, 1)
+ } else {
+ atomic.StoreUint32(&tg.cpuTimersEnabled, 0)
+ }
+}
+
+// StateStatus returns a string representation of the task's current state,
+// appropriate for /proc/[pid]/status.
+func (t *Task) StateStatus() string {
+ switch s := t.TaskGoroutineSchedInfo().State; s {
+ case TaskGoroutineNonexistent:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ switch t.exitState {
+ case TaskExitZombie:
+ return "Z (zombie)"
+ case TaskExitDead:
+ return "X (dead)"
+ default:
+ // The task goroutine can't exit before passing through
+ // runExitNotify, so this indicates that the task has been created,
+ // but the task goroutine hasn't yet started. The Linux equivalent
+ // is struct task_struct::state == TASK_NEW
+ // (kernel/fork.c:copy_process() =>
+ // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
+ // masked out by TASK_REPORT for /proc/[pid]/status, leaving only
+ // TASK_RUNNING.
+ return "R (running)"
+ }
+ case TaskGoroutineRunningSys, TaskGoroutineRunningApp:
+ return "R (running)"
+ case TaskGoroutineBlockedInterruptible:
+ return "S (sleeping)"
+ case TaskGoroutineStopped:
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ switch t.stop.(type) {
+ case *groupStop:
+ return "T (stopped)"
+ case *ptraceStop:
+ return "t (tracing stop)"
+ }
+ fallthrough
+ case TaskGoroutineBlockedUninterruptible:
+ // This is the name Linux uses for TASK_UNINTERRUPTIBLE and
+ // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
+ // fs/proc/array.c:task_state_array.
+ return "D (disk sleep)"
+ default:
+ panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
+ }
+}
+
+// CPUMask returns a copy of t's allowed CPU mask.
+func (t *Task) CPUMask() sched.CPUSet {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.allowedCPUMask.Copy()
+}
+
+// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
+// mask.
+//
+// Preconditions: mask.Size() ==
+// sched.CPUSetSize(t.Kernel().ApplicationCores()).
+func (t *Task) SetCPUMask(mask sched.CPUSet) error {
+ if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
+ panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
+ }
+
+ // Remove CPUs in mask above Kernel.applicationCores.
+ mask.ClearAbove(t.k.applicationCores)
+
+ // Ensure that at least 1 CPU is still allowed.
+ if mask.NumCPUs() == 0 {
+ return syserror.EINVAL
+ }
+
+ if t.k.useHostCores {
+ // No-op; pretend the mask was immediately changed back.
+ return nil
+ }
+
+ t.tg.pidns.owner.mu.RLock()
+ rootTID := t.tg.pidns.owner.Root.tids[t]
+ t.tg.pidns.owner.mu.RUnlock()
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.allowedCPUMask = mask
+ atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID))
+ return nil
+}
+
+// CPU returns the cpu id for a given task.
+func (t *Task) CPU() int32 {
+ if t.k.useHostCores {
+ return int32(hostcpu.GetCPU())
+ }
+
+ return atomic.LoadInt32(&t.cpu)
+}
+
+// assignCPU returns the virtualized CPU number for the task with global TID
+// tid and allowedCPUMask allowed.
+func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
+ // To pretend that threads are evenly distributed to allowed CPUs, choose n
+ // to be less than the number of CPUs in allowed ...
+ n := int(tid) % int(allowed.NumCPUs())
+ // ... then pick the nth CPU in allowed.
+ allowed.ForEachCPU(func(c uint) {
+ if n--; n == 0 {
+ cpu = int32(c)
+ }
+ })
+ return cpu
+}
+
+// Niceness returns t's niceness.
+func (t *Task) Niceness() int {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.niceness
+}
+
+// Priority returns t's priority.
+func (t *Task) Priority() int {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.niceness + 20
+}
+
+// SetNiceness sets t's niceness to n.
+func (t *Task) SetNiceness(n int) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.niceness = n
+}
+
+// NumaPolicy returns t's current numa policy.
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.numaPolicy, t.numaNodeMask
+}
+
+// SetNumaPolicy sets t's numa policy.
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.numaPolicy = policy
+ t.numaNodeMask = nodeMask
+}
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
new file mode 100644
index 000000000..654cf7525
--- /dev/null
+++ b/pkg/sentry/kernel/task_signals.go
@@ -0,0 +1,1110 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file defines the behavior of task signal handling.
+
+import (
+ "fmt"
+ "sync/atomic"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SignalAction is an internal signal action.
+type SignalAction int
+
+// Available signal actions.
+// Note that although we refer to the complete set internally,
+// the application is only capable of using the Default and
+// Ignore actions from the system call interface.
+const (
+ SignalActionTerm SignalAction = iota
+ SignalActionCore
+ SignalActionStop
+ SignalActionIgnore
+ SignalActionHandler
+)
+
+// Default signal handler actions. Note that for most signals
+// (except SIGKILL and SIGSTOP), these can be overridden by the app.
+var defaultActions = map[linux.Signal]SignalAction{
+ // POSIX.1-1990 standard.
+ linux.SIGHUP: SignalActionTerm,
+ linux.SIGINT: SignalActionTerm,
+ linux.SIGQUIT: SignalActionCore,
+ linux.SIGILL: SignalActionCore,
+ linux.SIGABRT: SignalActionCore,
+ linux.SIGFPE: SignalActionCore,
+ linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects
+ linux.SIGSEGV: SignalActionCore,
+ linux.SIGPIPE: SignalActionTerm,
+ linux.SIGALRM: SignalActionTerm,
+ linux.SIGTERM: SignalActionTerm,
+ linux.SIGUSR1: SignalActionTerm,
+ linux.SIGUSR2: SignalActionTerm,
+ linux.SIGCHLD: SignalActionIgnore,
+ linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects
+ linux.SIGSTOP: SignalActionStop,
+ linux.SIGTSTP: SignalActionStop,
+ linux.SIGTTIN: SignalActionStop,
+ linux.SIGTTOU: SignalActionStop,
+ // POSIX.1-2001 standard.
+ linux.SIGBUS: SignalActionCore,
+ linux.SIGPROF: SignalActionTerm,
+ linux.SIGSYS: SignalActionCore,
+ linux.SIGTRAP: SignalActionCore,
+ linux.SIGURG: SignalActionIgnore,
+ linux.SIGVTALRM: SignalActionTerm,
+ linux.SIGXCPU: SignalActionCore,
+ linux.SIGXFSZ: SignalActionCore,
+ // The rest on linux.
+ linux.SIGSTKFLT: SignalActionTerm,
+ linux.SIGIO: SignalActionTerm,
+ linux.SIGPWR: SignalActionTerm,
+ linux.SIGWINCH: SignalActionIgnore,
+}
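+
+// For example, with the default disposition, SIGTERM terminates the thread
+// group (SignalActionTerm) while SIGCHLD is discarded (SignalActionIgnore);
+// computeAction below resolves the action for a given signal and handler.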
+
+// computeAction figures out what to do given a signal number
+// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop,
+// and SIGKILL always results in a SignalActionTerm.
+// Signal 0 is always ignored as many programs use it for various internal functions
+// and don't expect it to do anything.
+//
+// In the event the signal is not one of these, act.Handler determines what
+// happens next.
+// If act.Handler is:
+// 0, the default action is taken;
+// 1, the signal is ignored;
+// anything else, the function returns SignalActionHandler.
+func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction {
+ switch sig {
+ case linux.SIGSTOP:
+ return SignalActionStop
+ case linux.SIGKILL:
+ return SignalActionTerm
+ case linux.Signal(0):
+ return SignalActionIgnore
+ }
+
+ switch act.Handler {
+ case arch.SignalActDefault:
+ return defaultActions[sig]
+ case arch.SignalActIgnore:
+ return SignalActionIgnore
+ default:
+ return SignalActionHandler
+ }
+}
+
+// UnblockableSignals contains the set of signals which cannot be blocked.
+var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP)
+
+// StopSignals is the set of signals whose default action is SignalActionStop.
+var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU)
+
+// dequeueSignalLocked returns a pending signal that is *not* included in mask.
+// If there are no pending unmasked signals, dequeueSignalLocked returns nil.
+//
+// Preconditions: t.tg.signalHandlers.mu must be locked.
+func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo {
+ if info := t.pendingSignals.dequeue(mask); info != nil {
+ return info
+ }
+ return t.tg.pendingSignals.dequeue(mask)
+}
+
+// discardSpecificLocked removes all instances of the given signal from all
+// signal queues in tg.
+//
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) {
+ tg.pendingSignals.discardSpecific(sig)
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ t.pendingSignals.discardSpecific(sig)
+ }
+}
+
+// PendingSignals returns the set of pending signals.
+func (t *Task) PendingSignals() linux.SignalSet {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet
+}
+
+// deliverSignal delivers the given signal and returns the following run state.
+func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState {
+ sigact := computeAction(linux.Signal(info.Signo), act)
+
+ if t.haveSyscallReturn {
+ if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ // Signals that are ignored, cause a thread group stop, or
+ // terminate the thread group do not interact with interrupted
+ // syscalls; in Linux terms, they are never returned to the signal
+ // handling path from get_signal => get_signal_to_deliver. The
+ // behavior of an interrupted syscall is determined by the first
+ // signal that is actually handled (by userspace).
+ if sigact == SignalActionHandler {
+ switch {
+ case sre == ERESTARTNOHAND:
+ fallthrough
+ case sre == ERESTART_RESTARTBLOCK:
+ fallthrough
+ case (sre == ERESTARTSYS && !act.IsRestart()):
+ t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
+ default:
+ t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
+ t.Arch().RestartSyscall()
+ }
+ }
+ }
+ }
+
+ switch sigact {
+ case SignalActionTerm, SignalActionCore:
+ // "Default action is to terminate the process." - signal(7)
+ t.Debugf("Signal %d: terminating thread group", info.Signo)
+
+ // Emit an event channel message related to this uncaught signal.
+ ucs := &ucspb.UncaughtSignal{
+ Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)),
+ Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())),
+ Registers: t.Arch().StateData().Proto(),
+ SignalNumber: info.Signo,
+ }
+
+ // Attach a fault address if appropriate.
+ switch linux.Signal(info.Signo) {
+ case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS:
+ ucs.FaultAddr = info.Addr()
+ }
+
+ eventchannel.Emit(ucs)
+
+ t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)})
+ return (*runExit)(nil)
+
+ case SignalActionStop:
+ // "Default action is to stop the process."
+ t.initiateGroupStop(info)
+
+ case SignalActionIgnore:
+ // "Default action is to ignore the signal."
+ t.Debugf("Signal %d: ignored", info.Signo)
+
+ case SignalActionHandler:
+ // Try to deliver the signal to the user-configured handler.
+ t.Debugf("Signal %d: delivering to handler", info.Signo)
+ if err := t.deliverSignalToHandler(info, act); err != nil {
+ // This is not a warning, it can occur during normal operation.
+ t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err)
+
+ // Send a forced SIGSEGV. If the signal that couldn't be delivered
+ // was a SIGSEGV, force the handler to SIG_DFL.
+ t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ }
+
+ default:
+ panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act)))
+ }
+ return (*runInterrupt)(nil)
+}
+
+// deliverSignalToHandler changes the task's userspace state to enter the given
+// user-configured handler for the given signal.
+func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error {
+ // Signal delivery to an application handler interrupts restartable
+ // sequences.
+ t.rseqInterrupt()
+
+ // Are we executing on the main stack,
+ // or the provided alternate stack?
+ sp := usermem.Addr(t.Arch().Stack())
+
+ // N.B. This is a *copy* of the alternate stack that the user's signal
+ // handler expects to see in its ucontext (even if it's not in use).
+ alt := t.signalStack
+ if act.IsOnStack() && alt.IsEnabled() {
+ alt.SetOnStack()
+ if !alt.Contains(sp) {
+ sp = usermem.Addr(alt.Top())
+ }
+ }
+
+ // Set up the signal handler. If we have a saved signal mask, the signal
+ // handler should run with the current mask, but sigreturn should restore
+ // the saved one.
+ st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
+ mask := t.signalMask
+ if t.haveSavedSignalMask {
+ mask = t.savedSignalMask
+ }
+ if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
+ return err
+ }
+ t.haveSavedSignalMask = false
+
+ // Add our signal mask.
+ newMask := t.signalMask | act.Mask
+ if !act.IsNoDefer() {
+ newMask |= linux.SignalSetOf(linux.Signal(info.Signo))
+ }
+ t.SetSignalMask(newMask)
+
+ return nil
+}
+
+var ctrlResume = &SyscallControl{ignoreReturn: true}
+
+// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if
+// rt is true).
+func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
+ st := t.Stack()
+ sigset, alt, err := t.Arch().SignalRestore(st, rt)
+ if err != nil {
+ return nil, err
+ }
+
+ // Attempt to record the given signal stack. Note that we silently
+ // ignore failures here, as does Linux. Only an EFAULT may be
+ // generated, but SignalRestore has already deserialized the entire
+ // frame successfully.
+ t.SetSignalStack(alt)
+
+ // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
+ t.SetSignalMask(sigset &^ UnblockableSignals)
+
+ return ctrlResume, nil
+}
+
+// Sigtimedwait implements the semantics of sigtimedwait(2).
+//
+// Preconditions: The caller must be running on the task goroutine. t.exitState
+// < TaskExitZombie.
+func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
+ // set is the set of signals we're interested in; invert it to get the set
+ // of signals to block.
+ mask := ^(set &^ UnblockableSignals)
+
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if info := t.dequeueSignalLocked(mask); info != nil {
+ return info, nil
+ }
+
+ if timeout == 0 {
+ return nil, syserror.EAGAIN
+ }
+
+ // Unblock signals we're waiting for. Remember the original signal mask so
+ // that Task.sendSignalTimerLocked doesn't discard ignored signals that
+ // we're temporarily unblocking.
+ t.realSignalMask = t.signalMask
+ t.setSignalMaskLocked(t.signalMask & mask)
+
+ // Wait for a timeout or new signal.
+ t.tg.signalHandlers.mu.Unlock()
+ _, err := t.BlockWithTimeout(nil, true, timeout)
+ t.tg.signalHandlers.mu.Lock()
+
+ // Restore the original signal mask.
+ t.setSignalMaskLocked(t.realSignalMask)
+ t.realSignalMask = 0
+
+ if info := t.dequeueSignalLocked(mask); info != nil {
+ return info, nil
+ }
+ if err == syserror.ETIMEDOUT {
+ return nil, syserror.EAGAIN
+ }
+ return nil, err
+}
+
+// SendSignal sends the given signal to t.
+//
+// The following errors may be returned:
+//
+// syserror.ESRCH - The task has exited.
+// syserror.EINVAL - The signal is not valid.
+// syserror.EAGAIN - The signal is realtime, and cannot be queued.
+//
+func (t *Task) SendSignal(info *arch.SignalInfo) error {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.sendSignalLocked(info, false /* group */)
+}
+
+// SendGroupSignal sends the given signal to t's thread group.
+func (t *Task) SendGroupSignal(info *arch.SignalInfo) error {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.sendSignalLocked(info, true /* group */)
+}
+
+// SendSignal sends the given signal to tg, using tg's leader to determine if
+// the signal is blocked.
+func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+ return tg.leader.sendSignalLocked(info, true /* group */)
+}
+
+func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error {
+ return t.sendSignalTimerLocked(info, group, nil)
+}
+
+func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error {
+ if t.exitState == TaskExitDead {
+ return syserror.ESRCH
+ }
+ sig := linux.Signal(info.Signo)
+ if sig == 0 {
+ return nil
+ }
+ if !sig.IsValid() {
+ return syserror.EINVAL
+ }
+
+ // Signal side effects apply even if the signal is ultimately discarded.
+ t.tg.applySignalSideEffectsLocked(sig)
+
+ // TODO: "Only signals for which the "init" process has established a
+ // signal handler can be sent to the "init" process by other members of the
+ // PID namespace. This restriction applies even to privileged processes,
+ // and prevents other members of the PID namespace from accidentally
+ // killing the "init" process." - pid_namespaces(7). We don't currently do
+ // this for child namespaces, though we should; we also don't do this for
+ // the root namespace (the same restriction applies to global init on
+ // Linux), where whether or not we should is much murkier. In practice,
+ // most sandboxed applications are not prepared to function as an init
+ // process.
+
+ // Unmasked, ignored signals are discarded without being queued, unless
+ // they will be visible to a tracer. Even for group signals, it's the
+ // originally-targeted task's signal mask and tracer that matter; compare
+ // Linux's kernel/signal.c:__send_signal() => prepare_signal() =>
+ // sig_ignored().
+ ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore
+ if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() {
+ t.Debugf("Discarding ignored signal %d", sig)
+ if timer != nil {
+ timer.signalRejectedLocked()
+ }
+ return nil
+ }
+
+ q := &t.pendingSignals
+ if group {
+ q = &t.tg.pendingSignals
+ }
+ if !q.enqueue(info, timer) {
+ if sig.IsRealtime() {
+ return syserror.EAGAIN
+ }
+ t.Debugf("Discarding duplicate signal %d", sig)
+ if timer != nil {
+ timer.signalRejectedLocked()
+ }
+ return nil
+ }
+
+ // Find a receiver to notify. Note that the task we choose to notify, if
+ // any, may not be the task that actually dequeues and handles the signal;
+ // e.g. a racing signal mask change may cause the notified task to become
+ // ineligible, or a racing sibling task may dequeue the signal first.
+ if t.canReceiveSignalLocked(sig) {
+ t.Debugf("Notified of signal %d", sig)
+ t.interrupt()
+ return nil
+ }
+ if group {
+ if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
+ nt.Debugf("Notified of group signal %d", sig)
+ nt.interrupt()
+ return nil
+ }
+ }
+ t.Debugf("No task notified of signal %d", sig)
+ return nil
+}
+
+func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) {
+ switch {
+ case linux.SignalSetOf(sig)&StopSignals != 0:
+ // Stop signals cause all prior SIGCONT to be discarded. (This is
+ // despite the fact that this has little effect, since SIGCONT's most
+ // important effect is applied when the signal is sent in the branch
+ // below, not when the signal is delivered.)
+ tg.discardSpecificLocked(linux.SIGCONT)
+ case sig == linux.SIGCONT:
+ // "The SIGCONT signal has a side effect of waking up (all threads of)
+ // a group-stopped process. This side effect happens before
+ // signal-delivery-stop. The tracer can't suppress this side effect (it
+ // can only suppress signal injection, which only causes the SIGCONT
+ // handler to not be executed in the tracee, if such a handler is
+ // installed." - ptrace(2)
+ tg.endGroupStopLocked(true)
+ case sig == linux.SIGKILL:
+ // "SIGKILL does not generate signal-delivery-stop and therefore the
+ // tracer can't suppress it. SIGKILL kills even within system calls
+ // (syscall-exit-stop is not generated prior to death by SIGKILL)." -
+ // ptrace(2)
+ //
+ // Note that this differs from ThreadGroup.requestExit in that it
+ // ignores tg.execing.
+ if !tg.exiting {
+ tg.exiting = true
+ tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)}
+ }
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ t.killLocked()
+ }
+ }
+}
+
+// canReceiveSignalLocked returns true if t should be interrupted to receive
+// the given signal. canReceiveSignalLocked is analogous to Linux's
+// kernel/signal.c:wants_signal(), but see below for divergences.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
+ // - Do not choose tasks that are blocking the signal.
+ if linux.SignalSetOf(sig)&t.signalMask != 0 {
+ return false
+ }
+ // - No need to check Task.exitState, as the exit path sets every bit in the
+ // signal mask when it transitions from TaskExitNone to TaskExitInitiated.
+ // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the
+ // task group via applySignalSideEffects => killLocked.
+ // - Do not choose stopped tasks, which cannot handle signals.
+ if t.stop != nil {
+ return false
+ }
+ // - TODO(b/38173783): No special case for when t is also the sending task,
+ // because the identity of the sender is unknown.
+ // - Do not choose tasks that have already been interrupted, as they may be
+ // busy handling another signal.
+ if len(t.interruptChan) != 0 {
+ return false
+ }
+ return true
+}
+
+// findSignalReceiverLocked returns a task in tg that should be interrupted to
+// receive the given signal. If no such task exists, findSignalReceiverLocked
+// returns nil.
+//
+// Linux actually records curr_target to balance the group signal targets.
+//
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if t.canReceiveSignalLocked(sig) {
+ return t
+ }
+ }
+ return nil
+}
+
+// forceSignal ensures that the task is not ignoring or blocking the given
+// signal. If unconditional is true, forceSignal takes action even if the
+// signal isn't being ignored or blocked.
+func (t *Task) forceSignal(sig linux.Signal, unconditional bool) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.forceSignalLocked(sig, unconditional)
+}
+
+func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) {
+ blocked := linux.SignalSetOf(sig)&t.signalMask != 0
+ act := t.tg.signalHandlers.actions[sig]
+ ignored := act.Handler == arch.SignalActIgnore
+ if blocked || ignored || unconditional {
+ act.Handler = arch.SignalActDefault
+ t.tg.signalHandlers.actions[sig] = act
+ if blocked {
+ t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig))
+ }
+ }
+}
+
+// SignalMask returns a copy of t's signal mask.
+func (t *Task) SignalMask() linux.SignalSet {
+ return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask)))
+}
+
+// SetSignalMask sets t's signal mask.
+//
+// Preconditions: SetSignalMask can only be called by the task goroutine.
+// t.exitState < TaskExitZombie.
+func (t *Task) SetSignalMask(mask linux.SignalSet) {
+ // By precondition, t prevents t.tg from completing an execve and mutating
+ // t.tg.signalHandlers, so we can skip the TaskSet mutex.
+ t.tg.signalHandlers.mu.Lock()
+ t.setSignalMaskLocked(mask)
+ t.tg.signalHandlers.mu.Unlock()
+}
+
+// Preconditions: The signal mutex must be locked.
+func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
+ oldMask := t.signalMask
+ atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask))
+
+ // If the new mask blocks any signals that were not blocked by the old
+ // mask, and at least one such signal is pending in tg.pendingSignals, and
+ // t has been woken, it could be the case that t was woken to handle that
+ // signal, but will no longer do so as a result of its new signal mask, so
+ // we have to pick a replacement.
+ blocked := mask &^ oldMask
+ blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet
+ if blockedGroupPending != 0 && t.interrupted() {
+ linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) {
+ if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
+ nt.interrupt()
+ return
+ }
+ })
+ // We have to re-issue the interrupt consumed by t.interrupted() since
+ // it might have been for a different reason.
+ t.interruptSelf()
+ }
+
+ // Conversely, if the new mask unblocks any signals that were blocked by
+ // the old mask, and at least one such signal is pending, we may now need
+ // to handle that signal.
+ unblocked := oldMask &^ mask
+ unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet)
+ if unblockedPending != 0 {
+ t.interruptSelf()
+ }
+}
+
+// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
+// comment).
+//
+// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
+ t.savedSignalMask = mask
+ t.haveSavedSignalMask = true
+}
+
+// SignalStack returns the task-private signal stack.
+func (t *Task) SignalStack() arch.SignalStack {
+ alt := t.signalStack
+ if t.onSignalStack(alt) {
+ alt.Flags |= arch.SignalStackFlagOnStack
+ }
+ return alt
+}
+
+// onSignalStack returns true if the task is executing on the given signal stack.
+func (t *Task) onSignalStack(alt arch.SignalStack) bool {
+ sp := usermem.Addr(t.Arch().Stack())
+ return alt.Contains(sp)
+}
+
+// SetSignalStack sets the task-private signal stack.
+//
+// This value may not be changed if the task is currently executing on the
+// signal stack, i.e. if t.onSignalStack returns true. In this case, this
+// function will return false. Otherwise, true is returned.
+func (t *Task) SetSignalStack(alt arch.SignalStack) bool {
+ // Check that we're not executing on the stack.
+ if t.onSignalStack(t.signalStack) {
+ return false
+ }
+
+ if alt.Flags&arch.SignalStackFlagDisable != 0 {
+ // Don't record anything beyond the flags.
+ t.signalStack = arch.SignalStack{
+ Flags: arch.SignalStackFlagDisable,
+ }
+ } else {
+ // Mask out irrelevant parts: only disable matters.
+ alt.Flags &= arch.SignalStackFlagDisable
+ t.signalStack = alt
+ }
+ return true
+}
+
+// SetSignalAct atomically sets the thread group's signal action for signal sig
+// to *actptr (if actptr is not nil) and returns the old signal action.
+func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) {
+ if !sig.IsValid() {
+ return arch.SignalAct{}, syserror.EINVAL
+ }
+
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ sh := tg.signalHandlers
+ sh.mu.Lock()
+ defer sh.mu.Unlock()
+ oldact := sh.actions[sig]
+ if actptr != nil {
+ if sig == linux.SIGKILL || sig == linux.SIGSTOP {
+ return oldact, syserror.EINVAL
+ }
+
+ act := *actptr
+ act.Mask &^= UnblockableSignals
+ sh.actions[sig] = act
+ // From POSIX, by way of Linux:
+ //
+ // "Setting a signal action to SIG_IGN for a signal that is pending
+ // shall cause the pending signal to be discarded, whether or not it is
+ // blocked."
+ //
+ // "Setting a signal action to SIG_DFL for a signal that is pending and
+ // whose default action is to ignore the signal (for example, SIGCHLD),
+ // shall cause the pending signal to be discarded, whether or not it is
+ // blocked."
+ if computeAction(sig, act) == SignalActionIgnore {
+ tg.discardSpecificLocked(sig)
+ }
+ }
+ return oldact, nil
+}
+
+// CopyOutSignalAct converts the given SignalAct into an architecture-specific
+// type and then copies it out to task memory.
+func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
+ n := t.Arch().NewSignalAct()
+ n.SerializeFrom(s)
+ _, err := t.CopyOut(addr, n)
+ return err
+}
+
+// CopyInSignalAct copies an architecture-specific sigaction type from task
+// memory and then converts it into a SignalAct.
+func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
+ n := t.Arch().NewSignalAct()
+ var s arch.SignalAct
+ if _, err := t.CopyIn(addr, n); err != nil {
+ return s, err
+ }
+ n.DeserializeTo(&s)
+ return s, nil
+}
+
+// CopyOutSignalStack converts the given SignalStack into an
+// architecture-specific type and then copies it out to task memory.
+func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
+ n := t.Arch().NewSignalStack()
+ n.SerializeFrom(s)
+ _, err := t.CopyOut(addr, n)
+ return err
+}
+
+// CopyInSignalStack copies an architecture-specific stack_t from task memory
+// and then converts it into a SignalStack.
+func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
+ n := t.Arch().NewSignalStack()
+ var s arch.SignalStack
+ if _, err := t.CopyIn(addr, n); err != nil {
+ return s, err
+ }
+ n.DeserializeTo(&s)
+ return s, nil
+}
+
+// groupStop is a TaskStop placed on tasks that have received a stop signal
+// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from
+// the ptrace man page.)
+//
+// +stateify savable
+type groupStop struct{}
+
+// Killable implements TaskStop.Killable.
+func (*groupStop) Killable() bool { return true }
+
+// initiateGroupStop attempts to initiate a group stop based on a
+// previously-dequeued stop signal.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) initiateGroupStop(info *arch.SignalInfo) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.groupStopPending {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo)
+ return
+ }
+ if !t.tg.groupStopDequeued {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo)
+ return
+ }
+ if t.tg.exiting {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo)
+ return
+ }
+ if t.tg.execing != nil {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo)
+ return
+ }
+ if !t.tg.groupStopComplete {
+ t.tg.groupStopSignal = linux.Signal(info.Signo)
+ }
+ t.tg.groupStopPendingCount = 0
+ for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() {
+ if t2.killedLocked() || t2.exitState >= TaskExitInitiated {
+ t2.groupStopPending = false
+ continue
+ }
+ t2.groupStopPending = true
+ t2.groupStopAcknowledged = false
+ if t2.ptraceSeized {
+ t2.trapNotifyPending = true
+ if s, ok := t2.stop.(*ptraceStop); ok && s.listen {
+ t2.endInternalStopLocked()
+ }
+ }
+ t2.interrupt()
+ t.tg.groupStopPendingCount++
+ }
+ t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount)
+}
+
+// endGroupStopLocked ensures that all prior stop signals received by tg are
+// not stopping tg and will not stop tg in the future. If broadcast is true,
+// parent and tracer notification will be scheduled if appropriate.
+//
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) {
+ // Discard all previously-queued stop signals.
+ linux.ForEachSignal(StopSignals, tg.discardSpecificLocked)
+
+ if tg.groupStopPendingCount == 0 && !tg.groupStopComplete {
+ return
+ }
+
+ completeStr := "incomplete"
+ if tg.groupStopComplete {
+ completeStr = "complete"
+ }
+ tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount)
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ t.groupStopPending = false
+ if t.ptraceSeized {
+ t.trapNotifyPending = true
+ if s, ok := t.stop.(*ptraceStop); ok && s.listen {
+ t.endInternalStopLocked()
+ }
+ } else {
+ if _, ok := t.stop.(*groupStop); ok {
+ t.endInternalStopLocked()
+ }
+ }
+ }
+ if broadcast {
+ // Instead of notifying the parent here, set groupContNotify so that
+ // one of the continuing tasks does so. (Linux does something similar.)
+ // The reason we do this is to keep locking sane. In order to send a
+ // signal to the parent, we need to lock its signal mutex, but we're
+ // already holding tg's signal mutex, and the TaskSet mutex must be
+ // locked for writing for us to hold two signal mutexes. Since we don't
+ // want to require this for endGroupStopLocked (which is called from
+ // signal-sending paths), nor do we want to lose atomicity by releasing
+ // the mutexes we're already holding, just let the continuing thread
+ // group deal with it.
+ tg.groupContNotify = true
+ tg.groupContInterrupted = !tg.groupStopComplete
+ tg.groupContWaitable = true
+ }
+ // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop
+ // to recognize that the group stop has been cancelled.
+ tg.groupStopDequeued = false
+ tg.groupStopSignal = 0
+ tg.groupStopPendingCount = 0
+ tg.groupStopComplete = false
+ tg.groupStopWaitable = false
+}
+
+// participateGroupStopLocked is called to handle thread group side effects
+// after t unsets t.groupStopPending. The caller must handle task side effects
+// (e.g. placing the task goroutine into the group stop). It returns true if
+// the caller must notify t.tg.leader's parent of a completed group stop (which
+// participateGroupStopLocked cannot do due to holding the wrong locks).
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) participateGroupStopLocked() bool {
+ if t.groupStopAcknowledged {
+ return false
+ }
+ t.groupStopAcknowledged = true
+ t.tg.groupStopPendingCount--
+ if t.tg.groupStopPendingCount != 0 {
+ return false
+ }
+ if t.tg.groupStopComplete {
+ return false
+ }
+ t.Debugf("Completing group stop")
+ t.tg.groupStopComplete = true
+ t.tg.groupStopWaitable = true
+ t.tg.groupContNotify = false
+ t.tg.groupContWaitable = false
+ return true
+}
+
+// signalStop sends a SIGCHLD to t's thread group, if appropriate, notifying
+// it of a group stop, group continue, or ptrace stop of target. code and
+// status are set in the signal sent, if any.
+//
+// Preconditions: The TaskSet mutex must be locked (for reading or writing).
+func (t *Task) signalStop(target *Task, code int32, status int32) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD]
+ if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) {
+ sigchld := &arch.SignalInfo{
+ Signo: int32(linux.SIGCHLD),
+ Code: code,
+ }
+ sigchld.SetPid(int32(t.tg.pidns.tids[target]))
+ sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ sigchld.SetStatus(status)
+ // TODO(b/72102453): Set utime, stime.
+ t.sendSignalLocked(sigchld, true /* group */)
+ }
+}
+
+// The runInterrupt state handles conditions indicated by interrupts.
+//
+// +stateify savable
+type runInterrupt struct{}
+
+func (*runInterrupt) execute(t *Task) taskRunState {
+ // Interrupts are de-duplicated (if t is interrupted twice before
+ // t.interrupted() is called, t.interrupted() will only return true once),
+ // so early exits from this function must re-enter the runInterrupt state
+ // to check for more interrupt-signaled conditions.
+
+ t.tg.signalHandlers.mu.Lock()
+
+ // Did we just leave a group stop?
+ if t.tg.groupContNotify {
+ t.tg.groupContNotify = false
+ sig := t.tg.groupStopSignal
+ intr := t.tg.groupContInterrupted
+ t.tg.signalHandlers.mu.Unlock()
+ t.tg.pidns.owner.mu.RLock()
+ // For consistency with Linux, if the parent and (thread group
+ // leader's) tracer are in the same thread group, deduplicate
+ // notifications.
+ notifyParent := t.tg.leader.parent != nil
+ if tracer := t.tg.leader.Tracer(); tracer != nil {
+ if notifyParent && tracer.tg == t.tg.leader.parent.tg {
+ notifyParent = false
+ }
+ // Sending CLD_STOPPED to the tracer doesn't really make any sense;
+ // the thread group leader may have already entered the stop and
+ // notified its tracer accordingly. But it's consistent with
+ // Linux...
+ if intr {
+ tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
+ if !notifyParent {
+ tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop)
+ } else {
+ tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop)
+ }
+ } else {
+ tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
+ tracer.tg.eventQueue.Notify(EventGroupContinue)
+ }
+ }
+ if notifyParent {
+ // If groupContInterrupted, do as Linux does and pretend the group
+ // stop completed just before it ended. The theoretical behavior in
+ // this case would be to send a SIGCHLD indicating the completed
+ // stop, followed by a SIGCHLD indicating the continue. However,
+ // SIGCHLD is a standard signal, so the latter would always be
+ // dropped. Hence sending only the former is equivalent.
+ if intr {
+ t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop)
+ } else {
+ t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue)
+ }
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+ return (*runInterrupt)(nil)
+ }
+
+ // Do we need to enter a group stop or related ptrace stop? This path is
+ // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop()
+ // (with ptrace enabled) and do_jobctl_trap().
+ if t.groupStopPending || t.trapStopPending || t.trapNotifyPending {
+ sig := t.tg.groupStopSignal
+ notifyParent := false
+ if t.groupStopPending {
+ t.groupStopPending = false
+ // We care about t.tg.groupStopSignal (for tracer notification)
+ // even if this doesn't complete a group stop, so keep the
+ // value of sig we've already read.
+ notifyParent = t.participateGroupStopLocked()
+ }
+ t.trapStopPending = false
+ t.trapNotifyPending = false
+ // Drop the signal mutex so we can take the TaskSet mutex.
+ t.tg.signalHandlers.mu.Unlock()
+
+ t.tg.pidns.owner.mu.RLock()
+ if t.tg.leader.parent == nil {
+ notifyParent = false
+ }
+ if tracer := t.Tracer(); tracer != nil {
+ if t.ptraceSeized {
+ if sig == 0 {
+ sig = linux.SIGTRAP
+ }
+ // "If tracee was attached using PTRACE_SEIZE, group-stop is
+ // indicated by PTRACE_EVENT_STOP: status>>16 ==
+ // PTRACE_EVENT_STOP. This allows detection of group-stops
+ // without requiring an extra PTRACE_GETSIGINFO call." -
+ // "Group-stop", ptrace(2)
+ t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8
+ t.ptraceSiginfo = &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: t.ptraceCode,
+ }
+ t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
+ t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ } else {
+ t.ptraceCode = int32(sig)
+ t.ptraceSiginfo = nil
+ }
+ if t.beginPtraceStopLocked() {
+ tracer.signalStop(t, arch.CLD_STOPPED, int32(sig))
+ // For consistency with Linux, if the parent and tracer are in the
+ // same thread group, deduplicate notification signals.
+ if notifyParent && tracer.tg == t.tg.leader.parent.tg {
+ notifyParent = false
+ tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop)
+ } else {
+ tracer.tg.eventQueue.Notify(EventTraceeStop)
+ }
+ }
+ } else {
+ t.tg.signalHandlers.mu.Lock()
+ if !t.killedLocked() {
+ t.beginInternalStopLocked((*groupStop)(nil))
+ }
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if notifyParent {
+ t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+
+ return (*runInterrupt)(nil)
+ }
+
+ // Are there signals pending?
+ if info := t.dequeueSignalLocked(t.signalMask); info != nil {
+ if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
+ // Indicate that we've dequeued a stop signal before unlocking the
+ // signal mutex; initiateGroupStop will check for races with
+ // endGroupStopLocked after relocking it.
+ t.tg.groupStopDequeued = true
+ }
+ if t.ptraceSignalLocked(info) {
+ // Dequeueing the signal action must wait until after the
+ // signal-delivery-stop ends since the tracer can change or
+ // suppress the signal.
+ t.tg.signalHandlers.mu.Unlock()
+ return (*runInterruptAfterSignalDeliveryStop)(nil)
+ }
+ act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
+ t.tg.signalHandlers.mu.Unlock()
+ return t.deliverSignal(info, act)
+ }
+
+ t.tg.signalHandlers.mu.Unlock()
+ return (*runApp)(nil)
+}
+
+// +stateify savable
+type runInterruptAfterSignalDeliveryStop struct{}
+
+func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
+ t.tg.pidns.owner.mu.Lock()
+ // Can't defer unlock: deliverSignal must be called without holding TaskSet
+ // mutex.
+ sig := linux.Signal(t.ptraceCode)
+ defer func() {
+ t.ptraceSiginfo = nil
+ }()
+ if !sig.IsValid() {
+ t.tg.pidns.owner.mu.Unlock()
+ return (*runInterrupt)(nil)
+ }
+ info := t.ptraceSiginfo
+ if sig != linux.Signal(info.Signo) {
+ info.Signo = int32(sig)
+ info.Errno = 0
+ info.Code = arch.SignalInfoUser
+ // pid isn't a valid field for all signal numbers, but Linux
+ // doesn't care (kernel/signal.c:ptrace_signal()).
+ //
+ // Linux uses t->parent for the tid and uid here, which is the tracer
+ // if it hasn't detached or the real parent otherwise.
+ parent := t.parent
+ if tracer := t.Tracer(); tracer != nil {
+ parent = tracer
+ }
+ if parent == nil {
+ // Tracer has detached and t was created by Kernel.CreateProcess().
+ // Pretend the parent is in an ancestor PID + user namespace.
+ info.SetPid(0)
+ info.SetUid(int32(auth.OverflowUID))
+ } else {
+ info.SetPid(int32(t.tg.pidns.tids[parent]))
+ info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ }
+ }
+ t.tg.signalHandlers.mu.Lock()
+ t.tg.pidns.owner.mu.Unlock()
+ // If the signal is masked, re-queue it.
+ if linux.SignalSetOf(sig)&t.signalMask != 0 {
+ t.sendSignalLocked(info, false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ return (*runInterrupt)(nil)
+ }
+ act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
+ t.tg.signalHandlers.mu.Unlock()
+ return t.deliverSignal(info, act)
+}
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
new file mode 100644
index 000000000..b42531e57
--- /dev/null
+++ b/pkg/sentry/kernel/task_start.go
@@ -0,0 +1,287 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TaskConfig defines the configuration of a new Task (see below).
+type TaskConfig struct {
+ // Kernel is the owning Kernel.
+ Kernel *Kernel
+
+ // Parent is the new task's parent. Parent may be nil.
+ Parent *Task
+
+ // If InheritParent is not nil, use InheritParent's parent as the new
+ // task's parent.
+ InheritParent *Task
+
+ // ThreadGroup is the ThreadGroup the new task belongs to.
+ ThreadGroup *ThreadGroup
+
+ // SignalMask is the new task's initial signal mask.
+ SignalMask linux.SignalSet
+
+ // TaskContext is the TaskContext of the new task. Ownership of the
+ // TaskContext is transferred to TaskSet.NewTask, whether or not it
+ // succeeds.
+ TaskContext *TaskContext
+
+ // FSContext is the FSContext of the new task. A reference must be held on
+ // FSContext, which is transferred to TaskSet.NewTask whether or not it
+ // succeeds.
+ FSContext *FSContext
+
+ // FDMap is the FDMap of the new task. A reference must be held on FDMap,
+ // which is transferred to TaskSet.NewTask whether or not it succeeds.
+ FDMap *FDMap
+
+ // Credentials is the Credentials of the new task.
+ Credentials *auth.Credentials
+
+ // Niceness is the niceness of the new task.
+ Niceness int
+
+ // If NetworkNamespaced is true, the new task should observe a non-root
+ // network namespace.
+ NetworkNamespaced bool
+
+ // AllowedCPUMask contains the cpus that this task can run on.
+ AllowedCPUMask sched.CPUSet
+
+ // UTSNamespace is the UTSNamespace of the new task.
+ UTSNamespace *UTSNamespace
+
+ // IPCNamespace is the IPCNamespace of the new task.
+ IPCNamespace *IPCNamespace
+
+ // AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
+ AbstractSocketNamespace *AbstractSocketNamespace
+
+ // ContainerID is the container the new task belongs to.
+ ContainerID string
+}
+
+// NewTask creates a new task defined by cfg.
+//
+// NewTask does not start the returned task; the caller must call Task.Start.
+func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
+ t, err := ts.newTask(cfg)
+ if err != nil {
+ cfg.TaskContext.release()
+ cfg.FSContext.DecRef()
+ cfg.FDMap.DecRef()
+ return nil, err
+ }
+ return t, nil
+}
+
+// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
+// of cfg if it succeeds.
+func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
+ tg := cfg.ThreadGroup
+ tc := cfg.TaskContext
+ t := &Task{
+ taskNode: taskNode{
+ tg: tg,
+ parent: cfg.Parent,
+ children: make(map[*Task]struct{}),
+ },
+ runState: (*runApp)(nil),
+ interruptChan: make(chan struct{}, 1),
+ signalMask: cfg.SignalMask,
+ signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+ tc: *tc,
+ fsc: cfg.FSContext,
+ fds: cfg.FDMap,
+ p: cfg.Kernel.Platform.NewContext(),
+ k: cfg.Kernel,
+ ptraceTracees: make(map[*Task]struct{}),
+ allowedCPUMask: cfg.AllowedCPUMask.Copy(),
+ ioUsage: &usage.IO{},
+ creds: cfg.Credentials,
+ niceness: cfg.Niceness,
+ netns: cfg.NetworkNamespaced,
+ utsns: cfg.UTSNamespace,
+ ipcns: cfg.IPCNamespace,
+ abstractSockets: cfg.AbstractSocketNamespace,
+ rseqCPU: -1,
+ futexWaiter: futex.NewWaiter(),
+ containerID: cfg.ContainerID,
+ }
+ t.endStopCond.L = &t.tg.signalHandlers.mu
+ t.ptraceTracer.Store((*Task)(nil))
+ // We don't construct t.blockingTimer until Task.run(); see that function
+ // for justification.
+
+ // Make the new task (and possibly thread group) visible to the rest of
+ // the system atomically.
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+ if tg.exiting || tg.execing != nil {
+ // If the caller is in the same thread group, then what we return
+ // doesn't matter too much since the caller will exit before it returns
+ // to userspace. If the caller isn't in the same thread group, then
+ // we're in uncharted territory and can return whatever we want.
+ return nil, syserror.EINTR
+ }
+ if err := ts.assignTIDsLocked(t); err != nil {
+ return nil, err
+ }
+ // Below this point, newTask is expected not to fail (there is no rollback
+ // of assignTIDsLocked or any of the following).
+
+ // Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
+ // This is the earliest point at which we can do so (since t now has thread
+ // IDs).
+ t.updateLogPrefixLocked()
+
+ if cfg.InheritParent != nil {
+ t.parent = cfg.InheritParent.parent
+ }
+ if t.parent != nil {
+ t.parent.children[t] = struct{}{}
+ }
+
+ if tg.leader == nil {
+ // New thread group.
+ tg.leader = t
+ if parentPG := tg.parentPG(); parentPG == nil {
+ tg.createSession()
+ } else {
+ // Inherit the process group.
+ parentPG.incRefWithParent(parentPG)
+ tg.processGroup = parentPG
+ }
+ }
+ tg.tasks.PushBack(t)
+ tg.tasksCount++
+ tg.liveTasks++
+ tg.activeTasks++
+
+ // Propagate external TaskSet stops to the new task.
+ t.stopCount = ts.stopCount
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t])
+
+ t.startTime = t.k.RealtimeClock().Now()
+
+ return t, nil
+}
+
+// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
+// which it should be visible.
+//
+// Preconditions: ts.mu must be locked for writing.
+func (ts *TaskSet) assignTIDsLocked(t *Task) error {
+ type allocatedTID struct {
+ ns *PIDNamespace
+ tid ThreadID
+ }
+ var allocatedTIDs []allocatedTID
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent {
+ tid, err := ns.allocateTID()
+ if err != nil {
+ // Failure. Remove the tids we already allocated in descendant
+ // namespaces.
+ for _, a := range allocatedTIDs {
+ delete(a.ns.tasks, a.tid)
+ delete(a.ns.tids, t)
+ if t.tg.leader == nil {
+ delete(a.ns.tgids, t.tg)
+ }
+ }
+ return err
+ }
+ ns.tasks[tid] = t
+ ns.tids[t] = tid
+ if t.tg.leader == nil {
+ // New thread group.
+ ns.tgids[t.tg] = tid
+ }
+ allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
+ }
+ return nil
+}
+
+// allocateTID returns an unused ThreadID from ns.
+//
+// Preconditions: ns.owner.mu must be locked for writing.
+func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
+ if ns.exiting {
+ // "In this case, a subsequent fork(2) into this PID namespace will
+ // fail with the error ENOMEM; it is not possible to create a new
+ // processes [sic] in a PID namespace whose init process has
+ // terminated." - pid_namespaces(7)
+ return 0, syserror.ENOMEM
+ }
+ tid := ns.last
+ for {
+ // Next.
+ tid++
+ if tid > TasksLimit {
+ tid = InitTID + 1
+ }
+
+ // Is it available?
+ _, ok := ns.tasks[tid]
+ if !ok {
+ ns.last = tid
+ return tid, nil
+ }
+
+ // Did we do a full cycle?
+ if tid == ns.last {
+ // No tid available.
+ return 0, syserror.EAGAIN
+ }
+ }
+}
+
+// Start starts the task goroutine. Start must be called exactly once for each
+// task returned by NewTask.
+//
+// 'tid' must be the task's TID in the root PID namespace; it is used for
+// debugging purposes only (it is passed to Task.run so that it appears in
+// stack dumps).
+func (t *Task) Start(tid ThreadID) {
+ // If the task was restored, it may be "starting" after having already exited.
+ if t.runState == nil {
+ return
+ }
+ t.goroutineStopped.Add(1)
+ t.tg.liveGoroutines.Add(1)
+ t.tg.pidns.owner.liveGoroutines.Add(1)
+ t.tg.pidns.owner.runningGoroutines.Add(1)
+
+ // Task is now running in system mode.
+ t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)
+
+ // Use the task's TID in the root PID namespace to make it visible in stack dumps.
+ go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
+}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
new file mode 100644
index 000000000..e735a5dd0
--- /dev/null
+++ b/pkg/sentry/kernel/task_stop.go
@@ -0,0 +1,226 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements task stops, which represent the equivalent of Linux's
+// uninterruptible sleep states in a way that is compatible with save/restore.
+// Task stops comprise both internal stops (which form part of the task's
+// "normal" control flow) and external stops (which do not); see README.md for
+// details.
+//
+// There are multiple interfaces for interacting with stops because there are
+// multiple cases to consider:
+//
+// - A task goroutine can begin a stop on its associated task (e.g. a
+// vfork() syscall stopping the calling task until the child task releases its
+// MM). In this case, calling Task.interrupt is both unnecessary (the task
+// goroutine obviously cannot be blocked in Task.block or executing application
+// code) and undesirable (as it may spuriously interrupt an in-progress
+// syscall).
+//
+// Beginning internal stops in this case is implemented by
+// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing,
+// there are no instances of this case that begin external stops, except for
+// autosave; however, autosave terminates the sentry without ending the
+// external stop, so the spurious interrupt is moot.
+//
+// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all
+// tasks being stopped in preparation for state checkpointing). If the task
+// goroutine may be in Task.block or executing application code, it must be
+// interrupted by Task.interrupt for it to actually enter the stop; since,
+// strictly speaking, we have no way of determining this, we call
+// Task.interrupt unconditionally.
+//
+// Beginning external stops in this case is implemented by
+// Task.BeginExternalStop. As of this writing, there are no instances of this
+// case that begin internal stops.
+//
+// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an
+// exiting task resuming a sibling task that has been blocked in an execve()
+// syscall waiting for other tasks to exit). In this case, Task.endStopCond
+// must be notified to kick the task goroutine out of Task.doStop.
+//
+// Ending internal stops in this case is implemented by
+// Task.endInternalStopLocked. Ending external stops in this case is
+// implemented by Task.EndExternalStop.
+//
+// - Hypothetically, a task goroutine can end an internal stop on its
+// associated task. As of this writing, there are no instances of this case.
+// However, any instances of this case could still use the above functions,
+// since notifying Task.endStopCond would be unnecessary but harmless.
+
+import (
+ "fmt"
+ "sync/atomic"
+)
+
+// A TaskStop is a condition visible to the task control flow graph that
+// prevents a task goroutine from running or exiting, i.e. an internal stop.
+//
+// NOTE(b/30793614): Most TaskStops don't contain any data; they're
+// distinguished by their type. The obvious way to implement such a TaskStop
+// is:
+//
+// type groupStop struct{}
+// func (groupStop) Killable() bool { return true }
+// ...
+// t.beginInternalStop(groupStop{})
+//
+// However, this doesn't work because the state package can't serialize values,
+// only pointers. Furthermore, the correctness of save/restore depends on the
+// ability to pass a TaskStop to endInternalStop that will compare equal to the
+// TaskStop that was passed to beginInternalStop, even if a save/restore cycle
+// occurred between the two. As a result, the current idiom is to always use a
+// typecast nil for data-free TaskStops:
+//
+// type groupStop struct{}
+// func (*groupStop) Killable() bool { return true }
+// ...
+// t.beginInternalStop((*groupStop)(nil))
+//
+// This is pretty gross, but the alternatives seem grosser.
+type TaskStop interface {
+ // Killable returns true if Task.Kill should end the stop prematurely.
+ // Killable is analogous to Linux's TASK_WAKEKILL.
+ Killable() bool
+}
+
+// beginInternalStop indicates the start of an internal stop that applies to t.
+//
+// Preconditions: The task must not already be in an internal stop (i.e. t.stop
+// == nil). The caller must be running on the task goroutine.
+func (t *Task) beginInternalStop(s TaskStop) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.beginInternalStopLocked(s)
+}
+
+// Preconditions: The signal mutex must be locked. All preconditions for
+// Task.beginInternalStop also apply.
+func (t *Task) beginInternalStopLocked(s TaskStop) {
+ if t.stop != nil {
+ panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
+ }
+ t.Debugf("Entering internal stop %#v", s)
+ t.stop = s
+ t.beginStopLocked()
+}
+
+// endInternalStopLocked indicates the end of an internal stop that applies to
+// t. endInternalStopLocked does not wait for the task to resume.
+//
+// The caller is responsible for ensuring that the internal stop they expect
+// actually applies to t; this requires holding the signal mutex which protects
+// t.stop, which is why there is no endInternalStop that locks the signal mutex
+// for you.
+//
+// Preconditions: The signal mutex must be locked. The task must be in an
+// internal stop (i.e. t.stop != nil).
+func (t *Task) endInternalStopLocked() {
+ if t.stop == nil {
+ panic("Attempting to leave non-existent internal stop")
+ }
+ t.Debugf("Leaving internal stop %#v", t.stop)
+ t.stop = nil
+ t.endStopLocked()
+}
+
+// BeginExternalStop indicates the start of an external stop that applies to t.
+// BeginExternalStop does not wait for t's task goroutine to stop.
+func (t *Task) BeginExternalStop() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.beginStopLocked()
+ t.interrupt()
+}
+
+// EndExternalStop indicates the end of an external stop started by a previous
+// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task
+// goroutine to resume.
+func (t *Task) EndExternalStop() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.endStopLocked()
+}
+
+// beginStopLocked increments t.stopCount to indicate that a new internal or
+// external stop applies to t.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) beginStopLocked() {
+ if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 {
+ // Most likely overflow.
+ panic(fmt.Sprintf("Invalid stopCount: %d", newval))
+ }
+}
+
+// endStopLocked decrements t.stopCount to indicate that an existing internal
+// or external stop no longer applies to t.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) endStopLocked() {
+ if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 {
+ panic(fmt.Sprintf("Invalid stopCount: %d", newval))
+ } else if newval == 0 {
+ t.endStopCond.Signal()
+ }
+}
+
+// BeginExternalStop indicates the start of an external stop that applies to
+// all current and future tasks in ts. BeginExternalStop does not wait for
+// task goroutines to stop.
+func (ts *TaskSet) BeginExternalStop() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ ts.stopCount++
+ if ts.stopCount <= 0 {
+ panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount))
+ }
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.tg.signalHandlers.mu.Lock()
+ t.beginStopLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ t.interrupt()
+ }
+}
+
+// EndExternalStop indicates the end of an external stop started by a previous
+// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task
+// goroutines to resume.
+func (ts *TaskSet) EndExternalStop() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ ts.stopCount--
+ if ts.stopCount < 0 {
+ panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount))
+ }
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.tg.signalHandlers.mu.Lock()
+ t.endStopLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ }
+}
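+
+// A hedged usage sketch: an external controller (for example, one preparing
+// for state checkpointing) would typically bracket its work with these
+// calls. BeginExternalStop does not itself wait, so a real caller would
+// also need to wait for running task goroutines to quiesce by some other
+// means:
+//
+//	ts.BeginExternalStop()
+//	defer ts.EndExternalStop()
+//	// ... wait for task goroutines to stop, then do the work ...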
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
new file mode 100644
index 000000000..a9283d0df
--- /dev/null
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -0,0 +1,447 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "os"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/metric"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SyscallRestartErrno represents an ERESTART* errno defined in the Linux kernel's
+// include/linux/errno.h. These errnos are never returned to userspace
+// directly, but are used to communicate the expected behavior of an
+// interrupted syscall from the syscall to signal handling.
+type SyscallRestartErrno int
+
+// These numeric values are significant because ptrace syscall exit tracing can
+// observe them.
+//
+// For all of the following errnos, if the syscall is not interrupted by a
+// signal delivered to a user handler, the syscall is restarted.
+const (
+ // ERESTARTSYS is returned by an interrupted syscall to indicate that it
+ // should be converted to EINTR if interrupted by a signal delivered to a
+ // user handler without SA_RESTART set, and restarted otherwise.
+ ERESTARTSYS = SyscallRestartErrno(512)
+
+ // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
+ // should always be restarted.
+ ERESTARTNOINTR = SyscallRestartErrno(513)
+
+ // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
+ // should be converted to EINTR if interrupted by a signal delivered to a
+ // user handler, and restarted otherwise.
+ ERESTARTNOHAND = SyscallRestartErrno(514)
+
+ // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
+ // that it should be restarted using a custom function. The interrupted
+ // syscall must register a custom restart function by calling
+ // Task.SetRestartSyscallFn.
+ ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
+)
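+
+// A hedged illustration (t.Block and syserror.ErrInterrupted are defined
+// elsewhere and are assumptions here): a blocking syscall implementation
+// typically maps an interrupted wait onto one of these errnos so that
+// signal delivery can decide whether to restart it:
+//
+//	if err := t.Block(ch); err == syserror.ErrInterrupted {
+//		// Restarted unless delivered to a handler without SA_RESTART.
+//		return 0, nil, ERESTARTSYS
+//	}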
+
+var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
+
+// Error implements error.Error.
+func (e SyscallRestartErrno) Error() string {
+ // Descriptions are borrowed from strace.
+ switch e {
+ case ERESTARTSYS:
+ return "to be restarted if SA_RESTART is set"
+ case ERESTARTNOINTR:
+ return "to be restarted"
+ case ERESTARTNOHAND:
+ return "to be restarted if no handler"
+ case ERESTART_RESTARTBLOCK:
+ return "interrupted by signal"
+ default:
+ return "(unknown interrupt error)"
+ }
+}
+
+// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
+// rv, the value in a syscall return register.
+func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
+ switch int(rv) {
+ case -int(ERESTARTSYS):
+ return ERESTARTSYS, true
+ case -int(ERESTARTNOINTR):
+ return ERESTARTNOINTR, true
+ case -int(ERESTARTNOHAND):
+ return ERESTARTNOHAND, true
+ case -int(ERESTART_RESTARTBLOCK):
+ return ERESTART_RESTARTBLOCK, true
+ default:
+ return 0, false
+ }
+}
+
+// SyscallRestartBlock represents the restart block for a syscall restartable
+// with a custom function. It encapsulates the state required to restart a
+// syscall across a save/restore cycle (S/R).
+type SyscallRestartBlock interface {
+ Restart(t *Task) (uintptr, error)
+}
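+
+// A hedged sketch (the type, field, and helper below are hypothetical, not
+// part of this change): a syscall with residual state, such as a poll with
+// a remaining timeout, might implement SyscallRestartBlock so the timeout
+// survives a restart:
+//
+//	type pollRestartBlock struct {
+//		timeout time.Duration
+//	}
+//
+//	func (p *pollRestartBlock) Restart(t *Task) (uintptr, error) {
+//		return doPoll(t, p.timeout) // hypothetical helper
+//	}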
+
+// SyscallControl is returned by syscalls to control the behavior of
+// Task.doSyscallInvoke.
+type SyscallControl struct {
+ // next is the state that the task goroutine should switch to. If next is
+ // nil, the task goroutine should continue to syscall exit as usual.
+ next taskRunState
+
+ // If ignoreReturn is true, Task.doSyscallInvoke should not store any value
+ // in the task's syscall return value register.
+ ignoreReturn bool
+}
+
+var (
+ // CtrlDoExit is returned by the implementations of the exit and exit_group
+ // syscalls to enter the task exit path directly, skipping syscall exit
+ // tracing.
+ CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}
+
+ // ctrlStopAndReinvokeSyscall is returned by syscalls using the external
+ // feature before syscall execution. This causes Task.doSyscallInvoke
+ // to return runSyscallReinvoke, allowing Task.run to check for stops
+ // before immediately re-invoking the syscall (skipping the re-checking
+ // of seccomp filters and ptrace which would confuse userspace
+ // tracing).
+ ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}
+
+ // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
+ // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
+ // than tail-calling it, allowing stops to be checked before syscall exit.
+ ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
+)
+
+func (t *Task) invokeExternal() {
+ t.BeginExternalStop()
+ go func() { // S/R-SAFE: External control flow.
+ defer t.EndExternalStop()
+ t.SyscallTable().External(t.Kernel())
+ }()
+}
+
+func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
+ s := t.SyscallTable()
+
+ fe := s.FeatureEnable.Word(sysno)
+
+ var straceContext interface{}
+ if bits.IsAnyOn32(fe, StraceEnableBits) {
+ straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
+ }
+
+ if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
+ t.invokeExternal()
+ // Ensure we check for stops, then invoke the syscall again.
+ ctrl = ctrlStopAndReinvokeSyscall
+ } else {
+ fn := s.Lookup(sysno)
+ if fn != nil {
+ // Call our syscall implementation.
+ rval, ctrl, err = fn(t, args)
+ } else {
+ // Use the missing function if not found.
+ rval, err = t.SyscallTable().Missing(t, sysno, args)
+ }
+ }
+
+ if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
+ t.invokeExternal()
+ // Don't reinvoke the syscall.
+ }
+
+ if bits.IsAnyOn32(fe, StraceEnableBits) {
+ s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
+ }
+
+ return
+}
+
+// doSyscall is the entry point for an invocation of a system call specified by
+// the current state of t's registers.
+//
+// The syscall path is very hot; avoid defer.
+func (t *Task) doSyscall() taskRunState {
+ sysno := t.Arch().SyscallNo()
+ args := t.Arch().SyscallArgs()
+
+ // Tracers expect to see this between when the task traps into the kernel
+ // to perform a syscall and when the syscall is actually invoked.
+ // This useless-looking temporary is needed because Go.
+ tmp := uintptr(syscall.ENOSYS)
+ t.Arch().SetReturn(-tmp)
+
+ // Check seccomp filters. The nil check is for performance (as seccomp use
+ // is rare), not needed for correctness.
+ if t.syscallFilters.Load() != nil {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r {
+ case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
+ t.Debugf("Syscall %d: denied by seccomp", sysno)
+ return (*runSyscallExit)(nil)
+ case linux.SECCOMP_RET_ALLOW:
+ // ok
+ case linux.SECCOMP_RET_KILL_THREAD:
+ t.Debugf("Syscall %d: killed by seccomp", sysno)
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ case linux.SECCOMP_RET_TRACE:
+ t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
+ return (*runSyscallAfterPtraceEventSeccomp)(nil)
+ default:
+ panic(fmt.Sprintf("Unknown seccomp result %d", r))
+ }
+ }
+
+ return t.doSyscallEnter(sysno, args)
+}
+
+type runSyscallAfterPtraceEventSeccomp struct{}
+
+func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
+ if t.killed() {
+ // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
+ // ptrace(2)
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // "The tracer can skip the system call by changing the syscall number to
+ // -1." - Documentation/prctl/seccomp_filter.txt
+ if sysno == ^uintptr(0) {
+ return (*runSyscallExit)(nil).execute(t)
+ }
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallEnter(sysno, args)
+}
+
+func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
+ if next, ok := t.ptraceSyscallEnter(); ok {
+ return next
+ }
+ return t.doSyscallInvoke(sysno, args)
+}
+
+// +stateify savable
+type runSyscallAfterSyscallEnterStop struct{}
+
+func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
+ if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
+ t.tg.signalHandlers.mu.Lock()
+ t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ if sysno == ^uintptr(0) {
+ return (*runSyscallExit)(nil)
+ }
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallInvoke(sysno, args)
+}
+
+// +stateify savable
+type runSyscallAfterSysemuStop struct{}
+
+func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
+ if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
+ t.tg.signalHandlers.mu.Lock()
+ t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ return (*runSyscallExit)(nil).execute(t)
+}
+
+func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
+ rval, ctrl, err := t.executeSyscall(sysno, args)
+
+ if ctrl != nil {
+ if !ctrl.ignoreReturn {
+ t.Arch().SetReturn(rval)
+ }
+ if ctrl.next != nil {
+ return ctrl.next
+ }
+ } else if err != nil {
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ t.haveSyscallReturn = true
+ } else {
+ t.Arch().SetReturn(rval)
+ }
+
+ return (*runSyscallExit)(nil).execute(t)
+}
+
+// +stateify savable
+type runSyscallReinvoke struct{}
+
+func (*runSyscallReinvoke) execute(t *Task) taskRunState {
+ if t.killed() {
+ // It's possible that since the last execution, the task has
+		// been forcibly killed. Invoking the system call here could
+ // result in an infinite loop if it is again preempted by an
+ // external stop and reinvoked.
+ return (*runInterrupt)(nil)
+ }
+
+ sysno := t.Arch().SyscallNo()
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallInvoke(sysno, args)
+}
+
+// +stateify savable
+type runSyscallExit struct{}
+
+func (*runSyscallExit) execute(t *Task) taskRunState {
+ t.ptraceSyscallExit()
+ return (*runApp)(nil)
+}
+
+// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
+// indicated by an execution fault at address addr. doVsyscall returns the
+// task's next run state.
+func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
+ vsyscallCount.Increment()
+
+ // Grab the caller up front, to make sure there's a sensible stack.
+ caller := t.Arch().Native(uintptr(0))
+ if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return (*runApp)(nil)
+ }
+
+ // For _vsyscalls_, there is no need to translate System V calling convention
+ // to syscall ABI because they both use RDI, RSI, and RDX for the first three
+ // arguments and none of the vsyscalls uses more than two arguments.
+ args := t.Arch().SyscallArgs()
+ if t.syscallFilters.Load() != nil {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
+ case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
+ t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
+ return (*runApp)(nil)
+ case linux.SECCOMP_RET_ALLOW:
+ // ok
+ case linux.SECCOMP_RET_TRACE:
+ t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
+ return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
+ case linux.SECCOMP_RET_KILL_THREAD:
+ t.Debugf("vsyscall %d: killed by seccomp", sysno)
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ default:
+ panic(fmt.Sprintf("Unknown seccomp result %d", r))
+ }
+ }
+
+ return t.doVsyscallInvoke(sysno, args, caller)
+}
+
+type runVsyscallAfterPtraceEventSeccomp struct {
+ addr usermem.Addr
+ sysno uintptr
+ caller interface{}
+}
+
+func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // "... the syscall may not be changed to another system call using the
+ // orig_rax register. It may only be changed to -1 order [sic] to skip the
+ // currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
+ // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
+ // causes do_exit(SIGSYS), and changing sp is ignored.
+ if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr {
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ }
+ if sysno == ^uintptr(0) {
+ return (*runApp)(nil)
+ }
+ return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
+}
+
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+ rval, ctrl, err := t.executeSyscall(sysno, args)
+ if ctrl != nil {
+ t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
+ // Set the return value. The stack has already been adjusted.
+ t.Arch().SetReturn(0)
+ } else if err == nil {
+ t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
+ // Set the return value. The stack has already been adjusted.
+ t.Arch().SetReturn(uintptr(rval))
+ } else {
+ t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
+ if err == syserror.EFAULT {
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ // A return is not emulated in this case.
+ return (*runApp)(nil)
+ }
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ }
+ t.Arch().SetIP(t.Arch().Value(caller))
+ t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
+ return (*runApp)(nil)
+}
+
+// ExtractErrno extracts an integer error number from the error.
+// The syscall number is purely for context in the error case. Use -1 if the
+// syscall number is unknown.
+func (t *Task) ExtractErrno(err error, sysno int) int {
+ switch err := err.(type) {
+ case nil:
+ return 0
+ case syscall.Errno:
+ return int(err)
+ case SyscallRestartErrno:
+ return int(err)
+ case *memmap.BusError:
+ // Bus errors may generate SIGBUS, but for syscalls they still
+ // return EFAULT. See case in task_run.go where the fault is
+ // handled (and the SIGBUS is delivered).
+ return int(syscall.EFAULT)
+ case *os.PathError:
+ return t.ExtractErrno(err.Err, sysno)
+ case *os.LinkError:
+ return t.ExtractErrno(err.Err, sysno)
+ case *os.SyscallError:
+ return t.ExtractErrno(err.Err, sysno)
+ default:
+ if errno, ok := syserror.TranslateError(err); ok {
+ return int(errno)
+ }
+ }
+ panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
+}
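+
+// A hedged illustration of the unwrapping above: a *os.PathError carrying
+// ENOENT reduces to the underlying errno value:
+//
+//	err := &os.PathError{Op: "open", Path: "/nope", Err: syscall.ENOENT}
+//	errno := t.ExtractErrno(err, -1) // == int(syscall.ENOENT)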
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
new file mode 100644
index 000000000..461bd7316
--- /dev/null
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -0,0 +1,301 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "math"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// MAX_RW_COUNT is the maximum size in bytes of a single read or write.
+// Reads and writes that exceed this size may be silently truncated.
+// (Linux: include/linux/fs.h:MAX_RW_COUNT)
+var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
+
+// Activate ensures that the task has an active address space.
+func (t *Task) Activate() {
+ if mm := t.MemoryManager(); mm != nil {
+ if err := mm.Activate(); err != nil {
+ panic("unable to activate mm: " + err.Error())
+ }
+ }
+}
+
+// Deactivate relinquishes the task's active address space.
+func (t *Task) Deactivate() {
+ if mm := t.MemoryManager(); mm != nil {
+ mm.Deactivate()
+ }
+}
+
+// CopyIn copies a fixed-size value or slice of fixed-size values in from the
+// task's memory. The copy will fail with syscall.EFAULT if it traverses user
+// memory that is unmapped or not readable by the user.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
+ return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyInBytes is a fast version of CopyIn if the caller can serialize the
+// data without reflection and pass in a byte slice.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyOut copies a fixed-size value or slice of fixed-size values out to the
+// task's memory. The copy will fail with syscall.EFAULT if it traverses user
+// memory that is unmapped or not writeable by the user.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
+ return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
+// data without reflection and pass in a byte slice.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
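+
+// A hedged usage sketch: round-tripping a fixed-size value through task
+// memory (assuming addr is a mapped, writable application address):
+//
+//	var v int32
+//	if _, err := t.CopyIn(addr, &v); err != nil {
+//		return err
+//	}
+//	v++
+//	if _, err := t.CopyOut(addr, &v); err != nil {
+//		return err
+//	}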
+
+// CopyInString copies a NUL-terminated string of length at most maxlen in from
+// the task's memory. The copy will fail with syscall.EFAULT if it traverses
+// user memory that is unmapped or not readable by the user.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) {
+ return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyInVector copies a NULL-terminated vector of strings from the task's
+// memory. The copy will fail with syscall.EFAULT if it traverses
+// user memory that is unmapped or not readable by the user.
+//
+// maxElemSize is the maximum size of each individual element.
+//
+// maxTotalSize is the maximum total length of all elements plus the total
+// number of elements. For example, the following strings correspond to
+// the following set of sizes:
+//
+// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements)
+// { "abc" } => 4 (3 for length, 1 for elements)
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) {
+ var v []string
+ for {
+ argAddr := t.Arch().Native(0)
+ if _, err := t.CopyIn(addr, argAddr); err != nil {
+ return v, err
+ }
+ if t.Arch().Value(argAddr) == 0 {
+ break
+ }
+		// Each string's terminating NUL byte is counted, so copying in a string
+ // requires at least one byte of space. Also, see the calculation below.
+ if maxTotalSize <= 0 {
+ return nil, syserror.ENOMEM
+ }
+ thisMax := maxElemSize
+ if maxTotalSize < thisMax {
+ thisMax = maxTotalSize
+ }
+ arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax)
+ if err != nil {
+ return v, err
+ }
+ v = append(v, arg)
+ addr += usermem.Addr(t.Arch().Width())
+ maxTotalSize -= len(arg) + 1
+ }
+ return v, nil
+}
+
+// CopyOutIovecs converts src to an array of struct iovecs and copies it to the
+// memory mapped at addr.
+//
+// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
+// task goroutine. t's AddressSpace must be active.
+func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
+ switch t.Arch().Width() {
+ case 8:
+ const itemLen = 16
+ if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok {
+ return syserror.EFAULT
+ }
+
+ b := t.CopyScratchBuffer(itemLen)
+ for ; !src.IsEmpty(); src = src.Tail() {
+ ar := src.Head()
+ usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start))
+ usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length()))
+ if _, err := t.CopyOutBytes(addr, b); err != nil {
+ return err
+ }
+ addr += itemLen
+ }
+
+ default:
+ return syserror.ENOSYS
+ }
+
+ return nil
+}
+
+// CopyInIovecs copies an array of numIovecs struct iovecs from the memory
+// mapped at addr, converts them to usermem.AddrRanges, and returns them as a
+// usermem.AddrRangeSeq.
+//
+// CopyInIovecs shares the following properties with Linux's
+// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector():
+//
+// - If the length of any AddrRange would exceed the range of an ssize_t,
+// CopyInIovecs returns EINVAL.
+//
+// - If the length of any AddrRange would cause its end to overflow,
+// CopyInIovecs returns EFAULT.
+//
+// - If any AddrRange would include addresses outside the application address
+// range, CopyInIovecs returns EFAULT.
+//
+// - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the
+// combined length of all AddrRanges would otherwise exceed this amount, ranges
+// beyond MAX_RW_COUNT are silently truncated.
+//
+// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
+// task goroutine. t's AddressSpace must be active.
+func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
+ if numIovecs == 0 {
+ return usermem.AddrRangeSeq{}, nil
+ }
+
+ var dst []usermem.AddrRange
+ if numIovecs > 1 {
+ dst = make([]usermem.AddrRange, 0, numIovecs)
+ }
+
+ switch t.Arch().Width() {
+ case 8:
+ const itemLen = 16
+ if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok {
+ return usermem.AddrRangeSeq{}, syserror.EFAULT
+ }
+
+ b := t.CopyScratchBuffer(itemLen)
+ for i := 0; i < numIovecs; i++ {
+ if _, err := t.CopyInBytes(addr, b); err != nil {
+ return usermem.AddrRangeSeq{}, err
+ }
+
+ base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8]))
+ length := usermem.ByteOrder.Uint64(b[8:16])
+ if length > math.MaxInt64 {
+ return usermem.AddrRangeSeq{}, syserror.EINVAL
+ }
+ ar, ok := t.MemoryManager().CheckIORange(base, int64(length))
+ if !ok {
+ return usermem.AddrRangeSeq{}, syserror.EFAULT
+ }
+
+ if numIovecs == 1 {
+ // Special case to avoid allocating dst.
+ return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil
+ }
+ dst = append(dst, ar)
+
+ addr += itemLen
+ }
+
+ default:
+ return usermem.AddrRangeSeq{}, syserror.ENOSYS
+ }
+
+ // Truncate to MAX_RW_COUNT.
+ var total uint64
+ for i := range dst {
+ dstlen := uint64(dst[i].Length())
+ if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen {
+ dst[i].End -= usermem.Addr(dstlen - rem)
+ dstlen = rem
+ }
+ total += dstlen
+ }
+
+ return usermem.AddrRangeSeqFromSlice(dst), nil
+}
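The truncation step above clips ranges once the running total reaches MAX_RW_COUNT rather than returning an error. A standalone sketch of the same arithmetic over plain (start, end) pairs; addrRange and the limit argument are stand-ins for the usermem types and MAX_RW_COUNT:

package main

import "fmt"

// addrRange is a stand-in for usermem.AddrRange: [Start, End).
type addrRange struct{ Start, End uint64 }

func (ar addrRange) length() uint64 { return ar.End - ar.Start }

// truncateToMax mirrors the loop at the end of CopyInIovecs: once the running
// total reaches max, later ranges are clipped, possibly to zero length.
func truncateToMax(ranges []addrRange, max uint64) {
	var total uint64
	for i := range ranges {
		l := ranges[i].length()
		if rem := max - total; rem < l {
			ranges[i].End -= l - rem
			l = rem
		}
		total += l
	}
}

func main() {
	rs := []addrRange{{0, 100}, {200, 300}, {400, 500}}
	truncateToMax(rs, 150)
	fmt.Println(rs) // second range clipped to 50 bytes, third to zero
}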
+
+// SingleIOSequence returns a usermem.IOSequence representing [addr,
+// addr+length) in t's address space. If this contains addresses outside the
+// application address range, it returns EFAULT. If length exceeds
+// MAX_RW_COUNT, the range is silently truncated.
+//
+// SingleIOSequence is analogous to Linux's
+// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and
+// write syscalls in Linux do not use import_single_range(). However they check
+// access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address
+// ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().)
+func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) {
+ if length > MAX_RW_COUNT {
+ length = MAX_RW_COUNT
+ }
+ ar, ok := t.MemoryManager().CheckIORange(addr, int64(length))
+ if !ok {
+ return usermem.IOSequence{}, syserror.EFAULT
+ }
+ return usermem.IOSequence{
+ IO: t.MemoryManager(),
+ Addrs: usermem.AddrRangeSeqOf(ar),
+ Opts: opts,
+ }, nil
+}
+
+// IovecsIOSequence returns a usermem.IOSequence representing the array of
+// iovcnt struct iovecs at addr in t's address space. opts applies to the
+// returned IOSequence, not the reading of the struct iovec array.
+//
+// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
+//
+// Preconditions: As for Task.CopyInIovecs.
+func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
+ if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+ ars, err := t.CopyInIovecs(addr, iovcnt)
+ if err != nil {
+ return usermem.IOSequence{}, err
+ }
+ return usermem.IOSequence{
+ IO: t.MemoryManager(),
+ Addrs: ars,
+ Opts: opts,
+ }, nil
+}
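A hedged sketch of how a vectorized I/O syscall might use IovecsIOSequence to build its destination sequence. The usermem.IOOpts field shown (AddressSpaceActive) is assumed from its use elsewhere in the sentry, and the helper is illustrative rather than part of this change:

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// readvDst builds the destination IOSequence for a readv-style syscall whose
// iovec array is at addr with iovcnt entries.
func readvDst(t *kernel.Task, addr usermem.Addr, iovcnt int) (usermem.IOSequence, error) {
	return t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
		AddressSpaceActive: true, // assumed field name; the task's AddressSpace is active
	})
}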
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
new file mode 100644
index 000000000..8bd53928e
--- /dev/null
+++ b/pkg/sentry/kernel/thread_group.go
@@ -0,0 +1,330 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// A ThreadGroup is a logical grouping of tasks that has widespread
+// significance to other kernel features (e.g. signal handling). ("Thread
+// groups" are usually called "processes" in userspace documentation.)
+//
+// ThreadGroup is a superset of Linux's struct signal_struct.
+//
+// +stateify savable
+type ThreadGroup struct {
+ threadGroupNode
+
+ // signalHandlers is the set of signal handlers used by every task in this
+ // thread group. (signalHandlers may also be shared with other thread
+ // groups.)
+ //
+ // signalHandlers.mu (hereafter "the signal mutex") protects state related
+ // to signal handling, as well as state that usually needs to be atomic
+ // with signal handling, for all ThreadGroups and Tasks using
+ // signalHandlers. (This is analogous to Linux's use of struct
+ // sighand_struct::siglock.)
+ //
+ // The signalHandlers pointer can only be mutated during an execve
+ // (Task.finishExec). Consequently, when it's possible for a task in the
+ // thread group to be completing an execve, signalHandlers is protected by
+ // the owning TaskSet.mu. Otherwise, it is possible to read the
+ // signalHandlers pointer without synchronization. In particular,
+ // completing an execve requires that all other tasks in the thread group
+ // have exited, so task goroutines do not need the owning TaskSet.mu to
+ // read the signalHandlers pointer of their thread groups.
+ signalHandlers *SignalHandlers
+
+ // pendingSignals is the set of pending signals that may be handled by any
+ // task in this thread group.
+ //
+ // pendingSignals is protected by the signal mutex.
+ pendingSignals pendingSignals
+
+ // If groupStopDequeued is true, a task in the thread group has dequeued a
+ // stop signal, but has not yet initiated the group stop.
+ //
+ // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED.
+ //
+ // groupStopDequeued is protected by the signal mutex.
+ groupStopDequeued bool
+
+ // groupStopSignal is the signal that caused a group stop to be initiated.
+ //
+ // groupStopSignal is protected by the signal mutex.
+ groupStopSignal linux.Signal
+
+ // groupStopPendingCount is the number of active tasks in the thread group
+ // for which Task.groupStopPending is set.
+ //
+ // groupStopPendingCount is analogous to Linux's
+ // signal_struct::group_stop_count.
+ //
+ // groupStopPendingCount is protected by the signal mutex.
+ groupStopPendingCount int
+
+ // If groupStopComplete is true, groupStopPendingCount transitioned from
+ // non-zero to zero without an intervening SIGCONT.
+ //
+ // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED.
+ //
+ // groupStopComplete is protected by the signal mutex.
+ groupStopComplete bool
+
+ // If groupStopWaitable is true, the thread group is indicating a waitable
+ // group stop event (as defined by EventChildGroupStop).
+ //
+ // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set
+ // and group_exit_code being non-zero.
+ //
+ // groupStopWaitable is protected by the signal mutex.
+ groupStopWaitable bool
+
+ // If groupContNotify is true, then a SIGCONT has recently ended a group
+ // stop on this thread group, and the first task to observe it should
+ // notify its parent. groupContInterrupted is true iff SIGCONT ended an
+ // incomplete group stop. If groupContNotify is false, groupContInterrupted is
+ // meaningless.
+ //
+ // Analogues in Linux:
+ //
+ // - groupContNotify && groupContInterrupted is represented by
+ // SIGNAL_CLD_STOPPED.
+ //
+ // - groupContNotify && !groupContInterrupted is represented by
+ // SIGNAL_CLD_CONTINUED.
+ //
+ // - !groupContNotify is represented by neither flag being set.
+ //
+ // groupContNotify and groupContInterrupted are protected by the signal
+ // mutex.
+ groupContNotify bool
+ groupContInterrupted bool
+
+ // If groupContWaitable is true, the thread group is indicating a waitable
+ // continue event (as defined by EventGroupContinue).
+ //
+ // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED.
+ //
+ // groupContWaitable is protected by the signal mutex.
+ groupContWaitable bool
+
+ // exiting is true if all tasks in the ThreadGroup should exit. exiting is
+ // analogous to Linux's SIGNAL_GROUP_EXIT.
+ //
+ // exiting is protected by the signal mutex. exiting can only transition
+ // from false to true.
+ exiting bool
+
+ // exitStatus is the thread group's exit status.
+ //
+ // While exiting is false, exitStatus is protected by the signal mutex.
+ // When exiting becomes true, exitStatus becomes immutable.
+ exitStatus ExitStatus
+
+ // terminationSignal is the signal that this thread group's leader will
+ // send to its parent when it exits.
+ //
+ // terminationSignal is protected by the TaskSet mutex.
+ terminationSignal linux.Signal
+
+ // liveGoroutines is the number of non-exited task goroutines in the thread
+ // group.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ timerMu sync.Mutex `state:"nosave"`
+
+ // itimerRealTimer implements ITIMER_REAL for the thread group.
+ itimerRealTimer *ktime.Timer
+
+ // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group.
+ //
+ // itimerVirtSetting is protected by the signal mutex.
+ itimerVirtSetting ktime.Setting
+
+ // itimerProfSetting is the ITIMER_PROF setting for the thread group.
+ //
+ // itimerProfSetting is protected by the signal mutex.
+ itimerProfSetting ktime.Setting
+
+ // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit
+ // notifications for the thread group.
+ //
+ // rlimitCPUSoftSetting is protected by the signal mutex.
+ rlimitCPUSoftSetting ktime.Setting
+
+ // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true,
+ // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true,
+ // or limits.Get(CPU) is finite.
+ //
+ // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is
+ // accessed using atomic memory operations.
+ cpuTimersEnabled uint32
+
+ // timers is the thread group's POSIX interval timers. nextTimerID is the
+ // TimerID at which allocation should begin searching for an unused ID.
+ //
+ // timers and nextTimerID are protected by timerMu.
+ timers map[linux.TimerID]*IntervalTimer
+ nextTimerID linux.TimerID
+
+ // exitedCPUStats is the CPU usage for all exited tasks in the thread
+ // group. exitedCPUStats is protected by the TaskSet mutex.
+ exitedCPUStats usage.CPUStats
+
+ // childCPUStats is the CPU usage of all joined descendants of this thread
+ // group. childCPUStats is protected by the TaskSet mutex.
+ childCPUStats usage.CPUStats
+
+ // ioUsage is the I/O usage for all exited tasks in the thread group.
+ // The ioUsage pointer is immutable.
+ ioUsage *usage.IO
+
+ // maxRSS is the historical maximum resident set size of the thread
+ // group, updated when:
+ //
+ // - A task in the thread group exits, since after all tasks have
+ // exited the MemoryManager is no longer reachable.
+ //
+ // - The thread group completes an execve, since this changes
+ // MemoryManagers.
+ //
+ // maxRSS is protected by the TaskSet mutex.
+ maxRSS uint64
+
+ // childMaxRSS is the maximum resident set size in bytes of all joined
+ // descendants of this thread group.
+ //
+ // childMaxRSS is protected by the TaskSet mutex.
+ childMaxRSS uint64
+
+ // Resource limits for this ThreadGroup. The limits pointer is immutable.
+ limits *limits.LimitSet
+
+ // processGroup is the processGroup for this thread group.
+ //
+ // processGroup is protected by the TaskSet mutex.
+ processGroup *ProcessGroup
+
+ // execed indicates an exec has occurred since creation. This will be
+ // set by finishExec, and new ThreadGroups will have this field cleared.
+ // When execed is set, the processGroup may no longer be changed.
+ //
+ // execed is protected by the TaskSet mutex.
+ execed bool
+
+ // rscr is the thread group's RSEQ critical region.
+ rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+}
+
+// newThreadGroup returns a new, empty thread group in PID namespace ns. The
+// thread group leader will send its parent terminationSignal when it exits.
+// The new thread group isn't visible to the system until a task has been
+// created inside of it by a successful call to TaskSet.NewTask.
+func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+ tg := &ThreadGroup{
+ threadGroupNode: threadGroupNode{
+ pidns: ns,
+ },
+ signalHandlers: sh,
+ terminationSignal: terminationSignal,
+ ioUsage: &usage.IO{},
+ limits: limits,
+ }
+ tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
+ tg.timers = make(map[linux.TimerID]*IntervalTimer)
+ tg.rscr.Store(&RSEQCriticalRegion{})
+ return tg
+}
+
+// saveRscr is invoked by stateify.
+func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
+ return tg.rscr.Load().(*RSEQCriticalRegion)
+}
+
+// loadRscr is invoked by stateify.
+func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
+ tg.rscr.Store(rscr)
+}
+
+// SignalHandlers returns the signal handlers used by tg.
+//
+// Preconditions: The caller must provide the synchronization required to read
+// tg.signalHandlers, as described in the field's comment.
+func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
+ return tg.signalHandlers
+}
+
+// Limits returns tg's limits.
+func (tg *ThreadGroup) Limits() *limits.LimitSet {
+ return tg.limits
+}
+
+// release releases the thread group's resources.
+func (tg *ThreadGroup) release() {
+ // Timers must be destroyed without holding the TaskSet or signal mutexes
+ // since timers send signals with Timer.mu locked.
+ tg.itimerRealTimer.Destroy()
+ var its []*IntervalTimer
+ tg.pidns.owner.mu.Lock()
+ tg.signalHandlers.mu.Lock()
+ for _, it := range tg.timers {
+ its = append(its, it)
+ }
+ tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved
+ tg.signalHandlers.mu.Unlock()
+ tg.pidns.owner.mu.Unlock()
+ for _, it := range its {
+ it.DestroyTimer()
+ }
+}
+
+// forEachChildThreadGroupLocked iterates over all child ThreadGroups.
+//
+// Precondition: TaskSet.mu must be held.
+func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ for child := range t.children {
+ if child == child.tg.leader {
+ fn(child.tg)
+ }
+ }
+ }
+}
+
+// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
+//
+// +stateify savable
+type itimerRealListener struct {
+ tg *ThreadGroup
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (l *itimerRealListener) Notify(exp uint64) {
+ l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM))
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (l *itimerRealListener) Destroy() {
+}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
new file mode 100644
index 000000000..656bbd46c
--- /dev/null
+++ b/pkg/sentry/kernel/threads.go
@@ -0,0 +1,465 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// TasksLimit is the maximum number of threads for an untrusted application.
+// Linux doesn't really limit this directly, rather it is limited by total
+// memory size, stacks allocated and a global maximum. There's no real reason
+// for us to limit it either (especially since threads are backed by
+// goroutines), and we would expect to hit resource limits long before hitting
+// this number. However, for correctness, we still check that the user doesn't
+// exceed this number.
+//
+// Note that because of the way futexes are implemented, there *are* in fact
+// serious restrictions on valid thread IDs. They are limited to 2^30 - 1
+// (kernel/fork.c:MAX_THREADS).
+const TasksLimit = (1 << 16)
+
+// ThreadID is a generic thread identifier.
+type ThreadID int32
+
+// String returns a decimal representation of the ThreadID.
+func (tid ThreadID) String() string {
+ return fmt.Sprintf("%d", tid)
+}
+
+// InitTID is the TID given to the first task added to each PID namespace. The
+// thread group led by InitTID is called the namespace's init process. The
+// death of a PID namespace's init process causes all tasks visible in that
+// namespace to be killed.
+const InitTID ThreadID = 1
+
+// A TaskSet comprises all tasks in a system.
+//
+// +stateify savable
+type TaskSet struct {
+ // mu protects all relationships between tasks and thread groups in the
+ // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
+ mu sync.RWMutex `state:"nosave"`
+
+ // Root is the root PID namespace, in which all tasks in the TaskSet are
+ // visible. The Root pointer is immutable.
+ Root *PIDNamespace
+
+ // sessions is the set of all sessions.
+ sessions sessionList
+
+ // stopCount is the number of active external stops applicable to all tasks
+ // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
+ // paired with a call to TaskSet.EndExternalStop). stopCount is protected
+ // by mu.
+ //
+ // stopCount is not saved for the same reason as Task.stopCount; it is
+ // always reset to zero after restore.
+ stopCount int32 `state:"nosave"`
+
+ // liveGoroutines is the number of non-exited task goroutines in the
+ // TaskSet.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ // runningGoroutines is the number of running task goroutines in the
+ // TaskSet.
+ //
+ // runningGoroutines is not saved; its counter value is required to be zero
+ // at time of save (but note that this is not necessarily the same thing as
+ // sync.WaitGroup's zero value).
+ runningGoroutines sync.WaitGroup `state:"nosave"`
+}
+
+// newTaskSet returns a new, empty TaskSet.
+func newTaskSet() *TaskSet {
+ ts := &TaskSet{}
+ ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace())
+ return ts
+}
+
+// forEachThreadGroupLocked applies f to each thread group in ts.
+//
+// Preconditions: ts.mu must be locked (for reading or writing).
+func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
+ for tg := range ts.Root.tgids {
+ f(tg)
+ }
+}
+
+// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
+// tasks. See the pid_namespaces(7) man page for further details.
+//
+// N.B. A task is said to be visible in a PID namespace if the PID namespace
+// contains a thread ID that maps to that task.
+//
+// +stateify savable
+type PIDNamespace struct {
+ // owner is the TaskSet that this PID namespace belongs to. The owner
+ // pointer is immutable.
+ owner *TaskSet
+
+ // parent is the PID namespace of the process that created this one. If
+ // this is the root PID namespace, parent is nil. The parent pointer is
+ // immutable.
+ //
+ // Invariant: All tasks that are visible in this namespace are also visible
+ // in all ancestor namespaces.
+ parent *PIDNamespace
+
+ // userns is the user namespace with which this PID namespace is
+ // associated. Privileged operations on this PID namespace must have
+ // appropriate capabilities in userns. The userns pointer is immutable.
+ userns *auth.UserNamespace
+
+ // The following fields are protected by owner.mu.
+
+ // last is the last ThreadID to be allocated in this namespace.
+ last ThreadID
+
+ // tasks is a mapping from ThreadIDs in this namespace to tasks visible in
+ // the namespace.
+ tasks map[ThreadID]*Task
+
+ // tids is a mapping from tasks visible in this namespace to their
+ // identifiers in this namespace.
+ tids map[*Task]ThreadID
+
+ // tgids is a mapping from thread groups visible in this namespace to
+ // their identifiers in this namespace.
+ //
+ // The content of tgids is equivalent to tids[tg.leader]. This exists
+ // primarily as an optimization to quickly find all thread groups.
+ tgids map[*ThreadGroup]ThreadID
+
+ // sessions is a mapping from SessionIDs in this namespace to sessions
+ // visible in the namespace.
+ sessions map[SessionID]*Session
+
+ // sids is a mapping from sessions visible in this namespace to their
+ // identifiers in this namespace.
+ sids map[*Session]SessionID
+
+ // processGroups is a mapping from ProcessGroupIDs in this namespace to
+ // process groups visible in the namespace.
+ processGroups map[ProcessGroupID]*ProcessGroup
+
+ // pgids is a mapping from process groups visible in this namespace to
+ // their identifiers in this namespace.
+ pgids map[*ProcessGroup]ProcessGroupID
+
+ // exiting indicates that the namespace's init process is exiting or has
+ // exited.
+ exiting bool
+}
+
+func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
+ return &PIDNamespace{
+ owner: ts,
+ parent: parent,
+ userns: userns,
+ tasks: make(map[ThreadID]*Task),
+ tids: make(map[*Task]ThreadID),
+ tgids: make(map[*ThreadGroup]ThreadID),
+ sessions: make(map[SessionID]*Session),
+ sids: make(map[*Session]SessionID),
+ processGroups: make(map[ProcessGroupID]*ProcessGroup),
+ pgids: make(map[*ProcessGroup]ProcessGroupID),
+ }
+}
+
+// NewChild returns a new, empty PID namespace that is a child of ns. Authority
+// over the new PID namespace is controlled by userns.
+func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
+ return newPIDNamespace(ns.owner, ns, userns)
+}
+
+// TaskWithID returns the task with thread ID tid in PID namespace ns. If no
+// task has that TID, TaskWithID returns nil.
+func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
+ ns.owner.mu.RLock()
+ t := ns.tasks[tid]
+ ns.owner.mu.RUnlock()
+ return t
+}
+
+// ThreadGroupWithID returns the thread group led by the task with thread ID
+// tid in PID namespace ns. If no task has that TID, or if the task with that
+// TID is not a thread group leader, ThreadGroupWithID returns nil.
+func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ t := ns.tasks[tid]
+ if t == nil {
+ return nil
+ }
+ if t != t.tg.leader {
+ return nil
+ }
+ return t.tg
+}
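A small sketch of using the lookup methods above to resolve a TID to its thread group leader; the helper name is illustrative:

package example

import "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"

// leaderFor resolves tid to its thread group leader in ns, returning nil if
// tid does not name a thread group leader visible in ns.
func leaderFor(ns *kernel.PIDNamespace, tid kernel.ThreadID) *kernel.Task {
	tg := ns.ThreadGroupWithID(tid)
	if tg == nil {
		return nil
	}
	return tg.Leader()
}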
+
+// IDOfTask returns the TID assigned to the given task in PID namespace ns. If
+// the task is not visible in that namespace, IDOfTask returns 0. (This return
+// value is significant in some cases, e.g. getppid() is documented as
+// returning 0 if the caller's parent is in an ancestor namespace and
+// consequently not visible to the caller.) If the task is nil, IDOfTask returns
+// 0.
+func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
+ ns.owner.mu.RLock()
+ id := ns.tids[t]
+ ns.owner.mu.RUnlock()
+ return id
+}
+
+// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
+// If the task is not visible in that namespace, IDOfThreadGroup returns 0.
+func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
+ ns.owner.mu.RLock()
+ id := ns.tgids[tg]
+ ns.owner.mu.RUnlock()
+ return id
+}
+
+// Tasks returns a snapshot of the tasks in ns.
+func (ns *PIDNamespace) Tasks() []*Task {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ tasks := make([]*Task, 0, len(ns.tasks))
+ for t := range ns.tids {
+ tasks = append(tasks, t)
+ }
+ return tasks
+}
+
+// ThreadGroups returns a snapshot of the thread groups in ns.
+func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
+ return ns.ThreadGroupsAppend(nil)
+}
+
+// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
+func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ for tg := range ns.tgids {
+ tgs = append(tgs, tg)
+ }
+ return tgs
+}
+
+// UserNamespace returns the user namespace associated with PID namespace ns.
+func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
+ return ns.userns
+}
+
+// A threadGroupNode defines the relationship between a thread group and the
+// rest of the system. Conceptually, threadGroupNode is data belonging to the
+// owning TaskSet, as if TaskSet contained a field `nodes
+// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
+// threadGroupNode is embedded in the ThreadGroup it represents.
+// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
+// threadGroupEntry's methods on ThreadGroup to make it implement
+// threadGroupLinker.)
+//
+// +stateify savable
+type threadGroupNode struct {
+ // pidns is the PID namespace containing the thread group and all of its
+ // member tasks. The pidns pointer is immutable.
+ pidns *PIDNamespace
+
+ // eventQueue is notified whenever an event of interest to Task.Wait occurs
+ // in a child of this thread group, or a ptrace tracee of a task in this
+ // thread group. Events are defined in task_exit.go.
+ //
+ // Note that we cannot check and save this wait queue similarly to other
+ // wait queues, as the queue will not be empty by the time of saving, due
+ // to the wait sourced from Exec().
+ eventQueue waiter.Queue `state:"nosave"`
+
+ // leader is the thread group's leader, which is the oldest task in the
+ // thread group; usually the last task in the thread group to call
+ // execve(), or if no such task exists then the first task in the thread
+ // group, which was created by a call to fork() or clone() without
+ // CLONE_THREAD. Once a thread group has been made visible to the rest of
+ // the system by TaskSet.newTask, leader is never nil.
+ //
+ // Note that it's possible for the leader to exit without causing the rest
+ // of the thread group to exit; in such a case, leader will still be valid
+ // and non-nil, but leader will not be in tasks.
+ //
+ // leader is protected by the TaskSet mutex.
+ leader *Task
+
+ // If execing is not nil, it is a task in the thread group that has killed
+ // all other tasks so that it can become the thread group leader and
+ // perform an execve. (execing may already be the thread group leader.)
+ //
+ // execing is analogous to Linux's signal_struct::group_exit_task.
+ //
+ // execing is protected by the TaskSet mutex.
+ execing *Task
+
+ // tasks is all tasks in the thread group that have not yet been reaped.
+ //
+ // tasks is protected by both the TaskSet mutex and the signal mutex:
+ // Mutating tasks requires locking the TaskSet mutex for writing *and*
+ // locking the signal mutex. Reading tasks requires locking the TaskSet
+ // mutex *or* locking the signal mutex.
+ tasks taskList
+
+ // tasksCount is the number of tasks in the thread group that have not yet
+ // been reaped; equivalently, tasksCount is the number of tasks in tasks.
+ //
+ // tasksCount is protected by both the TaskSet mutex and the signal mutex,
+ // as with tasks.
+ tasksCount int
+
+ // liveTasks is the number of tasks in the thread group that have not yet
+ // reached TaskExitZombie.
+ //
+ // liveTasks is protected by the TaskSet mutex (NOT the signal mutex).
+ liveTasks int
+
+ // activeTasks is the number of tasks in the thread group that have not yet
+ // reached TaskExitInitiated.
+ //
+ // activeTasks is protected by both the TaskSet mutex and the signal mutex,
+ // as with tasks.
+ activeTasks int
+}
+
+// PIDNamespace returns the PID namespace containing tg.
+func (tg *ThreadGroup) PIDNamespace() *PIDNamespace {
+ return tg.pidns
+}
+
+// TaskSet returns the TaskSet containing tg.
+func (tg *ThreadGroup) TaskSet() *TaskSet {
+ return tg.pidns.owner
+}
+
+// Leader returns tg's leader.
+func (tg *ThreadGroup) Leader() *Task {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ return tg.leader
+}
+
+// Count returns the number of non-exited threads in the group.
+func (tg *ThreadGroup) Count() int {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ var count int
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ count++
+ }
+ return count
+}
+
+// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for
+// all tasks in tg.
+func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+
+ var tasks []ThreadID
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if id, ok := pidns.tids[t]; ok {
+ tasks = append(tasks, id)
+ }
+ }
+ return tasks
+}
+
+// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader
+// is dead, ID returns 0.
+func (tg *ThreadGroup) ID() ThreadID {
+ tg.pidns.owner.mu.RLock()
+ id := tg.pidns.tgids[tg]
+ tg.pidns.owner.mu.RUnlock()
+ return id
+}
+
+// A taskNode defines the relationship between a task and the rest of the
+// system. The comments on threadGroupNode also apply to taskNode.
+//
+// +stateify savable
+type taskNode struct {
+ // tg is the thread group that this task belongs to. The tg pointer is
+ // immutable.
+ tg *ThreadGroup `state:"wait"`
+
+ // taskEntry links into tg.tasks. Note that this means that
+ // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread
+ // group. See threadGroupNode.tasks for synchronization info.
+ taskEntry
+
+ // parent is the task's parent. parent may be nil.
+ //
+ // parent is protected by the TaskSet mutex.
+ parent *Task
+
+ // children is this task's children.
+ //
+ // children is protected by the TaskSet mutex.
+ children map[*Task]struct{}
+
+ // If childPIDNamespace is not nil, all new tasks created by this task will
+ // be members of childPIDNamespace rather than this one. (As a corollary,
+ // this task becomes unable to create sibling tasks in the same thread
+ // group.)
+ //
+ // childPIDNamespace is exclusive to the task goroutine.
+ childPIDNamespace *PIDNamespace
+}
+
+// ThreadGroup returns the thread group containing t.
+func (t *Task) ThreadGroup() *ThreadGroup {
+ return t.tg
+}
+
+// PIDNamespace returns the PID namespace containing t.
+func (t *Task) PIDNamespace() *PIDNamespace {
+ return t.tg.pidns
+}
+
+// TaskSet returns the TaskSet containing t.
+func (t *Task) TaskSet() *TaskSet {
+ return t.tg.pidns.owner
+}
+
+// Timekeeper returns the system Timekeeper.
+func (t *Task) Timekeeper() *Timekeeper {
+ return t.k.timekeeper
+}
+
+// Parent returns t's parent.
+func (t *Task) Parent() *Task {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ return t.parent
+}
+
+// ThreadID returns t's thread ID in its own PID namespace. If the task is
+// dead, ThreadID returns 0.
+func (t *Task) ThreadID() ThreadID {
+ return t.tg.pidns.IDOfTask(t)
+}
diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go
new file mode 100644
index 000000000..c0660d362
--- /dev/null
+++ b/pkg/sentry/kernel/time/context.go
@@ -0,0 +1,44 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the time package's type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxRealtimeClock is a Context.Value key for the current real time.
+ CtxRealtimeClock contextID = iota
+)
+
+// RealtimeClockFromContext returns the real time clock associated with context
+// ctx.
+func RealtimeClockFromContext(ctx context.Context) Clock {
+ if v := ctx.Value(CtxRealtimeClock); v != nil {
+ return v.(Clock)
+ }
+ return nil
+}
+
+// NowFromContext returns the current real time associated with context ctx.
+func NowFromContext(ctx context.Context) Time {
+ if clk := RealtimeClockFromContext(ctx); clk != nil {
+ return clk.Now()
+ }
+ panic("encountered context without RealtimeClock")
+}
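The two helpers above follow the usual context-value pattern: a private key type prevents collisions, and absence of the value is handled explicitly. The same pattern is shown below with the standard library's context package purely as an illustration; the real code uses the sentry's own context type:

package main

import (
	"context"
	"fmt"
	"time"
)

type contextID int

const (
	ctxRealtimeClock contextID = iota
)

// clock is a stand-in for this package's Clock interface.
type clock interface{ Now() time.Time }

type sysClock struct{}

func (sysClock) Now() time.Time { return time.Now() }

// realtimeClockFromContext mirrors RealtimeClockFromContext: it returns the
// stored clock, or nil if the context carries none.
func realtimeClockFromContext(ctx context.Context) clock {
	if v := ctx.Value(ctxRealtimeClock); v != nil {
		return v.(clock)
	}
	return nil
}

func main() {
	ctx := context.WithValue(context.Background(), ctxRealtimeClock, sysClock{})
	fmt.Println(realtimeClockFromContext(ctx).Now())
}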
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
new file mode 100644
index 000000000..3846cf1ea
--- /dev/null
+++ b/pkg/sentry/kernel/time/time.go
@@ -0,0 +1,691 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package time defines the Timer type, which provides a periodic timer that
+// works by sampling a user-provided clock.
+package time
+
+import (
+ "fmt"
+ "math"
+ "sync"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Events that may be generated by a Clock.
+const (
+ // ClockEventSet occurs when a Clock undergoes a discontinuous change.
+ ClockEventSet waiter.EventMask = 1 << iota
+
+ // ClockEventRateIncrease occurs when the rate at which a Clock advances
+ // increases significantly, such that values returned by previous calls to
+ // Clock.WallTimeUntil may be too large.
+ ClockEventRateIncrease
+)
+
+// Time represents an instant in time with nanosecond precision.
+//
+// Time may represent time with respect to any clock and may not have any
+// meaning in the real world.
+//
+// +stateify savable
+type Time struct {
+ ns int64
+}
+
+var (
+ // MinTime is the zero time instant, the lowest possible time that can
+ // be represented by Time.
+ MinTime = Time{ns: math.MinInt64}
+
+ // MaxTime is the highest possible time that can be represented by
+ // Time.
+ MaxTime = Time{ns: math.MaxInt64}
+
+ // ZeroTime represents the zero time in an unspecified Clock's domain.
+ ZeroTime = Time{ns: 0}
+)
+
+const (
+ // MinDuration is the minimum duration representable by time.Duration.
+ MinDuration = time.Duration(math.MinInt64)
+
+ // MaxDuration is the maximum duration representable by time.Duration.
+ MaxDuration = time.Duration(math.MaxInt64)
+)
+
+// FromNanoseconds returns a Time representing the point ns nanoseconds after
+// an unspecified Clock's zero time.
+func FromNanoseconds(ns int64) Time {
+ return Time{ns}
+}
+
+// FromSeconds returns a Time representing the point s seconds after an
+// unspecified Clock's zero time.
+func FromSeconds(s int64) Time {
+ if s > math.MaxInt64/time.Second.Nanoseconds() {
+ return MaxTime
+ }
+ return Time{s * 1e9}
+}
+
+// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real
+// time Unix clock domain.
+func FromUnix(s int64, ns int64) Time {
+ if s > math.MaxInt64/time.Second.Nanoseconds() {
+ return MaxTime
+ }
+ t := s * 1e9
+ if t > math.MaxInt64-ns {
+ return MaxTime
+ }
+ return Time{t + ns}
+}
+
+// FromTimespec converts from Linux Timespec to Time.
+func FromTimespec(ts linux.Timespec) Time {
+ return Time{ts.ToNsecCapped()}
+}
+
+// FromTimeval converts a Linux Timeval to Time.
+func FromTimeval(tv linux.Timeval) Time {
+ return Time{tv.ToNsecCapped()}
+}
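The constructors above saturate rather than wrap on overflow: any value that does not fit in an int64 nanosecond count becomes MaxTime. A standalone sketch of FromUnix's checks over raw int64 values:

package main

import (
	"fmt"
	"math"
)

const nsPerSec int64 = 1000000000

// fromUnixNs mirrors FromUnix's overflow handling: a seconds value or a final
// sum that would overflow int64 nanoseconds saturates to math.MaxInt64.
func fromUnixNs(s, ns int64) int64 {
	if s > math.MaxInt64/nsPerSec {
		return math.MaxInt64
	}
	t := s * nsPerSec
	if t > math.MaxInt64-ns {
		return math.MaxInt64
	}
	return t + ns
}

func main() {
	fmt.Println(fromUnixNs(1, 500))           // 1000000500
	fmt.Println(fromUnixNs(math.MaxInt64, 0)) // saturates to 9223372036854775807
}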
+
+// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock
+// domain. If t represents walltime, this is nanoseconds since the Unix epoch.
+func (t Time) Nanoseconds() int64 {
+ return t.ns
+}
+
+// Seconds returns seconds elapsed since the zero time in t's Clock domain. If
+// t represents walltime, this is seconds since Unix epoch.
+func (t Time) Seconds() int64 {
+ return t.Nanoseconds() / time.Second.Nanoseconds()
+}
+
+// Timespec converts Time to a Linux timespec.
+func (t Time) Timespec() linux.Timespec {
+ return linux.NsecToTimespec(t.Nanoseconds())
+}
+
+// Unix returns the (seconds, nanoseconds) representation of t such that
+// seconds*1e9 + nanoseconds = t.
+func (t Time) Unix() (s int64, ns int64) {
+ s = t.ns / 1e9
+ ns = t.ns % 1e9
+ return
+}
+
+// TimeT converts Time to a Linux time_t.
+func (t Time) TimeT() linux.TimeT {
+ return linux.NsecToTimeT(t.Nanoseconds())
+}
+
+// Timeval converts Time to a Linux timeval.
+func (t Time) Timeval() linux.Timeval {
+ return linux.NsecToTimeval(t.Nanoseconds())
+}
+
+// Add adds the duration of d to t.
+func (t Time) Add(d time.Duration) Time {
+ if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) {
+ return MaxTime
+ }
+ if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) {
+ return MinTime
+ }
+ return Time{int64(t.ns) + d.Nanoseconds()}
+}
+
+// AddTime adds the duration of u to t.
+func (t Time) AddTime(u Time) Time {
+ return t.Add(time.Duration(u.ns))
+}
+
+// Equal reports whether the two times represent the same instant in time.
+func (t Time) Equal(u Time) bool {
+ return t.ns == u.ns
+}
+
+// Before reports whether the instant t is before the instant u.
+func (t Time) Before(u Time) bool {
+ return t.ns < u.ns
+}
+
+// After reports whether the instant t is after the instant u.
+func (t Time) After(u Time) bool {
+ return t.ns > u.ns
+}
+
+// Sub returns the duration of t - u.
+//
+// N.B. This measure may not make sense for every Time returned by ktime.Clock.
+// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to
+// estimate that wall time.
+func (t Time) Sub(u Time) time.Duration {
+ dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond
+ switch {
+ case u.Add(dur).Equal(t):
+ return dur
+ case t.Before(u):
+ return MinDuration
+ default:
+ return MaxDuration
+ }
+}
+
+// IsMin returns whether t represents the lowest possible time instant.
+func (t Time) IsMin() bool {
+ return t == MinTime
+}
+
+// IsZero returns whether t represents the zero time instant in t's Clock domain.
+func (t Time) IsZero() bool {
+ return t == ZeroTime
+}
+
+// String returns the time represented in nanoseconds as a string.
+func (t Time) String() string {
+ return fmt.Sprintf("%dns", t.Nanoseconds())
+}
+
+// A Clock is an abstract time source.
+type Clock interface {
+ // Now returns the current time in nanoseconds according to the Clock.
+ Now() Time
+
+ // WallTimeUntil returns the estimated wall time until Now will return a
+ // value greater than or equal to t, given that a recent call to Now
+ // returned now. If t has already passed, WallTimeUntil may return 0 or a
+ // negative value.
+ //
+ // WallTimeUntil must be abstract to support Clocks that do not represent
+ // wall time (e.g. thread group execution timers). Clocks that represent
+ // wall times may embed the WallRateClock type to obtain an appropriate
+ // trivial implementation of WallTimeUntil.
+ //
+ // WallTimeUntil is used to determine when associated Timers should next
+ // check for expirations. Returning too small a value may result in
+ // spurious Timer goroutine wakeups, while returning too large a value may
+ // result in late expirations. Implementations should usually err on the
+ // side of underestimating.
+ WallTimeUntil(t, now Time) time.Duration
+
+ // Waitable methods may be used to subscribe to Clock events. Waiters will
+ // not be preserved by Save and must be re-established during restore.
+ //
+ // Since Clock events are transient, implementations of
+ // waiter.Waitable.Readiness should return 0.
+ waiter.Waitable
+}
+
+// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the
+// same rate as wall time.
+type WallRateClock struct{}
+
+// WallTimeUntil implements Clock.WallTimeUntil.
+func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+ return t.Sub(now)
+}
+
+// NoClockEvents implements waiter.Waitable for Clocks that do not generate
+// events.
+type NoClockEvents struct{}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+}
+
+// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
+// defining waiter.Waitable.Readiness as required by Clock.
+type ClockEventsQueue struct {
+ waiter.Queue
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return 0
+}
+
+// A TimerListener receives expirations from a Timer.
+type TimerListener interface {
+ // Notify is called when its associated Timer expires. exp is the number of
+ // expirations.
+ //
+ // Notify is called with the associated Timer's mutex locked, so Notify
+ // must not take any locks that precede Timer.mu in lock order.
+ //
+ // Preconditions: exp > 0.
+ Notify(exp uint64)
+
+ // Destroy is called when the timer is destroyed.
+ Destroy()
+}
+
+// Setting contains user-controlled mutable Timer properties.
+//
+// +stateify savable
+type Setting struct {
+ // Enabled is true if the timer is running.
+ Enabled bool
+
+ // Next is the time in nanoseconds of the next expiration.
+ Next Time
+
+ // Period is the time in nanoseconds between expirations. If Period is
+ // zero, the timer will not automatically restart after expiring.
+ //
+ // Invariant: Period >= 0.
+ Period time.Duration
+}
+
+// SettingFromSpec converts a (value, interval) pair to a Setting based on a
+// reading from c. value is interpreted as a time relative to c.Now().
+func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) {
+ return SettingFromSpecAt(value, interval, c.Now())
+}
+
+// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is
+// interpreted as a time relative to now.
+func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) {
+ if value < 0 {
+ return Setting{}, syserror.EINVAL
+ }
+ if value == 0 {
+ return Setting{Period: interval}, nil
+ }
+ return Setting{
+ Enabled: true,
+ Next: now.Add(value),
+ Period: interval,
+ }, nil
+}
+
+// SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is
+// interpreted as an absolute time.
+func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) {
+ if value.Before(ZeroTime) {
+ return Setting{}, syserror.EINVAL
+ }
+ if value.IsZero() {
+ return Setting{Period: interval}, nil
+ }
+ return Setting{
+ Enabled: true,
+ Next: value,
+ Period: interval,
+ }, nil
+}
+
+// SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is
+// true, its.Value is interpreted as an absolute time. Otherwise, it is
+// interpreted as a time relative to c.Now().
+func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) {
+ if abs {
+ return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration())
+ }
+ return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c)
+}
+
+// SpecFromSetting converts a timestamp and a Setting to a (relative value,
+// interval) pair, as used by most Linux syscalls that return a struct
+// itimerval or struct itimerspec.
+func SpecFromSetting(now Time, s Setting) (value, period time.Duration) {
+ if !s.Enabled {
+ return 0, s.Period
+ }
+ return s.Next.Sub(now), s.Period
+}
+
+// ItimerspecFromSetting converts a Setting to a linux.Itimerspec.
+func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec {
+ val, iv := SpecFromSetting(now, s)
+ return linux.Itimerspec{
+ Interval: linux.DurationToTimespec(iv),
+ Value: linux.DurationToTimespec(val),
+ }
+}
+
+// At returns an updated Setting and a number of expirations after the
+// associated Clock indicates a time of now.
+//
+// Settings may be created by successive calls to At with decreasing
+// values of now (i.e. time may appear to go backward). Supporting this is
+// required to support non-monotonic clocks, as well as allowing
+// Timer.clock.Now() to be called without holding Timer.mu.
+func (s Setting) At(now Time) (Setting, uint64) {
+ if !s.Enabled {
+ return s, 0
+ }
+ if s.Next.After(now) {
+ return s, 0
+ }
+ if s.Period == 0 {
+ s.Enabled = false
+ return s, 1
+ }
+ exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period)
+ s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp))
+ return s, exp
+}
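For a periodic timer, At counts how many whole periods fit between Next and now (plus the expiration at Next itself) and advances Next past now. A standalone sketch of that arithmetic over raw nanosecond values; the names are stand-ins for Setting's fields:

package main

import "fmt"

// expirationsAt mirrors Setting.At for an enabled periodic timer (period > 0):
// it returns the advanced next-expiration time and the expiration count at now.
func expirationsAt(next, period, now int64) (newNext int64, exp uint64) {
	if next > now {
		return next, 0 // not yet due
	}
	exp = 1 + uint64(now-next)/uint64(period)
	return next + int64(uint64(period)*exp), exp
}

func main() {
	// Timer first due at t=100ns with a 50ns period, sampled at t=260ns:
	// expirations at 100, 150, 200, 250 => 4; next expiration at 300.
	fmt.Println(expirationsAt(100, 50, 260))
}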
+
+// Timer is an optionally-periodic timer driven by sampling a user-specified
+// Clock. Timer's semantics support the requirements of Linux's interval timers
+// (setitimer(2), timer_create(2), timerfd_create(2)).
+//
+// Timers should be created using NewTimer and must be cleaned up by calling
+// Timer.Destroy when no longer used.
+//
+// +stateify savable
+type Timer struct {
+ // clock is the time source. clock is immutable.
+ clock Clock
+
+ // listener is notified of expirations. listener is immutable.
+ listener TimerListener
+
+ // mu protects the following mutable fields.
+ mu sync.Mutex `state:"nosave"`
+
+ // setting is the timer setting. setting is protected by mu.
+ setting Setting
+
+ // paused is true if the Timer is paused. paused is protected by mu.
+ paused bool
+
+ // kicker is used to wake the Timer goroutine. The kicker pointer is
+ // immutable, but its state is protected by mu.
+ kicker *time.Timer `state:"nosave"`
+
+ // entry is registered with clock.EventRegister. entry is immutable.
+ //
+ // Per comment in Clock, entry must be re-registered after restore; per
+ // comment in Timer.Load, this is done in Timer.Resume.
+ entry waiter.Entry `state:"nosave"`
+
+ // events is the channel that will be notified whenever entry receives an
+ // event. It is also closed by Timer.Destroy to instruct the Timer
+ // goroutine to exit.
+ events chan struct{} `state:"nosave"`
+}
+
+// timerTickEvents are Clock events that require the Timer goroutine to Tick
+// prematurely.
+const timerTickEvents = ClockEventSet | ClockEventRateIncrease
+
+// NewTimer returns a new Timer that will obtain time from clock and send
+// expirations to listener. The Timer is initially stopped and has no first
+// expiration or period configured.
+func NewTimer(clock Clock, listener TimerListener) *Timer {
+ t := &Timer{
+ clock: clock,
+ listener: listener,
+ }
+ t.init()
+ return t
+}
+
+// After waits for the duration to elapse according to clock and then sends a
+// notification on the returned channel. The timer is started immediately and
+// will fire exactly once. The second return value is the start time used with
+// the duration.
+//
+// Callers must call Timer.Destroy.
+func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) {
+ notifier, tchan := NewChannelNotifier()
+ t := NewTimer(clock, notifier)
+ now := clock.Now()
+
+ t.Swap(Setting{
+ Enabled: true,
+ Period: 0,
+ Next: now.Add(duration),
+ })
+ return t, now, tchan
+}
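A brief sketch of using After from client code; clk may be any Clock implementation, and the helper name is illustrative:

package example

import (
	"time"

	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
)

// waitFor blocks until d has elapsed according to clk.
func waitFor(clk ktime.Clock, d time.Duration) {
	t, _, ch := ktime.After(clk, d)
	defer t.Destroy() // After's contract: the caller must destroy the Timer.
	<-ch
}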
+
+// init initializes Timer state that is not preserved across save/restore. If
+// init has already been called, calling it again is a no-op.
+//
+// Preconditions: t.mu must be locked, or the caller must have exclusive access
+// to t.
+func (t *Timer) init() {
+ if t.kicker != nil {
+ return
+ }
+ // If t.kicker is nil, the Timer goroutine can't be running, so we can't
+ // race with it.
+ t.kicker = time.NewTimer(0)
+ t.entry, t.events = waiter.NewChannelEntry(nil)
+ t.clock.EventRegister(&t.entry, timerTickEvents)
+ go t.runGoroutine() // S/R-SAFE: synchronized by t.mu
+}
+
+// Destroy releases resources owned by the Timer. A Destroyed Timer must not be
+// used again; in particular, a Destroyed Timer should not be Saved.
+func (t *Timer) Destroy() {
+ // Stop the Timer, ensuring that the Timer goroutine will not call
+ // t.kicker.Reset, before calling t.kicker.Stop.
+ t.mu.Lock()
+ t.setting.Enabled = false
+ t.mu.Unlock()
+ t.kicker.Stop()
+ // Unregister t.entry, ensuring that the Clock will not send to t.events,
+ // before closing t.events to instruct the Timer goroutine to exit.
+ t.clock.EventUnregister(&t.entry)
+ close(t.events)
+ t.listener.Destroy()
+}
+
+func (t *Timer) runGoroutine() {
+ for {
+ select {
+ case <-t.kicker.C:
+ case _, ok := <-t.events:
+ if !ok {
+ // Channel closed by Destroy.
+ return
+ }
+ }
+ t.Tick()
+ }
+}
+
+// Tick requests that the Timer immediately check for expirations and
+// re-evaluate when it should next check for expirations.
+func (t *Timer) Tick() {
+ now := t.clock.Now()
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if t.paused {
+ return
+ }
+ s, exp := t.setting.At(now)
+ t.setting = s
+ if exp > 0 {
+ t.listener.Notify(exp)
+ }
+ t.resetKickerLocked(now)
+}
+
+// Pause pauses the Timer, ensuring that it does not generate any further
+// expirations until Resume is called. If the Timer is already paused, Pause
+// has no effect.
+func (t *Timer) Pause() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.paused = true
+ // t.kicker may be nil if we were restored but never resumed.
+ if t.kicker != nil {
+ t.kicker.Stop()
+ }
+}
+
+// Resume ends the effect of Pause. If the Timer is not paused, Resume has no
+// effect.
+func (t *Timer) Resume() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if !t.paused {
+ return
+ }
+ t.paused = false
+
+ // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume
+ // because save/restore will restore Timers before
+ // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed
+ // by a kernel.Timekeeper then the Timer goroutine will panic if it calls
+ // t.clock.Now().
+ t.init()
+
+ // Kick the Timer goroutine in case it was already initialized, but the
+ // Timer goroutine was sleeping.
+ t.kicker.Reset(0)
+}
+
+// Get returns a snapshot of the Timer's current Setting and the time
+// (according to the Timer's Clock) at which the snapshot was taken.
+//
+// Preconditions: The Timer must not be paused (since its Setting cannot
+// be advanced to the current time while it is paused.)
+func (t *Timer) Get() (Time, Setting) {
+ now := t.clock.Now()
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if t.paused {
+ panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t))
+ }
+ s, exp := t.setting.At(now)
+ t.setting = s
+ if exp > 0 {
+ t.listener.Notify(exp)
+ }
+ t.resetKickerLocked(now)
+ return now, s
+}
+
+// Swap atomically changes the Timer's Setting and returns the Timer's previous
+// Setting and the time (according to the Timer's Clock) at which the snapshot
+// was taken. Setting s.Enabled to true starts the Timer, while setting
+// s.Enabled to false stops it.
+//
+// Preconditions: The Timer must not be paused.
+func (t *Timer) Swap(s Setting) (Time, Setting) {
+ return t.SwapAnd(s, nil)
+}
+
+// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil,
+// and returns the Timer's previous Setting and the time (according to the
+// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
+// starts the timer, while setting s.Enabled to false stops it.
+//
+// Preconditions: The Timer must not be paused. f cannot call any Timer methods
+// since it is called with the Timer mutex locked.
+func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
+ now := t.clock.Now()
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if t.paused {
+ panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t))
+ }
+ oldS, oldExp := t.setting.At(now)
+ if oldExp > 0 {
+ t.listener.Notify(oldExp)
+ }
+ if f != nil {
+ f()
+ }
+ newS, newExp := s.At(now)
+ t.setting = newS
+ if newExp > 0 {
+ t.listener.Notify(newExp)
+ }
+ t.resetKickerLocked(now)
+ return now, oldS
+}
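Swap is how callers arm or disarm a timer; combined with SettingFromSpec it covers the common setitimer-style case. A hedged sketch, with an illustrative helper name:

package example

import (
	"time"

	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
)

// armPeriodic starts t (which must not be paused) so that it first fires
// after initial and then every period thereafter, measured on t's own clock.
func armPeriodic(t *ktime.Timer, initial, period time.Duration) error {
	s, err := ktime.SettingFromSpec(initial, period, t.Clock())
	if err != nil {
		return err
	}
	t.Swap(s)
	return nil
}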
+
+// Atomically invokes f such that t cannot generate expirations while f is
+// being called.
+//
+// Preconditions: f cannot call any Timer methods since it is called with the
+// Timer mutex locked.
+func (t *Timer) Atomically(f func()) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ f()
+}
+
+// Preconditions: t.mu must be locked.
+func (t *Timer) resetKickerLocked(now Time) {
+ if t.setting.Enabled {
+ // Clock.WallTimeUntil may return a negative value. This is fine;
+ // time.when treats negative Durations as 0.
+ t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now))
+ }
+ // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases
+ // resetKickerLocked will be called from the Timer goroutine itself, in
+ // which case t.kicker has already fired and t.kicker.Stop will be an
+ // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer
+ // => runtime.deltimer).
+}
+
+// Clock returns the Clock used by t.
+func (t *Timer) Clock() Clock {
+ return t.clock
+}
+
+// ChannelNotifier is a TimerListener that sends a message on an empty struct
+// channel.
+//
+// ChannelNotifier cannot be saved or loaded.
+type ChannelNotifier struct {
+ // tchan must be a buffered channel.
+ tchan chan struct{}
+}
+
+// NewChannelNotifier creates a new channel notifier.
+//
+// If the notifier is used with a timer, Timer.Destroy will close the channel
+// returned here.
+func NewChannelNotifier() (TimerListener, <-chan struct{}) {
+ tchan := make(chan struct{}, 1)
+ return &ChannelNotifier{tchan}, tchan
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (c *ChannelNotifier) Notify(uint64) {
+ select {
+ case c.tchan <- struct{}{}:
+ default:
+ }
+}
+
+// Destroy implements ktime.TimerListener.Destroy and will close the channel.
+func (c *ChannelNotifier) Destroy() {
+ close(c.tchan)
+}
diff --git a/pkg/sentry/kernel/time/time_state_autogen.go b/pkg/sentry/kernel/time/time_state_autogen.go
new file mode 100755
index 000000000..1750b55d6
--- /dev/null
+++ b/pkg/sentry/kernel/time/time_state_autogen.go
@@ -0,0 +1,56 @@
+// automatically generated by stateify.
+
+package time
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *Time) beforeSave() {}
+func (x *Time) save(m state.Map) {
+ x.beforeSave()
+ m.Save("ns", &x.ns)
+}
+
+func (x *Time) afterLoad() {}
+func (x *Time) load(m state.Map) {
+ m.Load("ns", &x.ns)
+}
+
+func (x *Setting) beforeSave() {}
+func (x *Setting) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Enabled", &x.Enabled)
+ m.Save("Next", &x.Next)
+ m.Save("Period", &x.Period)
+}
+
+func (x *Setting) afterLoad() {}
+func (x *Setting) load(m state.Map) {
+ m.Load("Enabled", &x.Enabled)
+ m.Load("Next", &x.Next)
+ m.Load("Period", &x.Period)
+}
+
+func (x *Timer) beforeSave() {}
+func (x *Timer) save(m state.Map) {
+ x.beforeSave()
+ m.Save("clock", &x.clock)
+ m.Save("listener", &x.listener)
+ m.Save("setting", &x.setting)
+ m.Save("paused", &x.paused)
+}
+
+func (x *Timer) afterLoad() {}
+func (x *Timer) load(m state.Map) {
+ m.Load("clock", &x.clock)
+ m.Load("listener", &x.listener)
+ m.Load("setting", &x.setting)
+ m.Load("paused", &x.paused)
+}
+
+func init() {
+ state.Register("time.Time", (*Time)(nil), state.Fns{Save: (*Time).save, Load: (*Time).load})
+ state.Register("time.Setting", (*Setting)(nil), state.Fns{Save: (*Setting).save, Load: (*Setting).load})
+ state.Register("time.Timer", (*Timer)(nil), state.Fns{Save: (*Timer).save, Load: (*Timer).load})
+}
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
new file mode 100644
index 000000000..505a4fa4f
--- /dev/null
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -0,0 +1,306 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// Timekeeper manages all of the kernel clocks.
+//
+// +stateify savable
+type Timekeeper struct {
+ // clocks are the clock sources.
+ //
+ // These are not saved directly, as the new machine's clock may behave
+ // differently.
+ //
+ // It is set only once, by SetClocks.
+ clocks sentrytime.Clocks `state:"nosave"`
+
+ // bootTime is the realtime when the system "booted", i.e. when
+ // SetClocks was called in the initial (not restored) run.
+ bootTime ktime.Time
+
+ // monotonicOffset is the offset to apply to the monotonic clock output
+ // from clocks.
+ //
+ // It is set only once, by SetClocks.
+ monotonicOffset int64 `state:"nosave"`
+
+ // restored, if non-nil, indicates that this Timekeeper was restored
+ // from a state file. The clocks are not set until restored is closed.
+ restored chan struct{} `state:"nosave"`
+
+ // saveMonotonic is the (offset) value of the monotonic clock at the
+ // time of save.
+ //
+ // It is only valid if restored is non-nil.
+ //
+ // It is only used in SetClocks after restore to compute the new
+ // monotonicOffset.
+ saveMonotonic int64
+
+ // saveRealtime is the value of the realtime clock at the time of save.
+ //
+ // It is only valid if restored is non-nil.
+ //
+ // It is only used in SetClocks after restore to compute the new
+ // monotonicOffset.
+ saveRealtime int64
+
+ // params manages the parameter page.
+ params *VDSOParamPage
+
+ // mu protects destruction with stop and wg.
+ mu sync.Mutex `state:"nosave"`
+
+ // stop is used to tell the update goroutine to exit.
+ stop chan struct{} `state:"nosave"`
+
+ // wg is used to indicate that the update goroutine has exited.
+ wg sync.WaitGroup `state:"nosave"`
+}
+
+// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date.
+// NewTimekeeper does not take ownership of paramPage.
+//
+// SetClocks must be called on the returned Timekeeper before it is usable.
+func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) {
+ return &Timekeeper{
+ params: NewVDSOParamPage(mfp, paramPage),
+ }, nil
+}
+
+// SetClocks sets the backing clock source.
+//
+// SetClocks must be called before the Timekeeper is used, and it may not be
+// called more than once, as changing the clock source without extra correction
+// could cause time discontinuities.
+//
+// It must also be called after Load.
+func (t *Timekeeper) SetClocks(c sentrytime.Clocks) {
+ // Update the params, marking them "not ready", as we may need to
+ // restart calibration on this new machine.
+ if t.restored != nil {
+ if err := t.params.Write(func() vdsoParams {
+ return vdsoParams{}
+ }); err != nil {
+ panic("unable to reset VDSO params: " + err.Error())
+ }
+ }
+
+ if t.clocks != nil {
+ panic("SetClocks called on previously-initialized Timekeeper")
+ }
+
+ t.clocks = c
+
+ // Compute the offset of the monotonic clock from the base Clocks.
+ //
+ // In a fresh (not restored) sentry, monotonic time starts at zero.
+ //
+ // In a restored sentry, monotonic time jumps forward by approximately
+ // the same amount as real time. There are no guarantees here; we are
+ // just making a best-effort attempt to make it appear that the app
+ // was simply not scheduled for a long period, rather than that the
+ // real time clock was changed.
+ //
+ // If real time went backwards, monotonic time remains at its saved value.
+ wantMonotonic := int64(0)
+
+ nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ panic("Unable to get current monotonic time: " + err.Error())
+ }
+
+ nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime)
+ if err != nil {
+ panic("Unable to get current realtime: " + err.Error())
+ }
+
+ if t.restored != nil {
+ wantMonotonic = t.saveMonotonic
+ elapsed := nowRealtime - t.saveRealtime
+ if elapsed > 0 {
+ wantMonotonic += elapsed
+ }
+ }
+
+ t.monotonicOffset = wantMonotonic - nowMonotonic
+
+ if t.restored == nil {
+ // Hold on to the initial "boot" time.
+ t.bootTime = ktime.FromNanoseconds(nowRealtime)
+ }
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.startUpdater()
+
+ if t.restored != nil {
+ close(t.restored)
+ }
+}
+
+// startUpdater starts an update goroutine that keeps the clocks updated.
+//
+// mu must be held.
+func (t *Timekeeper) startUpdater() {
+ if t.stop != nil {
+ // Timekeeper already started
+ return
+ }
+ t.stop = make(chan struct{})
+
+ // Keep the clocks up to date.
+ //
+ // Note that the Go runtime uses host CLOCK_MONOTONIC to service the
+ // timer, so it may run at a *slightly* different rate from the
+ // application CLOCK_MONOTONIC. That is fine, as we only need to update
+ // at approximately this rate.
+ timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
+ t.wg.Add(1)
+ go func() { // S/R-SAFE: stopped during save.
+ for {
+ // Start with an update immediately, so the clocks are
+ // ready ASAP.
+
+ // Call Update within a Write block to prevent the VDSO
+ // from using the old params between Update and
+ // Write.
+ if err := t.params.Write(func() vdsoParams {
+ monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update()
+
+ var p vdsoParams
+ if monotonicOk {
+ p.monotonicReady = 1
+ p.monotonicBaseCycles = int64(monotonicParams.BaseCycles)
+ p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset
+ p.monotonicFrequency = monotonicParams.Frequency
+ }
+ if realtimeOk {
+ p.realtimeReady = 1
+ p.realtimeBaseCycles = int64(realtimeParams.BaseCycles)
+ p.realtimeBaseRef = int64(realtimeParams.BaseRef)
+ p.realtimeFrequency = realtimeParams.Frequency
+ }
+
+ log.Debugf("Updating VDSO parameters: %+v", p)
+
+ return p
+ }); err != nil {
+ log.Warningf("Unable to update VDSO parameter page: %v", err)
+ }
+
+ select {
+ case <-timer.C:
+ case <-t.stop:
+ t.wg.Done()
+ return
+ }
+ }
+ }()
+}
+
+// stopUpdater stops the update goroutine, blocking until it exits.
+//
+// mu must be held.
+func (t *Timekeeper) stopUpdater() {
+ if t.stop == nil {
+ // Updater not running.
+ return
+ }
+
+ close(t.stop)
+ t.wg.Wait()
+ t.stop = nil
+}
+
+// Destroy destroys the Timekeeper, freeing all associated resources.
+func (t *Timekeeper) Destroy() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ t.stopUpdater()
+}
+
+// PauseUpdates stops clock parameter updates. This should only be used when
+// Tasks are not running and thus cannot access the clock.
+func (t *Timekeeper) PauseUpdates() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.stopUpdater()
+}
+
+// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates.
+func (t *Timekeeper) ResumeUpdates() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.startUpdater()
+}
+
+// GetTime returns the current time in nanoseconds.
+func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) {
+ if t.clocks == nil {
+ if t.restored == nil {
+ panic("Timekeeper used before initialized with SetClocks")
+ }
+ <-t.restored
+ }
+ now, err := t.clocks.GetTime(c)
+ if err == nil && c == sentrytime.Monotonic {
+ now += t.monotonicOffset
+ }
+ return now, err
+}
+
+// BootTime returns the system boot real time.
+func (t *Timekeeper) BootTime() ktime.Time {
+ return t.bootTime
+}
+
+// timekeeperClock is a ktime.Clock that reads time from a
+// kernel.Timekeeper-managed clock.
+//
+// +stateify savable
+type timekeeperClock struct {
+ tk *Timekeeper
+ c sentrytime.ClockID
+
+ // Implements ktime.Clock.WallTimeUntil.
+ ktime.WallRateClock `state:"nosave"`
+
+ // Implements waiter.Waitable. (We have no ability to detect
+ // discontinuities from external changes to CLOCK_REALTIME).
+ ktime.NoClockEvents `state:"nosave"`
+}
+
+// Now implements ktime.Clock.Now.
+func (tc *timekeeperClock) Now() ktime.Time {
+ now, err := tc.tk.GetTime(tc.c)
+ if err != nil {
+ panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err))
+ }
+ return ktime.FromNanoseconds(now)
+}
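
The comment block in SetClocks is the heart of the restore-time behaviour: monotonic time resumes from the saved value plus any forward progress of real time, and GetTime(Monotonic) then applies the resulting offset to every raw reading. A self-contained sketch of just that arithmetic (the helper name is hypothetical; it is not part of the kernel package):

package example

// monotonicOffsetAfterRestore mirrors the arithmetic in Timekeeper.SetClocks.
// The restored monotonic clock resumes from the saved value, advanced by the
// real time that elapsed while saved, but never rewound if real time went
// backwards.
func monotonicOffsetAfterRestore(saveMonotonic, saveRealtime, nowMonotonic, nowRealtime int64) int64 {
	wantMonotonic := saveMonotonic
	if elapsed := nowRealtime - saveRealtime; elapsed > 0 {
		wantMonotonic += elapsed
	}
	// GetTime(Monotonic) adds this offset to every raw clock reading.
	return wantMonotonic - nowMonotonic
}

For example, saving at monotonic 10s / realtime 1000s and restoring when the new host reports monotonic 3s / realtime 1060s yields an offset of 67s, so the first post-restore GetTime(Monotonic) reads 3s + 67s = 70s: the saved 10s plus the 60s spent saved.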
diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go
new file mode 100644
index 000000000..6ce358a05
--- /dev/null
+++ b/pkg/sentry/kernel/timekeeper_state.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// beforeSave is invoked by stateify.
+func (t *Timekeeper) beforeSave() {
+ if t.stop != nil {
+ panic("pauseUpdates must be called before Save")
+ }
+
+ // N.B. we want the offset-adjusted monotonic time (i.e., via GetTime).
+ var err error
+ if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil {
+ panic("unable to get current monotonic time: " + err.Error())
+ }
+
+ if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil {
+ panic("unable to get current realtime: " + err.Error())
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (t *Timekeeper) afterLoad() {
+ t.restored = make(chan struct{})
+}
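
Putting timekeeper.go and timekeeper_state.go together, the save/restore ordering is: PauseUpdates stops the updater, beforeSave records the offset-adjusted monotonic and realtime values, afterLoad recreates the restored channel, and the post-restore SetClocks closes it, unblocking any GetTime callers that raced ahead. A standalone sketch of that channel-gating idea (hypothetical names, not the kernel code itself):

package main

import (
	"fmt"
	"time"
)

// gate mimics Timekeeper.restored: readers block on the channel until the
// initializer (SetClocks in the real code) closes it.
type gate struct {
	restored chan struct{} // nil on a fresh boot, non-nil after a restore
}

func (g *gate) read() string {
	if g.restored != nil {
		<-g.restored // wait for initialization to finish
	}
	return "clock reading"
}

func main() {
	g := &gate{restored: make(chan struct{})}
	go func() {
		time.Sleep(10 * time.Millisecond) // stand-in for the restore path
		close(g.restored)                 // unblocks every pending reader
	}()
	fmt.Println(g.read())
}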
diff --git a/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go b/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go
new file mode 100755
index 000000000..6f5580ebe
--- /dev/null
+++ b/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go
@@ -0,0 +1,119 @@
+// Code generated by protoc-gen-go. DO NOT EDIT.
+// source: pkg/sentry/kernel/uncaught_signal.proto
+
+package gvisor
+
+import (
+ fmt "fmt"
+ proto "github.com/golang/protobuf/proto"
+ registers_go_proto "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+ math "math"
+)
+
+// Reference imports to suppress errors if they are not otherwise used.
+var _ = proto.Marshal
+var _ = fmt.Errorf
+var _ = math.Inf
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the proto package it is being compiled against.
+// A compilation error at this line likely means your copy of the
+// proto package needs to be updated.
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package
+
+type UncaughtSignal struct {
+ Tid int32 `protobuf:"varint,1,opt,name=tid,proto3" json:"tid,omitempty"`
+ Pid int32 `protobuf:"varint,2,opt,name=pid,proto3" json:"pid,omitempty"`
+ Registers *registers_go_proto.Registers `protobuf:"bytes,3,opt,name=registers,proto3" json:"registers,omitempty"`
+ SignalNumber int32 `protobuf:"varint,4,opt,name=signal_number,json=signalNumber,proto3" json:"signal_number,omitempty"`
+ FaultAddr uint64 `protobuf:"varint,5,opt,name=fault_addr,json=faultAddr,proto3" json:"fault_addr,omitempty"`
+ XXX_NoUnkeyedLiteral struct{} `json:"-"`
+ XXX_unrecognized []byte `json:"-"`
+ XXX_sizecache int32 `json:"-"`
+}
+
+func (m *UncaughtSignal) Reset() { *m = UncaughtSignal{} }
+func (m *UncaughtSignal) String() string { return proto.CompactTextString(m) }
+func (*UncaughtSignal) ProtoMessage() {}
+func (*UncaughtSignal) Descriptor() ([]byte, []int) {
+ return fileDescriptor_5ca9e03e13704688, []int{0}
+}
+
+func (m *UncaughtSignal) XXX_Unmarshal(b []byte) error {
+ return xxx_messageInfo_UncaughtSignal.Unmarshal(m, b)
+}
+func (m *UncaughtSignal) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
+ return xxx_messageInfo_UncaughtSignal.Marshal(b, m, deterministic)
+}
+func (m *UncaughtSignal) XXX_Merge(src proto.Message) {
+ xxx_messageInfo_UncaughtSignal.Merge(m, src)
+}
+func (m *UncaughtSignal) XXX_Size() int {
+ return xxx_messageInfo_UncaughtSignal.Size(m)
+}
+func (m *UncaughtSignal) XXX_DiscardUnknown() {
+ xxx_messageInfo_UncaughtSignal.DiscardUnknown(m)
+}
+
+var xxx_messageInfo_UncaughtSignal proto.InternalMessageInfo
+
+func (m *UncaughtSignal) GetTid() int32 {
+ if m != nil {
+ return m.Tid
+ }
+ return 0
+}
+
+func (m *UncaughtSignal) GetPid() int32 {
+ if m != nil {
+ return m.Pid
+ }
+ return 0
+}
+
+func (m *UncaughtSignal) GetRegisters() *registers_go_proto.Registers {
+ if m != nil {
+ return m.Registers
+ }
+ return nil
+}
+
+func (m *UncaughtSignal) GetSignalNumber() int32 {
+ if m != nil {
+ return m.SignalNumber
+ }
+ return 0
+}
+
+func (m *UncaughtSignal) GetFaultAddr() uint64 {
+ if m != nil {
+ return m.FaultAddr
+ }
+ return 0
+}
+
+func init() {
+ proto.RegisterType((*UncaughtSignal)(nil), "gvisor.UncaughtSignal")
+}
+
+func init() {
+ proto.RegisterFile("pkg/sentry/kernel/uncaught_signal.proto", fileDescriptor_5ca9e03e13704688)
+}
+
+var fileDescriptor_5ca9e03e13704688 = []byte{
+ // 210 bytes of a gzipped FileDescriptorProto
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x4c, 0x8e, 0x4d, 0x4a, 0xc6, 0x30,
+ 0x10, 0x86, 0x89, 0xfd, 0x81, 0xc6, 0x1f, 0x34, 0xab, 0x20, 0x88, 0x45, 0x17, 0x76, 0xd5, 0x80,
+ 0x9e, 0xc0, 0x0b, 0xb8, 0x88, 0xb8, 0x2e, 0x69, 0x13, 0xd3, 0xd0, 0x9a, 0x86, 0x49, 0x22, 0x78,
+ 0x24, 0x6f, 0x29, 0x4d, 0xd4, 0xef, 0xdb, 0x0d, 0xcf, 0xbc, 0xf3, 0xcc, 0x8b, 0x1f, 0xdc, 0xa2,
+ 0x99, 0x57, 0x36, 0xc0, 0x17, 0x5b, 0x14, 0x58, 0xb5, 0xb2, 0x68, 0x27, 0x11, 0xf5, 0x1c, 0x06,
+ 0x6f, 0xb4, 0x15, 0x6b, 0xef, 0x60, 0x0b, 0x1b, 0xa9, 0xf5, 0xa7, 0xf1, 0x1b, 0x5c, 0xdf, 0x1e,
+ 0x1d, 0x08, 0x98, 0x66, 0x06, 0x4a, 0x1b, 0x1f, 0x14, 0xf8, 0x1c, 0xbc, 0xfb, 0x46, 0xf8, 0xe2,
+ 0xed, 0x57, 0xf1, 0x9a, 0x0c, 0xe4, 0x12, 0x17, 0xc1, 0x48, 0x8a, 0x5a, 0xd4, 0x55, 0x7c, 0x1f,
+ 0x77, 0xe2, 0x8c, 0xa4, 0x27, 0x99, 0x38, 0x23, 0x09, 0xc3, 0xcd, 0xbf, 0x89, 0x16, 0x2d, 0xea,
+ 0x4e, 0x1f, 0xaf, 0xfa, 0xfc, 0xb3, 0xe7, 0x7f, 0x0b, 0x7e, 0xc8, 0x90, 0x7b, 0x7c, 0x9e, 0x0b,
+ 0x0e, 0x36, 0x7e, 0x8c, 0x0a, 0x68, 0x99, 0x64, 0x67, 0x19, 0xbe, 0x24, 0x46, 0x6e, 0x30, 0x7e,
+ 0x17, 0x71, 0x0d, 0x83, 0x90, 0x12, 0x68, 0xd5, 0xa2, 0xae, 0xe4, 0x4d, 0x22, 0xcf, 0x52, 0xc2,
+ 0x58, 0xa7, 0xca, 0x4f, 0x3f, 0x01, 0x00, 0x00, 0xff, 0xff, 0xfd, 0x62, 0x54, 0xdf, 0x06, 0x01,
+ 0x00, 0x00,
+}
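
The generated message is used like any other protobuf: populate the exported fields, then marshal with github.com/golang/protobuf/proto. The import path below is inferred from the directory name and should be treated as an assumption; Registers is left nil to keep the sketch short.

package main

import (
	"fmt"
	"log"

	proto "github.com/golang/protobuf/proto"
	uspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
)

func main() {
	msg := &uspb.UncaughtSignal{
		Tid:          123,
		Pid:          123,
		SignalNumber: 11, // SIGSEGV
		FaultAddr:    0xdeadbeef,
	}

	buf, err := proto.Marshal(msg)
	if err != nil {
		log.Fatalf("marshal: %v", err)
	}

	var decoded uspb.UncaughtSignal
	if err := proto.Unmarshal(buf, &decoded); err != nil {
		log.Fatalf("unmarshal: %v", err)
	}
	fmt.Printf("tid=%d signal=%d addr=%#x\n", decoded.GetTid(), decoded.GetSignalNumber(), decoded.GetFaultAddr())
}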
diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go
new file mode 100644
index 000000000..96fe3cbb9
--- /dev/null
+++ b/pkg/sentry/kernel/uts_namespace.go
@@ -0,0 +1,102 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+// UTSNamespace represents a UTS namespace, a holder of two system identifiers:
+// the hostname and domain name.
+//
+// +stateify savable
+type UTSNamespace struct {
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+ hostName string
+ domainName string
+
+ // userns is the user namespace associated with the UTSNamespace.
+ // Privileged operations on this UTSNamespace must have appropriate
+ // capabilities in userns.
+ //
+ // userns is immutable.
+ userns *auth.UserNamespace
+}
+
+// NewUTSNamespace creates a new UTS namespace.
+func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace {
+ return &UTSNamespace{
+ hostName: hostName,
+ domainName: domainName,
+ userns: userns,
+ }
+}
+
+// UTSNamespace returns the task's UTS namespace.
+func (t *Task) UTSNamespace() *UTSNamespace {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.utsns
+}
+
+// HostName returns the host name of this UTS namespace.
+func (u *UTSNamespace) HostName() string {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return u.hostName
+}
+
+// SetHostName sets the host name of this UTS namespace.
+func (u *UTSNamespace) SetHostName(host string) {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ u.hostName = host
+}
+
+// DomainName returns the domain name of this UTS namespace.
+func (u *UTSNamespace) DomainName() string {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return u.domainName
+}
+
+// SetDomainName sets the domain name of this UTS namespace.
+func (u *UTSNamespace) SetDomainName(domain string) {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ u.domainName = domain
+}
+
+// UserNamespace returns the user namespace associated with this UTS namespace.
+func (u *UTSNamespace) UserNamespace() *auth.UserNamespace {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return u.userns
+}
+
+// Clone makes a copy of this UTS namespace, associating the given user
+// namespace.
+func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return &UTSNamespace{
+ hostName: u.hostName,
+ domainName: u.domainName,
+ userns: userns,
+ }
+}
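
Clone copies the current identifiers into a new namespace, so later changes are independent. A short usage sketch (the function name is hypothetical, and the userns is assumed to be supplied by the caller):

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
)

// cloneAndRename shows that renaming a cloned UTS namespace does not affect
// the original: each namespace holds its own copy of the identifiers.
func cloneAndRename(userns *auth.UserNamespace) (orig, clone *kernel.UTSNamespace) {
	orig = kernel.NewUTSNamespace("sandbox", "local", userns)
	clone = orig.Clone(userns)
	clone.SetHostName("sandbox-child") // orig.HostName() still returns "sandbox"
	return orig, clone
}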
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
new file mode 100644
index 000000000..d40ad74f4
--- /dev/null
+++ b/pkg/sentry/kernel/vdso.go
@@ -0,0 +1,148 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// vdsoParams are the parameters exposed to the VDSO.
+//
+// They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
+// which also includes a sequence counter.
+type vdsoParams struct {
+ monotonicReady uint64
+ monotonicBaseCycles int64
+ monotonicBaseRef int64
+ monotonicFrequency uint64
+
+ realtimeReady uint64
+ realtimeBaseCycles int64
+ realtimeBaseRef int64
+ realtimeFrequency uint64
+}
+
+// VDSOParamPage manages a VDSO parameter page.
+//
+// Its memory layout looks like:
+//
+// type page struct {
+// // seq is a sequence counter that protects the fields below.
+// seq uint64
+// vdsoParams
+// }
+//
+// Everything in the struct is 8 bytes for easy alignment.
+//
+// It must be kept in sync with params in vdso/vdso_time.cc.
+//
+// +stateify savable
+type VDSOParamPage struct {
+ // fr is the parameter page, allocated from mfp.MemoryFile().
+ mfp pgalloc.MemoryFileProvider
+ fr platform.FileRange
+
+ // seq is the current sequence count written to the page.
+ //
+ // A write is in progress if the low bit of the counter is set (i.e., the counter is odd).
+ //
+ // Timekeeper's updater goroutine may call Write before equality is
+ // checked in state_test_util tests, causing this field to change across
+ // save / restore.
+ seq uint64
+}
+
+// NewVDSOParamPage returns a VDSOParamPage.
+//
+// Preconditions:
+//
+// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
+// not take ownership of fr; it must remain allocated for the lifetime of the
+// VDSOParamPage.
+//
+// * VDSOParamPage must be the only writer to fr.
+//
+// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
+func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage {
+ return &VDSOParamPage{mfp: mfp, fr: fr}
+}
+
+// access returns a mapping of the param page.
+func (v *VDSOParamPage) access() (safemem.Block, error) {
+ bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite)
+ if err != nil {
+ return safemem.Block{}, err
+ }
+ if bs.NumBlocks() != 1 {
+ panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks()))
+ }
+ return bs.Head(), nil
+}
+
+// incrementSeq increments the sequence counter in the param page.
+func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error {
+ next := v.seq + 1
+ old, err := safemem.SwapUint64(paramPage, next)
+ if err != nil {
+ return err
+ }
+
+ if old != v.seq {
+ return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq)
+ }
+
+ v.seq = next
+ return nil
+}
+
+// Write updates the VDSO parameters.
+//
+// Write starts a write block, calls f to get the new parameters, writes
+// out the new parameters, then ends the write block.
+func (v *VDSOParamPage) Write(f func() vdsoParams) error {
+ paramPage, err := v.access()
+ if err != nil {
+ return err
+ }
+
+ // Write begin.
+ next := v.seq + 1
+ if next%2 != 1 {
+ panic("Out-of-order sequence count")
+ }
+
+ err = v.incrementSeq(paramPage)
+ if err != nil {
+ return err
+ }
+
+ // Get the new params.
+ p := f()
+ buf := binary.Marshal(nil, usermem.ByteOrder, p)
+
+ // Skip the sequence counter.
+ if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
+ panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err))
+ }
+
+ // Write end.
+ return v.incrementSeq(paramPage)
+}
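
Write is the writer half of a sequence lock: it bumps the counter to an odd value, rewrites the parameters, then bumps it back to even. The matching reader lives in the VDSO's vdso_time.cc, but the protocol is easy to sketch in Go (a hypothetical helper, shown only to illustrate the retry loop a reader must perform):

package example

import "sync/atomic"

// readParams retries until it observes the same even sequence count before
// and after copying, i.e. until no write overlapped the read.
func readParams(seq *uint64, load func() []byte) []byte {
	for {
		before := atomic.LoadUint64(seq)
		if before%2 != 0 {
			continue // odd count: a write is in progress, retry
		}
		data := load() // copy the parameter fields
		if atomic.LoadUint64(seq) == before {
			return data // counter unchanged: the copy is consistent
		}
	}
}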
diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go
new file mode 100644
index 000000000..5640dd71d
--- /dev/null
+++ b/pkg/sentry/kernel/version.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Version defines the application-visible system version.
+type Version struct {
+ // Operating system name (e.g. "Linux").
+ Sysname string
+
+ // Operating system release (e.g. "4.4-amd64").
+ Release string
+
+ // Operating system version. On Linux this takes the shape
+ // "#VERSION CONFIG_FLAGS TIMESTAMP"
+ // where:
+ // - VERSION is a sequence counter incremented on every successful build
+ // - CONFIG_FLAGS is a space-separated list of major enabled kernel features
+ // (e.g. "SMP" and "PREEMPT")
+ // - TIMESTAMP is the build timestamp as returned by `date`
+ Version string
+}
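
A Version value in the documented shape (the concrete strings are illustrative only):

package example

import "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"

// exampleVersion follows the "#VERSION CONFIG_FLAGS TIMESTAMP" shape described
// in the comment on Version.Version.
var exampleVersion = kernel.Version{
	Sysname: "Linux",
	Release: "4.4-amd64",
	Version: "#1 SMP Fri Feb 16 10:57:38 UTC 2018",
}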