Diffstat (limited to 'pkg/sentry/kernel')
35 files changed, 704 insertions, 192 deletions
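Most of the hunks below are a mechanical migration of address and page helpers from pkg/usermem to pkg/hostarch. As a quick orientation, the following minimal standalone snippet (illustrative values only) uses the hostarch names that replace their usermem counterparts:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/hostarch"
)

func main() {
	// hostarch.Addr, hostarch.PageSize and hostarch.AccessType are the
	// drop-in replacements for the usermem equivalents used before this change.
	addr := hostarch.Addr(0x7f0000000123)
	if end, ok := addr.RoundUp(); ok {
		fmt.Printf("%#x rounds up to %#x (page size %d, perms %v)\n",
			uintptr(addr), uintptr(end), hostarch.PageSize, hostarch.AnyAccess)
	}
}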
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c53e3e720..a1ec6daab 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -141,6 +141,7 @@ go_library( srcs = [ "abstract_socket_namespace.go", "aio.go", + "cgroup.go", "context.go", "fd_table.go", "fd_table_refs.go", @@ -178,6 +179,7 @@ go_library( "task.go", "task_acct.go", "task_block.go", + "task_cgroup.go", "task_clone.go", "task_context.go", "task_exec.go", @@ -226,6 +228,7 @@ go_library( "//pkg/eventchannel", "//pkg/fspath", "//pkg/goid", + "//pkg/hostarch", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", @@ -240,6 +243,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", @@ -294,6 +298,7 @@ go_test( deps = [ "//pkg/abi", "//pkg/context", + "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/contexttest", "//pkg/sentry/fs", @@ -305,6 +310,5 @@ go_test( "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", - "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go new file mode 100644 index 000000000..1f1c63f37 --- /dev/null +++ b/pkg/sentry/kernel/cgroup.go @@ -0,0 +1,281 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. +const InvalidCgroupHierarchyID uint32 = 0 + +// CgroupControllerType is the name of a cgroup controller. +type CgroupControllerType string + +// CgroupController is the common interface to cgroup controllers available to +// the entire sentry. The controllers themselves are defined by cgroupfs. +// +// Callers of this interface are often unable access synchronization needed to +// ensure returned values remain valid. Some of values returned from this +// interface are thus snapshots in time, and may become stale. This is ok for +// many callers like procfs. +type CgroupController interface { + // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned + // value is valid for the lifetime of the controller. + Type() CgroupControllerType + + // Hierarchy returns the ID of the hierarchy this cgroup controller is + // attached to. Returned value is valid for the lifetime of the controller. + HierarchyID() uint32 + + // Filesystem returns the filesystem this controller is attached to. + // Returned value is valid for the lifetime of the controller. + Filesystem() *vfs.Filesystem + + // RootCgroup returns the root cgroup for this controller. Returned value is + // valid for the lifetime of the controller. + RootCgroup() Cgroup + + // NumCgroups returns the number of cgroups managed by this controller. 
+ // Returned value is a snapshot in time. + NumCgroups() uint64 + + // Enabled returns whether this controller is enabled. Returned value is a + // snapshot in time. + Enabled() bool +} + +// Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters +// a cgroup, it holds a reference on the underlying dentry pointing to the +// cgroup. +// +// +stateify savable +type Cgroup struct { + *kernfs.Dentry + CgroupImpl +} + +func (c *Cgroup) decRef() { + c.Dentry.DecRef(context.Background()) +} + +// Path returns the absolute path of c, relative to its hierarchy root. +func (c *Cgroup) Path() string { + return c.FSLocalPath() +} + +// HierarchyID returns the id of the hierarchy that contains this cgroup. +func (c *Cgroup) HierarchyID() uint32 { + // Note: a cgroup is guaranteed to have at least one controller. + return c.Controllers()[0].HierarchyID() +} + +// CgroupImpl is the common interface to cgroups. +type CgroupImpl interface { + Controllers() []CgroupController + Enter(t *Task) + Leave(t *Task) +} + +// hierarchy represents a cgroupfs filesystem instance, with a unique set of +// controllers attached to it. Multiple cgroupfs mounts may reference the same +// hierarchy. +// +// +stateify savable +type hierarchy struct { + id uint32 + // These are a subset of the controllers in CgroupRegistry.controllers, + // grouped here by hierarchy for conveninent lookup. + controllers map[CgroupControllerType]CgroupController + // fs is not owned by hierarchy. The FS is responsible for unregistering the + // hierarchy on destruction, which removes this association. + fs *vfs.Filesystem +} + +func (h *hierarchy) match(ctypes []CgroupControllerType) bool { + if len(ctypes) != len(h.controllers) { + return false + } + for _, ty := range ctypes { + if _, ok := h.controllers[ty]; !ok { + return false + } + } + return true +} + +// CgroupRegistry tracks the active set of cgroup controllers on the system. +// +// +stateify savable +type CgroupRegistry struct { + // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid + // ids are from 1 to math.MaxUint32. Must be accessed through atomic ops. + // + lastHierarchyID uint32 + + mu sync.Mutex `state:"nosave"` + + // controllers is the set of currently known cgroup controllers on the + // system. Protected by mu. + // + // +checklocks:mu + controllers map[CgroupControllerType]CgroupController + + // hierarchies is the active set of cgroup hierarchies. Protected by mu. + // + // +checklocks:mu + hierarchies map[uint32]hierarchy +} + +func newCgroupRegistry() *CgroupRegistry { + return &CgroupRegistry{ + controllers: make(map[CgroupControllerType]CgroupController), + hierarchies: make(map[uint32]hierarchy), + } +} + +// nextHierarchyID returns a newly allocated, unique hierarchy ID. +func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { + if hid := atomic.AddUint32(&r.lastHierarchyID, 1); hid != 0 { + return hid, nil + } + return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") +} + +// FindHierarchy returns a cgroup filesystem containing exactly the set of +// controllers named in names. If no such FS is found, FindHierarchy return +// nil. FindHierarchy takes a reference on the returned FS, which is transferred +// to the caller. 
+func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Filesystem { + r.mu.Lock() + defer r.mu.Unlock() + + for _, h := range r.hierarchies { + if h.match(ctypes) { + h.fs.IncRef() + return h.fs + } + } + + return nil +} + +// Register registers the provided set of controllers with the registry as a new +// hierarchy. If any controller is already registered, the function returns an +// error without modifying the registry. The hierarchy can be later referenced +// by the returned id. +func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if len(cs) == 0 { + return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers") + } + + for _, c := range cs { + if _, ok := r.controllers[c.Type()]; ok { + return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy") + } + } + + hid, err := r.nextHierarchyID() + if err != nil { + return hid, err + } + + h := hierarchy{ + id: hid, + controllers: make(map[CgroupControllerType]CgroupController), + fs: cs[0].Filesystem(), + } + for _, c := range cs { + n := c.Type() + r.controllers[n] = c + h.controllers[n] = c + } + r.hierarchies[hid] = h + return hid, nil +} + +// Unregister removes a previously registered hierarchy from the registry. If +// the controller was not previously registered, Unregister is a no-op. +func (r *CgroupRegistry) Unregister(hid uint32) { + r.mu.Lock() + defer r.mu.Unlock() + + if h, ok := r.hierarchies[hid]; ok { + for name, _ := range h.controllers { + delete(r.controllers, name) + } + delete(r.hierarchies, hid) + } +} + +// computeInitialGroups takes a reference on each of the returned cgroups. The +// caller takes ownership of this returned reference. +func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { + r.mu.Lock() + defer r.mu.Unlock() + + ctlSet := make(map[CgroupControllerType]CgroupController) + cgset := make(map[Cgroup]struct{}) + + // Remember controllers from the inherited cgroups set... + for cg, _ := range inherit { + cg.IncRef() // Ref transferred to caller. + for _, ctl := range cg.Controllers() { + ctlSet[ctl.Type()] = ctl + cgset[cg] = struct{}{} + } + } + + // ... and add the root cgroups of all the missing controllers. + for name, ctl := range r.controllers { + if _, ok := ctlSet[name]; !ok { + cg := ctl.RootCgroup() + cg.IncRef() // Ref transferred to caller. + cgset[cg] = struct{}{} + } + } + return cgset +} + +// GenerateProcCgroups writes the contents of /proc/cgroups to buf. 
+func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { + r.mu.Lock() + entries := make([]string, 0, len(r.controllers)) + for _, c := range r.controllers { + en := 0 + if c.Enabled() { + en = 1 + } + entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) + } + r.mu.Unlock() + + sort.Strings(entries) + fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") + for _, e := range entries { + fmt.Fprint(buf, e) + } +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 7ecbd29ab..564c3d42e 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fdnotifier", + "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 2aca02fd5..4466fbc9d 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -186,7 +187,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro e.wq.Notify(waiter.WritableEvents) var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) + hostarch.ByteOrder.PutUint64(buf[:], val) _, err := dst.CopyOut(ctx, buf[:]) return err } @@ -194,7 +195,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro // Must be called with e.mu locked. func (e *EventOperations) hostWrite(val uint64) error { var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) + hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(e.hostfd, buf[:]) if err == unix.EWOULDBLOCK { return syserror.ErrWouldBlock @@ -207,7 +208,7 @@ func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) err if _, err := src.CopyIn(ctx, buf[:]); err != nil { return err } - val := usermem.ByteOrder.Uint64(buf[:]) + val := hostarch.ByteOrder.Uint64(buf[:]) return e.Signal(val) } diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 041e3d4ca..a75686cf3 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -37,6 +37,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/hostarch", "//pkg/log", "//pkg/sentry/memmap", "//pkg/sync", @@ -52,8 +53,8 @@ go_test( library = ":futex", deps = [ "//pkg/context", + "//pkg/hostarch", "//pkg/sync", - "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index e4dcc4d40..0427cf3f4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,10 +20,10 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // KeyKind indicates the type of a Key. @@ -83,8 +83,8 @@ func (k *Key) clone() Key { } // Preconditions: k.Kind == KindPrivate or KindSharedPrivate. 
-func (k *Key) addr() usermem.Addr { - return usermem.Addr(k.Offset) +func (k *Key) addr() hostarch.Addr { + return hostarch.Addr(k.Offset) } // matches returns true if a wakeup on k2 should wake a waiter waiting on k. @@ -97,14 +97,14 @@ func (k *Key) matches(k2 *Key) bool { type Target interface { context.Context - // SwapUint32 gives access to usermem.IO.SwapUint32. - SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + // SwapUint32 gives access to hostarch.IO.SwapUint32. + SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) - // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. - CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + // CompareAndSwap gives access to hostarch.IO.CompareAndSwapUint32. + CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) - // LoadUint32 gives access to usermem.IO.LoadUint32. - LoadUint32(addr usermem.Addr) (uint32, error) + // LoadUint32 gives access to hostarch.IO.LoadUint32. + LoadUint32(addr hostarch.Addr) (uint32, error) // GetSharedKey returns a Key with kind KindSharedPrivate or // KindSharedMappable corresponding to the memory mapped at address addr. @@ -112,11 +112,11 @@ type Target interface { // If GetSharedKey returns a Key with a non-nil MappingIdentity, a // reference is held on the MappingIdentity, which must be dropped by the // caller when the Key is no longer in use. - GetSharedKey(addr usermem.Addr) (Key, error) + GetSharedKey(addr hostarch.Addr) (Key, error) } // check performs a basic equality check on the given address. -func check(t Target, addr usermem.Addr, val uint32) error { +func check(t Target, addr hostarch.Addr, val uint32) error { cur, err := t.LoadUint32(addr) if err != nil { return err @@ -128,7 +128,7 @@ func check(t Target, addr usermem.Addr, val uint32) error { } // atomicOp performs a complex operation on the given address. -func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { +func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { opType := (opIn >> 28) & 0xf cmp := (opIn >> 24) & 0xf opArg := (opIn >> 12) & 0xfff @@ -328,7 +328,7 @@ const ( ) // getKey returns a Key representing address addr in c. -func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { +func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { @@ -341,7 +341,7 @@ func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { } // bucketIndexForAddr returns the index into Manager.buckets for addr. -func bucketIndexForAddr(addr usermem.Addr) uintptr { +func bucketIndexForAddr(addr hostarch.Addr) uintptr { // - The bottom 2 bits of addr must be 0, per getKey. // // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 @@ -448,7 +448,7 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { // Wake wakes up to n waiters matching the bitmask on the given addr. // The number of waiters woken is returned. -func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) { +func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) { // This function is very hot; avoid defer. 
k, err := getKey(t, addr, private) if err != nil { @@ -463,7 +463,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 return r, nil } -func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { +func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { k1, err := getKey(t, addr, private) if err != nil { return 0, err @@ -498,14 +498,14 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch // Requeue wakes up to nwake waiters on the given addr, and unconditionally // requeues up to nreq waiters on naddr. -func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) { +func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) { return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) } // RequeueCmp atomically checks that the addr contains val (via the Target), // wakes up to nwake waiters on addr and then unconditionally requeues nreq // waiters on naddr. -func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { +func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) } @@ -513,7 +513,7 @@ func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, v // waiters unconditionally from addr1, and, based on the original value at addr2 // and a comparison encoded in op, wakes up to nwake2 waiters from addr2. // It returns the total number of waiters woken. -func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { +func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { k1, err := getKey(t, addr1, private) if err != nil { return 0, err @@ -553,7 +553,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the // Waiter must be subsequently removed by calling WaitComplete, whether or not // a wakeup is received on w.C. -func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error { +func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error { k, err := getKey(t, addr, private) if err != nil { return err @@ -631,7 +631,7 @@ func (m *Manager) WaitComplete(w *Waiter, t Target) { // FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see // exit_robust_list()). Given we don't support robust lists, although handled // below, it's never set. 
-func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { +func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) { k, err := getKey(t, addr, private) if err != nil { return false, err @@ -663,7 +663,7 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri return success, nil } -func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { +func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) { for { cur, err := t.LoadUint32(addr) if err != nil { @@ -724,7 +724,7 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 // The address provided must contain the caller's TID. If there are waiters, // TID of the next waiter (FIFO) is set to the given address, and the waiter // woken up. If there are no waiters, 0 is set to the address. -func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { +func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { return err @@ -738,7 +738,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool return err } -func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error { +func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error { cur, err := t.LoadUint32(addr) if err != nil { return err diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index ba7f95d8a..deba44e5c 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -23,8 +23,8 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/usermem" ) // testData implements the Target interface, and allows us to @@ -43,23 +43,23 @@ func newTestData(size uint) testData { } } -func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { +func (t testData) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } -func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { +func (t testData) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } -func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { +func (t testData) LoadUint32(addr hostarch.Addr) (uint32, error) { return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } -func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { +func (t testData) GetSharedKey(addr hostarch.Addr) (Key, error) { return Key{ Kind: KindSharedMappable, Offset: uint64(addr), @@ -73,7 +73,7 @@ func futexKind(private bool) string { return "shared" } -func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) *Waiter { +func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) *Waiter { w := 
NewWaiter() if err := m.WaitPrepare(w, ta, addr, private, val, bitmask); err != nil { t.Fatalf("WaitPrepare failed: %v", err) @@ -463,12 +463,12 @@ const ( // Beyond being used as a Locker, this is a simple mechanism for // changing the underlying values for simpler tests. type testMutex struct { - a usermem.Addr + a hostarch.Addr d testData m *Manager } -func newTestMutex(addr usermem.Addr, d testData, m *Manager) *testMutex { +func newTestMutex(addr hostarch.Addr, d testData, m *Manager) *testMutex { return &testMutex{a: addr, d: d, m: m} } diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go index 4fcdfc541..4b943106b 100644 --- a/pkg/sentry/kernel/kcov.go +++ b/pkg/sentry/kernel/kcov.go @@ -22,13 +22,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov @@ -130,7 +130,7 @@ func (kcov *Kcov) InitTrace(size uint64) error { // To simplify all the logic around mapping, we require that the length of the // shared region is a multiple of the system page size. - if (8*size)&(usermem.PageSize-1) != 0 { + if (8*size)&(hostarch.PageSize-1) != 0 { return syserror.EINVAL } @@ -286,7 +286,7 @@ func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { } // Get internal mappings. - bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read) + bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Read) if err != nil { return 0, err } @@ -314,7 +314,7 @@ func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } // Get internal mapping. - bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write) + bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Write) if err != nil { return 0, err } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 43065b45a..e6e9da898 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -294,6 +294,11 @@ type Kernel struct { // YAMAPtraceScope is the current level of YAMA ptrace restrictions. YAMAPtraceScope int32 + + // cgroupRegistry contains the set of active cgroup controllers on the + // system. It is controller by cgroupfs. Nil if cgroupfs is unavailable on + // the system. + cgroupRegistry *CgroupRegistry } // InitKernelArgs holds arguments to Init. @@ -438,6 +443,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.socketMount = socketMount k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) + + k.cgroupRegistry = newCgroupRegistry() } return nil } @@ -1815,6 +1822,11 @@ func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } +// CgroupRegistry returns the cgroup registry. +func (k *Kernel) CgroupRegistry() *CgroupRegistry { + return k.cgroupRegistry +} + // Release releases resources owned by k. // // Precondition: This should only be called after the kernel is fully @@ -1831,3 +1843,43 @@ func (k *Kernel) Release() { k.timekeeper.Destroy() k.vdso.Release(ctx) } + +// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup +// hierarchy. +// +// Precondition: root must be a new cgroup with no tasks. 
This implies the +// controllers for root are also new and currently manage no task, which in turn +// implies the new cgroup can be populated without migrating tasks between +// cgroups. +func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) { + k.tasks.mu.RLock() + k.tasks.forEachTaskLocked(func(t *Task) { + if t.exitState != TaskExitNone { + return + } + t.mu.Lock() + t.enterCgroupLocked(root) + t.mu.Unlock() + }) + k.tasks.mu.RUnlock() +} + +// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the +// hierarchy with the provided id. This is intended for use during hierarchy +// teardown, as otherwise the tasks would be orphaned w.r.t to some controllers. +func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) { + k.tasks.mu.RLock() + k.tasks.forEachTaskLocked(func(t *Task) { + if t.exitState != TaskExitNone { + return + } + t.mu.Lock() + for cg, _ := range t.cgroups { + if cg.HierarchyID() == hid { + t.leaveCgroupLocked(cg) + } + } + t.mu.Unlock() + }) + k.tasks.mu.RUnlock() +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index beba6d97d..34c617b08 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/hostarch", "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index d004f2357..06769931a 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -22,18 +22,18 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // MinimumPipeSize is a hard limit of the minimum size of a pipe. // It corresponds to fs/pipe.c:pipe_min_size. - MinimumPipeSize = usermem.PageSize + MinimumPipeSize = hostarch.PageSize // MaximumPipeSize is a hard limit on the maximum size of a pipe. // It corresponds to fs/pipe.c:pipe_max_size. @@ -41,7 +41,7 @@ const ( // DefaultPipeSize is the system-wide default size of a pipe in bytes. // It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS. - DefaultPipeSize = 16 * usermem.PageSize + DefaultPipeSize = 16 * hostarch.PageSize // atomicIOBytes is the maximum number of bytes that the pipe will // guarantee atomic reads or writes atomically. 
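Taken together, the CgroupRegistry and Kernel changes above suggest the mount-time flow cgroupfs is expected to follow: reuse an existing hierarchy whose controller set matches exactly, otherwise register a new hierarchy and move every task into its root cgroup. A hypothetical sketch, not part of this change (mountHierarchy, ctls and root are illustrative names):

package sketch

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

func mountHierarchy(k *kernel.Kernel, ctls []kernel.CgroupController, root kernel.Cgroup) (*vfs.Filesystem, error) {
	r := k.CgroupRegistry()

	ctypes := make([]kernel.CgroupControllerType, 0, len(ctls))
	for _, c := range ctls {
		ctypes = append(ctypes, c.Type())
	}

	// FindHierarchy transfers a filesystem reference to the caller on success.
	if fs := r.FindHierarchy(ctypes); fs != nil {
		return fs, nil
	}

	// No hierarchy with exactly this controller set exists yet: register one
	// and place all existing tasks in its root cgroup.
	if _, err := r.Register(ctls); err != nil {
		return nil, err
	}
	k.PopulateNewCgroupHierarchy(root)
	return ctls[0].Filesystem(), nil
}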
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index e524afad5..95b948edb 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -17,6 +17,7 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -274,7 +275,7 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti } src := usermem.IOSequence{ IO: fd, - Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}), } var ( @@ -302,7 +303,7 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { dst := usermem.IOSequence{ IO: fd, - Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}), } var ( @@ -328,7 +329,7 @@ func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescript // fd.pipe.Notify(waiter.WritableEvents) after the read is completed. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) { n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs) }) @@ -340,7 +341,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, // is completed. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { +func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) { n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) }) @@ -350,7 +351,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, // ZeroOut implements usermem.IO.ZeroOut. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) { return safemem.ZeroSeq(dsts) }) @@ -362,7 +363,7 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6 // fd.pipe.Notify(waiter.WritableEvents) after the read is completed. // // Preconditions: fd.pipe.mu must be locked. 
-func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) { return dst.WriteFromBlocks(srcs) }) @@ -373,25 +374,25 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst // is completed. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { +func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { return fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { return src.ReadToBlocks(dsts) }) } // SwapUint32 implements usermem.IO.SwapUint32. -func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { // How did a pipe get passed as the virtual address space to futex(2)? panic("VFSPipeFD.SwapUint32 called unexpectedly") } // CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. -func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") } // LoadUint32 implements usermem.IO.LoadUint32. -func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) { panic("VFSPipeFD.LoadUint32 called unexpectedly") } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index f5a60e749..57c7659e7 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -1011,7 +1012,7 @@ func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { } // Ptrace implements the ptrace system call. -func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // PTRACE_TRACEME ignores all other arguments. if req == linux.PTRACE_TRACEME { return t.ptraceTraceme() @@ -1190,7 +1191,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { panic(fmt.Sprintf("%#x + %#x overflows. 
Invalid reg size > %#x", ar.Start, n, ar.Length())) } ar.End = end - return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_SETREGSET: ars, err := t.CopyInIovecs(data, 1) @@ -1214,8 +1215,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return err } t.p.FullStateChanged() - ar.End -= usermem.Addr(n) - return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + ar.End -= hostarch.Addr(n) + return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_GETSIGINFO: t.tg.pidns.owner.mu.RLock() @@ -1267,7 +1268,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() - _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg) + _, err := primitive.CopyUint64Out(t, hostarch.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to have no users anywhere. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 7aea3dcd8..5ae05b5c3 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -18,12 +18,13 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. -func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { +func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { switch req { case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER n, err := target.Arch().PtracePeekUser(uintptr(addr)) diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index d971b96b3..46dd84cbc 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,11 +17,11 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. -func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { +func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { return syserror.EIO } diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 2a9023fdf..4bc5bca44 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -43,8 +44,8 @@ type OldRSeqCriticalRegion struct { // application handler while its instruction pointer is in CriticalSection, // set the instruction pointer to Restart and application register r10 (on // amd64) to the former instruction pointer. - CriticalSection usermem.AddrRange - Restart usermem.Addr + CriticalSection hostarch.AddrRange + Restart hostarch.Addr } // RSeqAvailable returns true if t supports (old and new) restartable sequences. @@ -55,7 +56,7 @@ func (t *Task) RSeqAvailable() bool { // SetRSeq registers addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. 
-func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { +func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr != 0 { if t.rseqAddr != addr { return syserror.EINVAL @@ -100,7 +101,7 @@ func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { // ClearRSeq unregisters addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error { +func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr == 0 { return syserror.EINVAL } @@ -166,7 +167,7 @@ func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { // CPU number. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) OldRSeqCPUAddr() usermem.Addr { +func (t *Task) OldRSeqCPUAddr() hostarch.Addr { return t.oldRSeqCPUAddr } @@ -177,7 +178,7 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr { // * t.RSeqAvailable() == true. // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. -func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { +func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { t.oldRSeqCPUAddr = addr // Check that addr is writable. @@ -221,7 +222,7 @@ func (t *Task) oldRSeqCopyOutCPU() error { } buf := t.CopyScratchBuffer(4) - usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) return err } @@ -236,8 +237,8 @@ func (t *Task) rseqCopyOutCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. - usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart - usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID + hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart + hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. @@ -251,8 +252,8 @@ func (t *Task) rseqCopyOutCPU() error { func (t *Task) rseqClearCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. - usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart - usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID + hostarch.ByteOrder.PutUint32(buf, 0) // CPUIDStart + hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. 
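The rseq hunks above pack CPUIDStart and CPUID, the first two uint32 fields of linux.RSeq, into an 8-byte scratch buffer in native byte order before copying it out to userspace. A minimal standalone sketch of that layout (packRSeqCPU is an illustrative helper, not part of the change):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/hostarch"
)

// packRSeqCPU mirrors the layout written by rseqCopyOutCPU: CPUIDStart in the
// first four bytes, CPUID in the next four, both in native byte order.
func packRSeqCPU(cpu int32) []byte {
	buf := make([]byte, 8)
	hostarch.ByteOrder.PutUint32(buf[0:4], uint32(cpu)) // CPUIDStart
	hostarch.ByteOrder.PutUint32(buf[4:8], uint32(cpu)) // CPUID
	return buf
}

func main() {
	fmt.Printf("% x\n", packRSeqCPU(3))
}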
@@ -305,7 +306,7 @@ func (t *Task) rseqAddrInterrupt() { return } - critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf)) + critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf)) if critAddr == 0 { return } @@ -325,7 +326,7 @@ func (t *Task) rseqAddrInterrupt() { return } - start := usermem.Addr(cs.Start) + start := hostarch.Addr(cs.Start) critRange, ok := start.ToRange(cs.PostCommitOffset) if !ok { t.Debugf("Invalid start and offset in %+v", cs) @@ -334,7 +335,7 @@ func (t *Task) rseqAddrInterrupt() { return } - abort := usermem.Addr(cs.Abort) + abort := hostarch.Addr(cs.Abort) if critRange.Contains(abort) { t.Debugf("Abort in critical section in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) @@ -353,7 +354,7 @@ func (t *Task) rseqAddrInterrupt() { return } - sig := usermem.ByteOrder.Uint32(buf) + sig := hostarch.ByteOrder.Uint32(buf) if sig != t.rseqSignature { t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) t.forceSignal(linux.SIGSEGV, false /* unconditional */) @@ -376,7 +377,7 @@ func (t *Task) rseqAddrInterrupt() { } // Finally we can actually decide whether or not to restart. - if !critRange.Contains(usermem.Addr(t.Arch().IP())) { + if !critRange.Contains(hostarch.Addr(t.Arch().IP())) { return } @@ -386,7 +387,7 @@ func (t *Task) rseqAddrInterrupt() { // Preconditions: The caller must be running on the task goroutine. func (t *Task) oldRSeqInterrupt() { r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) - if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) { + if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) { t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) t.Arch().SetIP(uintptr(r.Restart)) t.Arch().SetOldRSeqInterruptedIP(ip) diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 8163a6132..a95e174a2 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -18,9 +18,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) const maxSyscallFilterInstructions = 1 << 15 @@ -35,11 +35,11 @@ func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { return bpf.InputBytes{ Data: buf, // Go-marshal always uses the native byte order. - Order: usermem.ByteOrder, + Order: hostarch.ByteOrder, } } -func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { +func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *arch.SignalInfo { si := &arch.SignalInfo{ Signo: int32(linux.SIGSYS), Errno: errno, @@ -56,7 +56,7 @@ func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalIn // in because vsyscalls do not use the values in t.Arch().) // // Preconditions: The caller must be running on the task goroutine. 
-func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction { +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction { result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) action := result & linux.SECCOMP_RET_ACTION switch action { @@ -102,7 +102,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u return action } -func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 { data := linux.SeccompData{ Nr: sysno, Arch: t.image.st.AuditNumber, diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 073e14507..1c3c0794f 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -28,6 +28,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/hostarch", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 92d60ba78..a73f1bdca 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -38,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -47,7 +48,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // Key represents a shm segment key. Analogous to a file name. @@ -197,13 +197,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui } var sizeAligned uint64 - if val, ok := usermem.Addr(size).RoundUp(); ok { + if val, ok := hostarch.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { return nil, syserror.EINVAL } - if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { + if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) @@ -232,7 +232,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } - effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + effectiveSize := uint64(hostarch.Addr(size).MustRoundUp()) fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) if err != nil { return nil, err @@ -267,7 +267,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi r.shms[id] = shm r.keysToShms[key] = shm - r.totalPages += effectiveSize / usermem.PageSize + r.totalPages += effectiveSize / hostarch.PageSize return shm, nil } @@ -318,7 +318,7 @@ func (r *Registry) remove(s *Shm) { } delete(r.shms, s.ID) - r.totalPages -= s.effectiveSize / usermem.PageSize + r.totalPages -= s.effectiveSize / hostarch.PageSize } // Release drops the self-reference of each active shm segment in the registry. @@ -386,7 +386,7 @@ type Shm struct { // effectiveSize of the segment, rounding up to the next page // boundary. Immutable. // - // Invariant: effectiveSize must be a multiple of usermem.PageSize. 
+ // Invariant: effectiveSize must be a multiple of hostarch.PageSize. effectiveSize uint64 // fr is the offset into mfp.MemoryFile() that backs this contents of this @@ -467,7 +467,7 @@ func (s *Shm) Msync(context.Context, memmap.MappableRange) error { } // AddMapping implements memmap.Mappable.AddMapping. -func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error { +func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error { s.mu.Lock() defer s.mu.Unlock() s.attachTime = ktime.NowFromContext(ctx) @@ -482,7 +482,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { +func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() // RemoveMapping may be called during task exit, when ctx @@ -503,12 +503,12 @@ func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ userme } // CopyMapping implements memmap.Mappable.CopyMapping. -func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { +func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { return nil } // Translate implements memmap.Mappable.Translate. -func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { err = &memmap.BusError{syserror.EFAULT} @@ -519,7 +519,7 @@ func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableR Source: source, File: s.mfp.MemoryFile(), Offset: s.fr.Start + source.Start, - Perms: usermem.AnyAccess, + Perms: hostarch.AnyAccess, }, }, err } @@ -543,7 +543,7 @@ type AttachOpts struct { // // Postconditions: The returned MMapOpts are valid only as long as a reference // continues to be held on s. -func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { +func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { @@ -565,12 +565,12 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac Offset: 0, Addr: addr, Fixed: opts.Remap, - Perms: usermem.AccessType{ + Perms: hostarch.AccessType{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }, - MaxPerms: usermem.AnyAccess, + MaxPerms: hostarch.AnyAccess, Mappable: s, MappingIdentity: s, }, nil diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 332bdb8e8..953d4310e 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -20,9 +20,9 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/usermem" ) // maxSyscallNum is the highest supported syscall number. 
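The shm hunks above round the requested segment size up to a page boundary with hostarch.Addr and charge the result, in pages, against the system-wide SHMALL limit. A minimal sketch of that accounting (segmentPages is an illustrative helper, not part of the change):

package sketch

import (
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/syserror"
)

// segmentPages returns the number of pages a segment of the given size
// occupies once rounded up to a page boundary, as Registry.FindOrCreate does
// before checking the SHMALL limit.
func segmentPages(size uint64) (uint64, error) {
	val, ok := hostarch.Addr(size).RoundUp()
	if !ok {
		// Rounding up overflowed; reject the request, mirroring FindOrCreate.
		return 0, syserror.EINVAL
	}
	return uint64(val) / hostarch.PageSize, nil
}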
@@ -243,7 +243,7 @@ type SyscallTable struct { // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. - Emulate map[usermem.Addr]uintptr + Emulate map[hostarch.Addr]uintptr // The function to call in case of a missing system call. Missing MissingFn @@ -316,7 +316,7 @@ func (s *SyscallTable) Init() { } if s.Emulate == nil { // Ensure non-nil emulate table. - s.Emulate = make(map[usermem.Addr]uintptr) + s.Emulate = make(map[hostarch.Addr]uintptr) } max := s.MaxSysno() // Checked during RegisterSyscallTable. @@ -359,7 +359,7 @@ func (s *SyscallTable) LookupNo(name string) (uintptr, error) { } // LookupEmulate looks up an emulation syscall number. -func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { +func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] return sysno, ok } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 36141dd09..be1371855 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -33,7 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -470,7 +470,7 @@ type Task struct { // ThreadID to 0, and wake any futex waiters. // // cleartid is exclusive to the task goroutine. - cleartid usermem.Addr + cleartid hostarch.Addr // This is mostly a fake cpumask just for sched_set/getaffinity as we // don't really control the affinity. @@ -540,12 +540,12 @@ type Task struct { // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. // // oldRSeqCPUAddr is exclusive to the task goroutine. - oldRSeqCPUAddr usermem.Addr + oldRSeqCPUAddr hostarch.Addr // rseqAddr is a pointer to the userspace linux.RSeq structure. // // rseqAddr is exclusive to the task goroutine. - rseqAddr usermem.Addr + rseqAddr hostarch.Addr // rseqSignature is the signature that the rseq abort IP must be signed // with. @@ -575,7 +575,7 @@ type Task struct { // robustList is a pointer to the head of the tasks's robust futex // list. - robustList usermem.Addr + robustList hostarch.Addr // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). @@ -587,6 +587,12 @@ type Task struct { // // kcov is exclusive to the task goroutine. kcov *Kcov + + // cgroups is the set of cgroups this task belongs to. This may be empty if + // no cgroup controllers are enabled. Protected by mu. + // + // +checklocks:mu + cgroups map[Cgroup]struct{} } func (t *Task) savePtraceTracer() *Task { @@ -652,7 +658,7 @@ func (t *Task) Kernel() *Kernel { // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) SetClearTID(addr usermem.Addr) { +func (t *Task) SetClearTID(addr hostarch.Addr) { t.cleartid = addr } diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go new file mode 100644 index 000000000..25d2504fa --- /dev/null +++ b/pkg/sentry/kernel/task_cgroup.go @@ -0,0 +1,138 @@ +// Copyright 2021 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "strings" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/syserror" +) + +// EnterInitialCgroups moves t into an initial set of cgroups. +// +// Precondition: t isn't in any cgroups yet, t.cgs is empty. +// +// +checklocksignore parent.mu is conditionally acquired. +func (t *Task) EnterInitialCgroups(parent *Task) { + var inherit map[Cgroup]struct{} + if parent != nil { + parent.mu.Lock() + defer parent.mu.Unlock() + inherit = parent.cgroups + } + joinSet := t.k.cgroupRegistry.computeInitialGroups(inherit) + + t.mu.Lock() + defer t.mu.Unlock() + // Transfer ownership of joinSet refs to the task's cgset. + t.cgroups = joinSet + for c, _ := range t.cgroups { + // Since t isn't in any cgroup yet, we can skip the check against + // existing cgroups. + c.Enter(t) + } +} + +// EnterCgroup moves t into c. +func (t *Task) EnterCgroup(c Cgroup) error { + newControllers := make(map[CgroupControllerType]struct{}) + for _, ctl := range c.Controllers() { + newControllers[ctl.Type()] = struct{}{} + } + + t.mu.Lock() + defer t.mu.Unlock() + + for oldCG, _ := range t.cgroups { + for _, oldCtl := range oldCG.Controllers() { + if _, ok := newControllers[oldCtl.Type()]; ok { + // Already in a cgroup with the same controller as one of the + // new ones. Requires migration between cgroups. + // + // TODO(b/183137098): Implement cgroup migration. + log.Warningf("Cgroup migration is not implemented") + return syserror.EBUSY + } + } + } + + // No migration required. + t.enterCgroupLocked(c) + + return nil +} + +// +checklocks:t.mu +func (t *Task) enterCgroupLocked(c Cgroup) { + c.IncRef() + t.cgroups[c] = struct{}{} + c.Enter(t) +} + +// LeaveCgroups removes t out from all its cgroups. +func (t *Task) LeaveCgroups() { + t.mu.Lock() + defer t.mu.Unlock() + for c, _ := range t.cgroups { + t.leaveCgroupLocked(c) + } +} + +// +checklocks:t.mu +func (t *Task) leaveCgroupLocked(c Cgroup) { + c.Leave(t) + delete(t.cgroups, c) + c.decRef() +} + +// taskCgroupEntry represents a line in /proc/<pid>/cgroup, and is used to +// format a cgroup for display. +type taskCgroupEntry struct { + hierarchyID uint32 + controllers string + path string +} + +// GenerateProcTaskCgroup writes the contents of /proc/<pid>/cgroup for t to buf. +func (t *Task) GenerateProcTaskCgroup(buf *bytes.Buffer) { + t.mu.Lock() + defer t.mu.Unlock() + + cgEntries := make([]taskCgroupEntry, 0, len(t.cgroups)) + for c, _ := range t.cgroups { + ctls := c.Controllers() + ctlNames := make([]string, 0, len(ctls)) + for _, ctl := range ctls { + ctlNames = append(ctlNames, string(ctl.Type())) + } + + cgEntries = append(cgEntries, taskCgroupEntry{ + // Note: We're guaranteed to have at least one controller, and all + // controllers are guaranteed to be on the same hierarchy. 
+ hierarchyID: ctls[0].HierarchyID(), + controllers: strings.Join(ctlNames, ","), + path: c.Path(), + }) + } + + sort.Slice(cgEntries, func(i, j int) bool { return cgEntries[i].hierarchyID > cgEntries[j].hierarchyID }) + for _, cgE := range cgEntries { + fmt.Fprintf(buf, "%d:%s:%s\n", cgE.hierarchyID, cgE.controllers, cgE.path) + } +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index f305e69c0..405771f3f 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -85,12 +86,12 @@ type CloneOptions struct { // Stack is the initial stack pointer of the new task. If Stack is 0, the // new task will start with the same stack pointer as its parent. - Stack usermem.Addr + Stack hostarch.Addr // If SetTLS is true, set the new task's TLS (thread-local storage) // descriptor to TLS. If SetTLS is false, TLS is ignored. SetTLS bool - TLS usermem.Addr + TLS hostarch.Addr // If ChildClearTID is true, when the child exits, 0 is written to the // address ChildTID in the child's memory, and if the write is successful a @@ -101,7 +102,7 @@ type CloneOptions struct { // Linux, failed writes are silently ignored.) ChildClearTID bool ChildSetTID bool - ChildTID usermem.Addr + ChildTID hostarch.Addr // If ParentSetTID is true, the child's thread ID (in the parent's PID // namespace) is written to address ParentTID in the parent's memory. (As @@ -112,7 +113,7 @@ type CloneOptions struct { // and child's memory, but this is a documentation error fixed by // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). ParentSetTID bool - ParentTID usermem.Addr + ParentTID hostarch.Addr // If Vfork is true, place the parent in vforkStop until the cloned task // releases its TaskImage. @@ -268,7 +269,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } tg := t.tg - rseqAddr := usermem.Addr(0) + rseqAddr := hostarch.Addr(0) rseqSignature := uint32(0) if opts.NewThreadGroup { if tg.mounts != nil { diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index ad59e4f60..b1af1a7ef 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -275,6 +275,10 @@ func (*runExitMain) execute(t *Task) taskRunState { t.fsContext.DecRef(t) t.fdTable.DecRef(t) + // Detach task from all cgroups. This must happen before potentially the + // last ref to the cgroupfs mount is dropped below. + t.LeaveCgroups() + t.mu.Lock() if t.mountNamespaceVFS2 != nil { t.mountNamespaceVFS2.DecRef(t) diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 195c7da9b..4dc41b82b 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" @@ -30,33 +31,33 @@ func (t *Task) Futex() *futex.Manager { } // SwapUint32 implements futex.Target.SwapUint32. 
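// GenerateProcTaskCgroup, completed just above, renders /proc/<pid>/cgroup
// as "<hierarchy-id>:<controller-list>:<path>" lines sorted by descending
// hierarchy ID. This stand-alone sketch shows only that formatting step;
// taskCgroupEntry is redeclared locally and the entry values are invented.
package main

import (
	"bytes"
	"fmt"
	"sort"
)

type taskCgroupEntry struct {
	hierarchyID uint32
	controllers string
	path        string
}

func main() {
	// Hypothetical task in two single-controller hierarchies.
	entries := []taskCgroupEntry{
		{hierarchyID: 1, controllers: "memory", path: "/app"},
		{hierarchyID: 2, controllers: "cpu", path: "/app/worker"},
	}

	// Highest hierarchy ID first, as in the kernel-side code.
	sort.Slice(entries, func(i, j int) bool { return entries[i].hierarchyID > entries[j].hierarchyID })

	var buf bytes.Buffer
	for _, e := range entries {
		fmt.Fprintf(&buf, "%d:%s:%s\n", e.hierarchyID, e.controllers, e.path)
	}
	fmt.Print(buf.String())
	// Output:
	// 2:cpu:/app/worker
	// 1:memory:/app
}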
-func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { +func (t *Task) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. -func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { +func (t *Task) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // LoadUint32 implements futex.Target.LoadUint32. -func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { +func (t *Task) LoadUint32(addr hostarch.Addr) (uint32, error) { return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ AddressSpaceActive: true, }) } // GetSharedKey implements futex.Target.GetSharedKey. -func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { +func (t *Task) GetSharedKey(addr hostarch.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } // GetRobustList sets the robust futex list for the task. -func (t *Task) GetRobustList() usermem.Addr { +func (t *Task) GetRobustList() hostarch.Addr { t.mu.Lock() addr := t.robustList t.mu.Unlock() @@ -64,7 +65,7 @@ func (t *Task) GetRobustList() usermem.Addr { } // SetRobustList sets the robust futex list for the task. -func (t *Task) SetRobustList(addr usermem.Addr) { +func (t *Task) SetRobustList(addr hostarch.Addr) { t.mu.Lock() t.robustList = addr t.mu.Unlock() @@ -84,28 +85,28 @@ func (t *Task) exitRobustList() { } var rl linux.RobustListHead - if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + if _, err := rl.CopyIn(t, hostarch.Addr(addr)); err != nil { return } next := primitive.Uint64(rl.List) done := 0 - var pendingLockAddr usermem.Addr + var pendingLockAddr hostarch.Addr if rl.ListOpPending != 0 { - pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + pendingLockAddr = hostarch.Addr(rl.ListOpPending + rl.FutexOffset) } // Wake up normal elements. - for usermem.Addr(next) != addr { + for hostarch.Addr(next) != addr { // We traverse to the next element of the list before we // actually wake anything. This prevents the race where waking // this futex causes a modification of the list. - thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset) + thisLockAddr := hostarch.Addr(uint64(next) + rl.FutexOffset) // Try to decode the next element in the list before waking the // current futex. But don't check the error until after we've // woken the current futex. Linux does it in this order too - _, nextErr := next.CopyIn(t, usermem.Addr(next)) + _, nextErr := next.CopyIn(t, hostarch.Addr(next)) // Wakeup the current futex if it's not pending. if thisLockAddr != pendingLockAddr { @@ -133,7 +134,7 @@ func (t *Task) exitRobustList() { } // wakeRobustListOne wakes a single futex from the robust list. -func (t *Task) wakeRobustListOne(addr usermem.Addr) { +func (t *Task) wakeRobustListOne(addr hostarch.Addr) { // Bit 0 in address signals PI futex. 
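// exitRobustList above walks the userspace robust futex list on exit: each
// entry's futex word lives at entry address + FutexOffset, and the entry
// named by ListOpPending is skipped because the dying task never finished
// acquiring it. The sketch below models only that address arithmetic; plain
// uint64 values replace hostarch.Addr, the entry addresses are hypothetical,
// and reading the list from user memory is omitted.
package main

import "fmt"

// robustListHead carries the two fields of linux.RobustListHead used here.
type robustListHead struct {
	FutexOffset   uint64 // offset from a list entry to its futex word
	ListOpPending uint64 // entry currently being acquired, or 0
}

// lockAddrs returns the futex-word addresses that would be woken for the
// given entry addresses, skipping the pending entry, as exitRobustList does.
func lockAddrs(rl robustListHead, entries []uint64) []uint64 {
	var pending uint64
	if rl.ListOpPending != 0 {
		pending = rl.ListOpPending + rl.FutexOffset
	}
	var out []uint64
	for _, e := range entries {
		if lock := e + rl.FutexOffset; lock != pending {
			out = append(out, lock)
		}
	}
	return out
}

func main() {
	rl := robustListHead{FutexOffset: 16, ListOpPending: 0x3000}
	fmt.Println(lockAddrs(rl, []uint64{0x1000, 0x2000, 0x3000})) // [4112 8208]
}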
pi := addr&1 == 1 addr = addr &^ 1 diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go index ce5fbd299..bd5543d4e 100644 --- a/pkg/sentry/kernel/task_image.go +++ b/pkg/sentry/kernel/task_image.go @@ -19,12 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" ) var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) @@ -129,7 +129,7 @@ func (t *Task) Stack() *arch.Stack { return &arch.Stack{ Arch: t.Arch(), IO: t.MemoryManager(), - Bottom: usermem.Addr(t.Arch().Stack()), + Bottom: hostarch.Addr(t.Arch().Stack()), } } diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index c70e5e6ce..72b9a0384 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -20,6 +20,7 @@ import ( "sort" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -108,9 +109,9 @@ func (t *Task) debugDumpStack() { return } t.Debugf("Stack:") - start := usermem.Addr(t.Arch().Stack()) + start := hostarch.Addr(t.Arch().Stack()) // Round addr down to a 16-byte boundary. - start &= ^usermem.Addr(15) + start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) @@ -127,7 +128,7 @@ func (t *Task) debugDumpStack() { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { - t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } @@ -147,9 +148,9 @@ func (t *Task) debugDumpCode() { } t.Debugf("Code:") // Print code on both sides of the instruction register. - start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 + start := hostarch.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 // Round addr down to a 16-byte boundary. - start &= ^usermem.Addr(15) + start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) @@ -166,7 +167,7 @@ func (t *Task) debugDumpCode() { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { - t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 3ccecf4b6..068f25af1 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -23,13 +23,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/goid" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // A taskRunState is a reified state in the task state machine. 
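// debugDumpStack and debugDumpCode above round the dump's starting address
// down to a 16-byte boundary before printing 16 bytes per line. A minimal
// sketch of that masking step; Addr is a local stand-in for hostarch.Addr
// and the example address is arbitrary.
package main

import "fmt"

type Addr uintptr

func main() {
	start := Addr(0x7f1a2b)
	// Clear the low four bits, equivalent to start &= ^hostarch.Addr(15).
	aligned := start &^ Addr(15)
	fmt.Printf("%#x -> %#x\n", start, aligned) // 0x7f1a2b -> 0x7f1a20
}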
See README.md @@ -148,7 +148,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error { region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) - _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found) + _, err := t.CopyInBytes(hostarch.Addr(t.Arch().IP()), found) if err == nil && bytes.Equal(expected, found) { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) @@ -307,8 +307,8 @@ func (app *runApp) execute(t *Task) taskRunState { // normally. if at.Any() { region := trace.StartRegion(t.traceContext, faultRegion) - addr := usermem.Addr(info.Addr()) - err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + addr := hostarch.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack())) region.End() if err == nil { // The fault was handled appropriately. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 75af3af79..c2b9fc08f 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -23,11 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -243,7 +243,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Are executing on the main stack, // or the provided alternate stack? - sp := usermem.Addr(t.Arch().Stack()) + sp := hostarch.Addr(t.Arch().Stack()) // N.B. This is a *copy* of the alternate stack that the user's signal // handler expects to see in its ucontext (even if it's not in use). @@ -251,7 +251,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) if act.IsOnStack() && alt.IsEnabled() { alt.SetOnStack() if !alt.Contains(sp) { - sp = usermem.Addr(alt.Top()) + sp = hostarch.Addr(alt.Top()) } } @@ -652,7 +652,7 @@ func (t *Task) SignalStack() arch.SignalStack { // onSignalStack returns true if the task is executing on the given signal stack. func (t *Task) onSignalStack(alt arch.SignalStack) bool { - sp := usermem.Addr(t.Arch().Stack()) + sp := hostarch.Addr(t.Arch().Stack()) return alt.Contains(sp) } @@ -720,7 +720,7 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a // CopyOutSignalAct converts the given SignalAct into an architecture-specific // type and then copies it out to task memory. -func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { +func (t *Task) CopyOutSignalAct(addr hostarch.Addr, s *arch.SignalAct) error { n := t.Arch().NewSignalAct() n.SerializeFrom(s) _, err := n.CopyOut(t, addr) @@ -729,7 +729,7 @@ func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { // CopyInSignalAct copies an architecture-specific sigaction type from task // memory and then converts it into a SignalAct. 
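// deliverSignalToHandler above selects the stack pointer for the signal
// frame: keep the current sp unless the action asked for the alternate stack
// (SA_ONSTACK), the alternate stack is enabled, and sp is not already inside
// it, in which case the frame starts at the alternate stack's top. This is a
// hedged, self-contained model of that decision only; the bounds check is
// simplified and does not reproduce arch.SignalStack.Contains exactly, and
// all addresses are invented.
package main

import "fmt"

type altStack struct {
	base, size uint64
	enabled    bool
}

func (s altStack) contains(sp uint64) bool { return sp > s.base && sp <= s.base+s.size }
func (s altStack) top() uint64             { return s.base + s.size }

// pickSP mirrors the sp selection in deliverSignalToHandler.
func pickSP(sp uint64, onStack bool, alt altStack) uint64 {
	if onStack && alt.enabled && !alt.contains(sp) {
		return alt.top()
	}
	return sp
}

func main() {
	alt := altStack{base: 0x20000, size: 0x8000, enabled: true}
	fmt.Printf("%#x\n", pickSP(0x7f000, true, alt))  // 0x28000: switch to the alternate stack
	fmt.Printf("%#x\n", pickSP(0x24000, true, alt))  // 0x24000: already on it
	fmt.Printf("%#x\n", pickSP(0x7f000, false, alt)) // 0x7f000: handler did not ask for it
}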
-func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { +func (t *Task) CopyInSignalAct(addr hostarch.Addr) (arch.SignalAct, error) { n := t.Arch().NewSignalAct() var s arch.SignalAct if _, err := n.CopyIn(t, addr); err != nil { @@ -741,7 +741,7 @@ func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { // CopyOutSignalStack converts the given SignalStack into an // architecture-specific type and then copies it out to task memory. -func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { +func (t *Task) CopyOutSignalStack(addr hostarch.Addr, s *arch.SignalStack) error { n := t.Arch().NewSignalStack() n.SerializeFrom(s) _, err := n.CopyOut(t, addr) @@ -750,7 +750,7 @@ func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error // CopyInSignalStack copies an architecture-specific stack_t from task memory // and then converts it into a SignalStack. -func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { +func (t *Task) CopyInSignalStack(addr hostarch.Addr) (arch.SignalStack, error) { n := t.Arch().NewSignalStack() var s arch.SignalStack if _, err := n.CopyIn(t, addr); err != nil { diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 36e1384f1..32031cd70 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // TaskConfig defines the configuration of a new Task (see below). @@ -86,7 +86,7 @@ type TaskConfig struct { MountNamespaceVFS2 *vfs.MountNamespace // RSeqAddr is a pointer to the the userspace linux.RSeq structure. - RSeqAddr usermem.Addr + RSeqAddr hostarch.Addr // RSeqSignature is the signature that the rseq abort IP must be signed // with. @@ -151,6 +151,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, + cgroups: make(map[Cgroup]struct{}), } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu @@ -189,6 +190,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { t.parent.children[t] = struct{}{} } + if VFS2Enabled { + t.EnterInitialCgroups(t.parent) + } + if tg.leader == nil { // New thread group. tg.leader = t diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 2e84bd88a..2c658d001 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -22,12 +22,12 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") @@ -153,7 +153,7 @@ func (t *Task) doSyscall() taskRunState { // Check seccomp filters. 
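// newTask above now allocates an empty cgroup set for every task and, when
// VFS2 is enabled, immediately joins the new task to its initial cgroups via
// EnterInitialCgroups(t.parent). The sketch below models only the
// inherit-from-parent shape of that step; cgroups are plain strings, and the
// registry lookup (computeInitialGroups) that supplies root cgroups when
// there is no parent is deliberately elided.
package main

import "fmt"

// enterInitialCgroups returns the cgroup set a new task starts in, copied
// from its parent when one exists.
func enterInitialCgroups(parent map[string]struct{}) map[string]struct{} {
	cgs := make(map[string]struct{})
	for c := range parent {
		cgs[c] = struct{}{}
	}
	return cgs
}

func main() {
	parent := map[string]struct{}{"memory:/app": {}, "cpu:/app": {}}
	fmt.Println(len(enterInitialCgroups(parent))) // 2: child starts in the parent's cgroups
	fmt.Println(len(enterInitialCgroups(nil)))    // 0 here; the real code consults the registry instead
}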
The nil check is for performance (as seccomp use // is rare), not needed for correctness. if t.syscallFilters.Load() != nil { - switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r { case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: t.Debugf("Syscall %d: denied by seccomp", sysno) return (*runSyscallExit)(nil) @@ -283,12 +283,12 @@ func (*runSyscallExit) execute(t *Task) taskRunState { // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as // indicated by an execution fault at address addr. doVsyscall returns the // task's next run state. -func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { +func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState { vsyscallCount.Increment() // Grab the caller up front, to make sure there's a sensible stack. caller := t.Arch().Native(uintptr(0)) - if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil { + if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil { t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) @@ -322,7 +322,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { } type runVsyscallAfterPtraceEventSeccomp struct { - addr usermem.Addr + addr hostarch.Addr sysno uintptr caller marshal.Marshallable } @@ -337,7 +337,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip // causes do_exit(SIGSYS), and changing sp is ignored. - if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr { t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) return (*runExit)(nil) } diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 94dabbcd8..fc6d9438a 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -27,7 +28,7 @@ import ( // MAX_RW_COUNT is the maximum size in bytes of a single read or write. // Reads and writes that exceed this size may be silently truncated. // (Linux: include/linux/fs.h:MAX_RW_COUNT) -var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) +var MAX_RW_COUNT = int(hostarch.Addr(math.MaxInt32).RoundDown()) // Activate ensures that the task has an active address space. func (t *Task) Activate() { @@ -49,7 +50,7 @@ func (t *Task) Deactivate() { // data without reflection and pass in a byte slice. // // This Task's AddressSpace must be active. -func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { +func (t *Task) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -59,7 +60,7 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { // data without reflection and pass in a byte slice. // // This Task's AddressSpace must be active. 
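// task_usermem.go above defines MAX_RW_COUNT as math.MaxInt32 rounded down
// to a page boundary, the same value as Linux's INT_MAX & PAGE_MASK. A tiny
// sketch of that computation; 4096 is written out as a stand-in for
// hostarch.PageSize.
package main

import (
	"fmt"
	"math"
)

const pageSize = 4096 // stand-in for hostarch.PageSize

func main() {
	maxRW := (int(math.MaxInt32) / pageSize) * pageSize // RoundDown to a page boundary
	fmt.Println(maxRW) // 2147479552 (0x7ffff000)
}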
-func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { +func (t *Task) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -70,7 +71,7 @@ func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { // user memory that is unmapped or not readable by the user. // // This Task's AddressSpace must be active. -func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { +func (t *Task) CopyInString(addr hostarch.Addr, maxlen int) (string, error) { return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -90,7 +91,7 @@ func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { // { "abc" } => 4 (3 for length, 1 for elements) // // This Task's AddressSpace must be active. -func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { +func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ([]string, error) { var v []string for { argAddr := t.Arch().Native(0) @@ -109,12 +110,12 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([ if maxTotalSize < thisMax { thisMax = maxTotalSize } - arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + arg, err := t.CopyInString(hostarch.Addr(t.Arch().Value(argAddr)), thisMax) if err != nil { return v, err } v = append(v, arg) - addr += usermem.Addr(t.Arch().Width()) + addr += hostarch.Addr(t.Arch().Width()) maxTotalSize -= len(arg) + 1 } return v, nil @@ -126,7 +127,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([ // Preconditions: Same as usermem.IO.CopyOut, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. -func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { +func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) error { switch t.Arch().Width() { case 8: const itemLen = 16 @@ -137,8 +138,8 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error b := t.CopyScratchBuffer(itemLen) for ; !src.IsEmpty(); src = src.Tail() { ar := src.Head() - usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) - usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + hostarch.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + hostarch.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) if _, err := t.CopyOutBytes(addr, b); err != nil { return err } @@ -153,8 +154,8 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error } // CopyInIovecs copies an array of numIovecs struct iovecs from the memory -// mapped at addr, converts them to usermem.AddrRanges, and returns them as a -// usermem.AddrRangeSeq. +// mapped at addr, converts them to hostarch.AddrRanges, and returns them as a +// hostarch.AddrRangeSeq. // // CopyInIovecs shares the following properties with Linux's // lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): @@ -175,42 +176,42 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error // Preconditions: Same as usermem.IO.CopyIn, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. 
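// CopyInVector above charges each copied string against maxTotalSize as
// len(arg) + 1, which is why the documented example {"abc"} costs 4 (three
// bytes of content plus one for the element). The sketch below reproduces
// only that accounting; the user-memory copies and maxElemSize clamping are
// omitted and the argv values are invented.
package main

import "fmt"

// vectorCost mirrors CopyInVector's "maxTotalSize -= len(arg) + 1" accounting.
func vectorCost(args []string) int {
	total := 0
	for _, a := range args {
		total += len(a) + 1 // string bytes plus one per element
	}
	return total
}

func main() {
	fmt.Println(vectorCost([]string{"abc"}))                   // 4
	fmt.Println(vectorCost([]string{"/bin/sh", "-c", "true"})) // 8 + 3 + 5 = 16
}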
-func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { +func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRangeSeq, error) { if numIovecs == 0 { - return usermem.AddrRangeSeq{}, nil + return hostarch.AddrRangeSeq{}, nil } - var dst []usermem.AddrRange + var dst []hostarch.AddrRange if numIovecs > 1 { - dst = make([]usermem.AddrRange, 0, numIovecs) + dst = make([]hostarch.AddrRange, 0, numIovecs) } switch t.Arch().Width() { case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { - return usermem.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, syserror.EFAULT } b := t.CopyScratchBuffer(itemLen) for i := 0; i < numIovecs; i++ { if _, err := t.CopyInBytes(addr, b); err != nil { - return usermem.AddrRangeSeq{}, err + return hostarch.AddrRangeSeq{}, err } - base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) - length := usermem.ByteOrder.Uint64(b[8:16]) + base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8])) + length := hostarch.ByteOrder.Uint64(b[8:16]) if length > math.MaxInt64 { - return usermem.AddrRangeSeq{}, syserror.EINVAL + return hostarch.AddrRangeSeq{}, syserror.EINVAL } ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { - return usermem.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, syserror.EFAULT } if numIovecs == 1 { // Special case to avoid allocating dst. - return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil + return hostarch.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil } dst = append(dst, ar) @@ -218,7 +219,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange } default: - return usermem.AddrRangeSeq{}, syserror.ENOSYS + return hostarch.AddrRangeSeq{}, syserror.ENOSYS } // Truncate to MAX_RW_COUNT. @@ -226,13 +227,13 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange for i := range dst { dstlen := uint64(dst[i].Length()) if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { - dst[i].End -= usermem.Addr(dstlen - rem) + dst[i].End -= hostarch.Addr(dstlen - rem) dstlen = rem } total += dstlen } - return usermem.AddrRangeSeqFromSlice(dst), nil + return hostarch.AddrRangeSeqFromSlice(dst), nil } // SingleIOSequence returns a usermem.IOSequence representing [addr, @@ -245,7 +246,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange // write syscalls in Linux do not use import_single_range(). However they check // access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address // ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) -func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { +func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { if length > MAX_RW_COUNT { length = MAX_RW_COUNT } @@ -255,7 +256,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp } return usermem.IOSequence{ IO: t.MemoryManager(), - Addrs: usermem.AddrRangeSeqOf(ar), + Addrs: hostarch.AddrRangeSeqOf(ar), Opts: opts, }, nil } @@ -267,7 +268,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). // // Preconditions: Same as Task.CopyInIovecs. 
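// CopyInIovecs above caps the total bytes described by an iovec array at
// MAX_RW_COUNT by walking the converted ranges and shrinking the first one
// that pushes the running total past the limit (ranges after it end up with
// length zero). This is a simplified, self-contained version of that
// truncation over plain start/end pairs; the limit and addresses are
// arbitrary and hostarch.AddrRange is not used.
package main

import "fmt"

type addrRange struct{ start, end uint64 }

func (r addrRange) length() uint64 { return r.end - r.start }

// truncate mirrors the MAX_RW_COUNT loop at the end of CopyInIovecs.
func truncate(rs []addrRange, limit uint64) []addrRange {
	var total uint64
	for i := range rs {
		l := rs[i].length()
		if rem := limit - total; rem < l {
			rs[i].end -= l - rem
			l = rem
		}
		total += l
	}
	return rs
}

func main() {
	rs := []addrRange{{0x1000, 0x1800}, {0x4000, 0x5000}, {0x8000, 0x9000}}
	// With a 0x1000-byte limit the second range shrinks to 0x800 bytes and
	// the third to zero.
	fmt.Println(truncate(rs, 0x1000))
}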
-func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { +func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return usermem.IOSequence{}, syserror.EINVAL } @@ -317,7 +318,7 @@ func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { } // CopyInBytes implements marshal.CopyContext.CopyInBytes. -func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { +func (cc *taskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err @@ -327,7 +328,7 @@ func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, erro } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. -func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { +func (cc *taskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err @@ -360,11 +361,11 @@ func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte { } // CopyInBytes implements marshal.CopyContext.CopyInBytes. -func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { +func (cc *ownTaskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. -func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { +func (cc *ownTaskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts) } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 09d070ec8..77ad62445 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -114,6 +114,15 @@ func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { } } +// forEachTaskLocked applies f to each Task in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { + for t := range ts.Root.tids { + f(t) + } +} + // A PIDNamespace represents a PID namespace, a bimap between thread IDs and // tasks. See the pid_namespaces(7) man page for further details. 
// diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index cf2f7ca72..dfc3c0719 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -17,12 +17,12 @@ package kernel import ( "testing" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // mockClocks is a sentrytime.Clocks that simply returns the times in the @@ -54,7 +54,7 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { ctx := contexttest.Context(tb) mfp := pgalloc.MemoryFileProviderFromContext(ctx) - fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) + fr, err := mfp.MemoryFile().Allocate(hostarch.PageSize, usage.Anonymous) if err != nil { tb.Fatalf("failed to allocate memory: %v", err) } diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 9e5c2d26f..cc0917504 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -17,10 +17,10 @@ package kernel import ( "fmt" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/usermem" ) // vdsoParams are the parameters exposed to the VDSO. @@ -96,7 +96,7 @@ func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSO // access returns a mapping of the param page. func (v *VDSOParamPage) access() (safemem.Block, error) { - bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, hostarch.ReadWrite) if err != nil { return safemem.Block{}, err } |
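// The new TaskSet.forEachTaskLocked helper above iterates every task in the
// root PID namespace's tid map while ts.mu is held. The following is a
// minimal stand-alone model of that pattern; a plain map and sync.RWMutex
// replace the TaskSet and PIDNamespace internals.
package main

import (
	"fmt"
	"sync"
)

type task struct{ name string }

type taskSet struct {
	mu   sync.RWMutex
	tids map[*task]int // task -> thread ID, like PIDNamespace.tids
}

// forEachTaskLocked applies f to every task. The caller must hold ts.mu.
func (ts *taskSet) forEachTaskLocked(f func(t *task)) {
	for t := range ts.tids {
		f(t)
	}
}

func main() {
	t1, t2 := &task{name: "init"}, &task{name: "worker"}
	ts := &taskSet{tids: map[*task]int{t1: 1, t2: 2}}

	ts.mu.RLock()
	ts.forEachTaskLocked(func(t *task) { fmt.Println(t.name, ts.tids[t]) })
	ts.mu.RUnlock()
}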