35 files changed, 1006 insertions, 777 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c172d399e..e61d39c82 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -31,7 +31,7 @@ go_template_instance(
 
 go_template_instance(
     name = "seqatomic_taskgoroutineschedinfo",
-    out = "seqatomic_taskgoroutineschedinfo.go",
+    out = "seqatomic_taskgoroutineschedinfo_unsafe.go",
     package = "kernel",
     suffix = "TaskGoroutineSchedInfo",
     template = "//third_party/gvsync:generic_seqatomic",
@@ -96,7 +96,8 @@ go_library(
     srcs = [
         "abstract_socket_namespace.go",
         "context.go",
-        "fd_map.go",
+        "fd_table.go",
+        "fd_table_unsafe.go",
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
@@ -111,7 +112,7 @@ go_library(
         "ptrace_arm64.go",
         "rseq.go",
         "seccomp.go",
-        "seqatomic_taskgoroutineschedinfo.go",
+        "seqatomic_taskgoroutineschedinfo_unsafe.go",
         "session_list.go",
         "sessions.go",
         "signal.go",
@@ -179,7 +180,6 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/epoll",
         "//pkg/sentry/kernel/futex",
-        "//pkg/sentry/kernel/kdefs",
         "//pkg/sentry/kernel/sched",
         "//pkg/sentry/kernel/semaphore",
         "//pkg/sentry/kernel/shm",
@@ -214,7 +214,7 @@ go_test(
     name = "kernel_test",
     size = "small",
     srcs = [
-        "fd_map_test.go",
+        "fd_table_test.go",
         "table_test.go",
         "task_test.go",
         "timekeeper_test.go",
@@ -223,9 +223,10 @@ go_test(
     deps = [
         "//pkg/abi",
         "//pkg/sentry/arch",
+        "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
         "//pkg/sentry/fs/filetest",
-        "//pkg/sentry/kernel/kdefs",
         "//pkg/sentry/kernel/sched",
         "//pkg/sentry/limits",
         "//pkg/sentry/pgalloc",
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 37cb8c8b9..1d00a6310 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
+    name = "atomicptr_credentials",
+    out = "atomicptr_credentials_unsafe.go",
+    package = "auth",
+    suffix = "Credentials",
+    template = "//third_party/gvsync:generic_atomicptr",
+    types = {
+        "Value": "Credentials",
+    },
+)
+
+go_template_instance(
     name = "id_map_range",
     out = "id_map_range.go",
     package = "auth",
@@ -34,6 +45,7 @@ go_template_instance(
 go_library(
     name = "auth",
     srcs = [
+        "atomicptr_credentials_unsafe.go",
         "auth.go",
         "capability_set.go",
         "context.go",
diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go
index a21fa6f0f..fc8c6745c 100644
--- a/pkg/sentry/kernel/auth/capability_set.go
+++ b/pkg/sentry/kernel/auth/capability_set.go
@@ -24,7 +24,7 @@ import (
 type CapabilitySet uint64
 
 // AllCapabilities is a CapabilitySet containing all valid capabilities.
-var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1
+var AllCapabilities = CapabilitySetOf(linux.CAP_LAST_CAP+1) - 1
 
 // CapabilitySetOf returns a CapabilitySet containing only the given
 // capability.
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index fb99cfc8f..f46c43128 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -30,7 +30,6 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/kernel/kdefs",
         "//pkg/sentry/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 33c7dccae..9c0a4e1b4 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -61,7 +60,7 @@ const (
 // +stateify savable
 type FileIdentifier struct {
 	File *fs.File `state:"wait"`
-	Fd   kdefs.FD
+	Fd   int32
 }
 
 // pollEntry holds all the state associated with an event poll entry, that is,
diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go
deleted file mode 100644
index 786936a7d..000000000
--- a/pkg/sentry/kernel/fd_map.go
+++ /dev/null
@@ -1,364 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import (
-	"bytes"
-	"fmt"
-	"sort"
-	"sync"
-	"sync/atomic"
-	"syscall"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
-)
-
-// FDs is an ordering of FD's that can be made stable.
-type FDs []kdefs.FD
-
-func (f FDs) Len() int {
-	return len(f)
-}
-
-func (f FDs) Swap(i, j int) {
-	f[i], f[j] = f[j], f[i]
-}
-
-func (f FDs) Less(i, j int) bool {
-	return f[i] < f[j]
-}
-
-// FDFlags define flags for an individual descriptor.
-//
-// +stateify savable
-type FDFlags struct {
-	// CloseOnExec indicates the descriptor should be closed on exec.
-	CloseOnExec bool
-}
-
-// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
-// representation.
-func (f FDFlags) ToLinuxFileFlags() (mask uint) {
-	if f.CloseOnExec {
-		mask |= linux.O_CLOEXEC
-	}
-	return
-}
-
-// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
-// representation.
-func (f FDFlags) ToLinuxFDFlags() (mask uint) {
-	if f.CloseOnExec {
-		mask |= linux.FD_CLOEXEC
-	}
-	return
-}
-
-// descriptor holds the details about a file descriptor, namely a pointer the
-// file itself and the descriptor flags.
-//
-// +stateify savable
-type descriptor struct {
-	file  *fs.File
-	flags FDFlags
-}
-
-// FDMap is used to manage File references and flags.
-//
-// +stateify savable
-type FDMap struct {
-	refs.AtomicRefCount
-	k     *Kernel
-	files map[kdefs.FD]descriptor
-	mu    sync.RWMutex `state:"nosave"`
-	uid   uint64
-}
-
-// ID returns a unique identifier for this FDMap.
-func (f *FDMap) ID() uint64 {
-	return f.uid
-}
-
-// NewFDMap allocates a new FDMap that may be used by tasks in k.
-func (k *Kernel) NewFDMap() *FDMap {
-	return &FDMap{
-		k:     k,
-		files: make(map[kdefs.FD]descriptor),
-		uid:   atomic.AddUint64(&k.fdMapUids, 1),
-	}
-}
-
-// destroy removes all of the file descriptors from the map.
-func (f *FDMap) destroy() {
-	f.RemoveIf(func(*fs.File, FDFlags) bool {
-		return true
-	})
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FDMap) DecRef() {
-	f.DecRefWithDestructor(f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDMap) Size() int {
-	f.mu.RLock()
-	defer f.mu.RUnlock()
-
-	return len(f.files)
-}
-
-// String is a stringer for FDMap.
-func (f *FDMap) String() string {
-	f.mu.RLock()
-	defer f.mu.RUnlock()
-
-	var b bytes.Buffer
-	for k, v := range f.files {
-		n, _ := v.file.Dirent.FullName(nil /* root */)
-		b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n))
-	}
-	return b.String()
-}
-
-// NewFDFrom allocates a new FD guaranteed to be the lowest number available
-// greater than or equal to from. This property is important as Unix programs
-// tend to count on this allocation order.
-func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) {
-	if fd < 0 {
-		// Don't accept negative FDs.
-		return 0, syscall.EINVAL
-	}
-
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	// Finds the lowest fd not in the handles map.
-	lim := limitSet.Get(limits.NumberOfFiles)
-	for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ {
-		if _, ok := f.files[i]; !ok {
-			file.IncRef()
-			f.files[i] = descriptor{file, flags}
-			return i, nil
-		}
-	}
-
-	return -1, syscall.EMFILE
-}
-
-// NewFDAt sets the file reference for the given FD. If there is an
-// active reference for that FD, the ref count for that existing reference
-// is decremented.
-func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error {
-	if fd < 0 {
-		// Don't accept negative FDs.
-		return syscall.EBADF
-	}
-
-	// In this one case we do not do a defer of the Unlock.  The
-	// reason is that we must have done all the work needed for
-	// discarding any old open file before we return to the
-	// caller. In other words, the DecRef(), below, must have
-	// completed by the time we return to the caller to ensure
-	// side effects are, in fact, effected. A classic example is
-	// dup2(fd1, fd2); if fd2 was already open, it must be closed,
-	// and we don't want to resume the caller until it is; we have
-	// to block on the DecRef(). Hence we can not just do a 'go
-	// oldfile.DecRef()', since there would be no guarantee that
-	// it would be done before we the caller resumed. Since we
-	// must wait for the DecRef() to finish, and that could take
-	// time, it's best to first call f.muUnlock beore so we are
-	// not blocking other uses of this FDMap on the DecRef() call.
-	f.mu.Lock()
-	oldDesc, oldExists := f.files[fd]
-	lim := limitSet.Get(limits.NumberOfFiles).Cur
-	// if we're closing one then the effective limit is one
-	// more than the actual limit.
-	if oldExists && lim != limits.Infinity {
-		lim++
-	}
-	if lim != limits.Infinity && fd >= kdefs.FD(lim) {
-		f.mu.Unlock()
-		return syscall.EMFILE
-	}
-
-	file.IncRef()
-	f.files[fd] = descriptor{file, flags}
-	f.mu.Unlock()
-
-	if oldExists {
-		oldDesc.file.DecRef()
-	}
-	return nil
-}
-
-// SetFlags sets the flags for the given file descriptor, if it is valid.
-func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) {
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	desc, ok := f.files[fd]
-	if !ok {
-		return
-	}
-
-	f.files[fd] = descriptor{desc.file, flags}
-}
-
-// GetDescriptor returns a reference to the file and the flags for the FD. It
-// bumps its reference count as well. It returns nil if there is no File
-// for the FD, i.e. if the FD is invalid. The caller must use DecRef
-// when they are done.
-func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) {
-	f.mu.RLock()
-	defer f.mu.RUnlock()
-
-	if desc, ok := f.files[fd]; ok {
-		desc.file.IncRef()
-		return desc.file, desc.flags
-	}
-	return nil, FDFlags{}
-}
-
-// GetFile returns a reference to the File for the FD and bumps
-// its reference count as well. It returns nil if there is no File
-// for the FD, i.e. if the FD is invalid. The caller must use DecRef
-// when they are done.
-func (f *FDMap) GetFile(fd kdefs.FD) *fs.File {
-	f.mu.RLock()
-	if desc, ok := f.files[fd]; ok {
-		desc.file.IncRef()
-		f.mu.RUnlock()
-		return desc.file
-	}
-	f.mu.RUnlock()
-	return nil
-}
-
-// fds returns an ordering of FDs.
-func (f *FDMap) fds() FDs {
-	fds := make(FDs, 0, len(f.files))
-	for fd := range f.files {
-		fds = append(fds, fd)
-	}
-	sort.Sort(fds)
-	return fds
-}
-
-// GetFDs returns a list of valid fds.
-func (f *FDMap) GetFDs() FDs {
-	f.mu.RLock()
-	defer f.mu.RUnlock()
-	return f.fds()
-}
-
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each.  The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDMap) GetRefs() []*fs.File {
-	f.mu.RLock()
-	defer f.mu.RUnlock()
-
-	fds := f.fds()
-	fs := make([]*fs.File, 0, len(fds))
-	for _, fd := range fds {
-		desc := f.files[fd]
-		desc.file.IncRef()
-		fs = append(fs, desc.file)
-	}
-	return fs
-}
-
-// Fork returns an independent FDMap pointing to the same descriptors.
-func (f *FDMap) Fork() *FDMap {
-	f.mu.RLock()
-	defer f.mu.RUnlock()
-
-	clone := f.k.NewFDMap()
-
-	// Grab a extra reference for every file.
-	for fd, desc := range f.files {
-		desc.file.IncRef()
-		clone.files[fd] = desc
-	}
-
-	// That's it!
-	return clone
-}
-
-// unlock releases all file locks held by this FDMap's uid.  Must only be
-// called on a non-nil *fs.File.
-func (f *FDMap) unlock(file *fs.File) {
-	id := lock.UniqueID(f.ID())
-	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF})
-}
-
-// inotifyFileClose generates the appropriate inotify events for f being closed.
-func inotifyFileClose(f *fs.File) {
-	var ev uint32
-	d := f.Dirent
-
-	if fs.IsDir(d.Inode.StableAttr) {
-		ev |= linux.IN_ISDIR
-	}
-
-	if f.Flags().Write {
-		ev |= linux.IN_CLOSE_WRITE
-	} else {
-		ev |= linux.IN_CLOSE_NOWRITE
-	}
-
-	d.InotifyEvent(ev, 0)
-}
-
-// Remove removes an FD from the FDMap, and returns (File, true) if a File
-// one was found. Callers are expected to decrement the reference count on
-// the File. Otherwise returns (nil, false).
-func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) {
-	f.mu.Lock()
-	desc := f.files[fd]
-	delete(f.files, fd)
-	f.mu.Unlock()
-	if desc.file != nil {
-		f.unlock(desc.file)
-		inotifyFileClose(desc.file)
-		return desc.file, true
-	}
-	return nil, false
-}
-
-// RemoveIf removes all FDs where cond is true.
-func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) {
-	var removed []*fs.File
-	f.mu.Lock()
-	for fd, desc := range f.files {
-		if desc.file != nil && cond(desc.file, desc.flags) {
-			delete(f.files, fd)
-			removed = append(removed, desc.file)
-		}
-	}
-	f.mu.Unlock()
-
-	for _, file := range removed {
-		f.unlock(file)
-		inotifyFileClose(file)
-		file.DecRef()
-	}
-}
diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go
deleted file mode 100644
index 8571dbe59..000000000
--- a/pkg/sentry/kernel/fd_map_test.go
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import (
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/sentry/fs/filetest"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
-)
-
-const (
-	// maxFD is the maximum FD to try to create in the map.
-	// This number of open files has been seen in the wild.
-	maxFD = 2 * 1024
-)
-
-func newTestFDMap() *FDMap {
-	return &FDMap{
-		files: make(map[kdefs.FD]descriptor),
-	}
-}
-
-// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap,
-// until there is no room, then makes sure that NewFDAt works
-// and also that if we remove one and add one that works too.
-func TestFDMapMany(t *testing.T) {
-	file := filetest.NewTestFile(t)
-	limitSet := limits.NewLimitSet()
-	limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */)
-
-	f := newTestFDMap()
-	for i := 0; i < maxFD; i++ {
-		if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
-			t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD)
-		}
-	}
-
-	if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil {
-		t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error")
-	}
-
-	if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil {
-		t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
-	}
-}
-
-// TestFDMap does a set of simple tests to make sure simple adds,
-// removes, GetRefs, and DecRefs work. The ordering is just weird
-// enough that a table-driven approach seemed clumsy.
-func TestFDMap(t *testing.T) {
-	file := filetest.NewTestFile(t)
-	limitSet := limits.NewLimitSet()
-	limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true /* privileged */)
-
-	f := newTestFDMap()
-	if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
-		t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err)
-	}
-
-	if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil {
-		t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error")
-	}
-
-	largeLimit := limits.Limit{maxFD, maxFD}
-	limitSet.Set(limits.NumberOfFiles, largeLimit, true /* privileged */)
-
-	if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
-		t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
-	} else if fd != kdefs.FD(1) {
-		t.Fatalf("Added an FD to a resized map: got %v, want 1", fd)
-	}
-
-	if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil {
-		t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
-	}
-
-	if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil {
-		t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1)
-	}
-
-	if ref := f.GetFile(1); ref == nil {
-		t.Fatalf("f.GetFile(1): got nil, wanted %v", file)
-	}
-
-	if ref := f.GetFile(2); ref != nil {
-		t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref)
-	}
-
-	ref, ok := f.Remove(1)
-	if !ok {
-		t.Fatalf("f.Remove(1) for an existing FD: failed, want success")
-	}
-	ref.DecRef()
-
-	if ref, ok := f.Remove(1); ok {
-		ref.DecRef()
-		t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
-	}
-
-}
-
-func TestDescriptorFlags(t *testing.T) {
-	file := filetest.NewTestFile(t)
-	f := newTestFDMap()
-	limitSet := limits.NewLimitSet()
-	limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */)
-
-	origFlags := FDFlags{CloseOnExec: true}
-
-	if err := f.NewFDAt(2, file, origFlags, limitSet); err != nil {
-		t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err)
-	}
-
-	newFile, newFlags := f.GetDescriptor(2)
-	if newFile == nil {
-		t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile)
-	}
-
-	if newFlags != origFlags {
-		t.Fatalf("new File flags %+v don't match original %+v", newFlags, origFlags)
-	}
-}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
new file mode 100644
index 000000000..1f3a57dc1
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table.go
@@ -0,0 +1,380 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+)
+
+// FDFlags define flags for an individual descriptor.
+//
+// +stateify savable
+type FDFlags struct {
+	// CloseOnExec indicates the descriptor should be closed on exec.
+	CloseOnExec bool
+}
+
+// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
+// representation.
+func (f FDFlags) ToLinuxFileFlags() (mask uint) {
+	if f.CloseOnExec {
+		mask |= linux.O_CLOEXEC
+	}
+	return
+}
+
+// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
+// representation.
+func (f FDFlags) ToLinuxFDFlags() (mask uint) {
+	if f.CloseOnExec {
+		mask |= linux.FD_CLOEXEC
+	}
+	return
+}
+
+// descriptor holds the details about a file descriptor, namely a pointer to
+// the file itself and the descriptor flags.
+//
+// Note that this is immutable and can only be changed via operations on the
+// descriptorTable.
+//
+// +stateify savable
+type descriptor struct {
+	file  *fs.File
+	flags FDFlags
+}
+
+// FDTable is used to manage File references and flags.
+//
+// +stateify savable
+type FDTable struct {
+	refs.AtomicRefCount
+	k *Kernel
+
+	// uid is a unique identifier.
+	uid uint64
+
+	// mu protects below.
+	mu sync.Mutex `state:"nosave"`
+
+	// used contains the number of non-nil entries.
+	used int32
+
+	// descriptorTable holds descriptors.
+	descriptorTable `state:".(map[int32]descriptor)"`
+}
+
+func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
+	m := make(map[int32]descriptor)
+	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+		m[fd] = descriptor{
+			file:  file,
+			flags: flags,
+		}
+	})
+	return m
+}
+
+func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
+	f.init() // Initialize table.
+	for fd, d := range m {
+		f.set(fd, d.file, d.flags)
+
+		// Note that we do _not_ need to acquire an extra table
+		// reference here. The table reference will already be
+		// accounted for in the file, so we drop the reference taken by
+		// set above.
+		d.file.DecRef()
+	}
+}
+
+// drop drops the table reference.
+func (f *FDTable) drop(file *fs.File) {
+	// Release locks.
+	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+
+	// Send inotify events.
+	d := file.Dirent
+	var ev uint32
+	if fs.IsDir(d.Inode.StableAttr) {
+		ev |= linux.IN_ISDIR
+	}
+	if file.Flags().Write {
+		ev |= linux.IN_CLOSE_WRITE
+	} else {
+		ev |= linux.IN_CLOSE_NOWRITE
+	}
+	d.InotifyEvent(ev, 0)
+
+	// Drop the table reference.
+	file.DecRef()
+}
+
+// ID returns a unique identifier for this FDTable.
+func (f *FDTable) ID() uint64 {
+	return f.uid
+}
+
+// NewFDTable allocates a new FDTable that may be used by tasks in k.
+func (k *Kernel) NewFDTable() *FDTable {
+	f := &FDTable{
+		k:   k,
+		uid: atomic.AddUint64(&k.fdMapUids, 1),
+	}
+	f.init()
+	return f
+}
+
+// destroy removes all of the file descriptors from the table.
+func (f *FDTable) destroy() {
+	f.RemoveIf(func(*fs.File, FDFlags) bool {
+		return true
+	})
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FDTable) DecRef() {
+	f.DecRefWithDestructor(f.destroy)
+}
+
+// Size returns the number of file descriptor slots currently allocated.
+func (f *FDTable) Size() int {
+	size := atomic.LoadInt32(&f.used)
+	return int(size)
+}
+
+// forEach iterates over all non-nil files.
+//
+// It is the caller's responsibility to acquire an appropriate lock.
+func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
+	fd := int32(0)
+	for {
+		file, flags, ok := f.get(fd)
+		if !ok {
+			break
+		}
+		if file != nil {
+			if !file.TryIncRef() {
+				continue // Race caught.
+			}
+			fn(int32(fd), file, flags)
+			file.DecRef()
+		}
+		fd++
+	}
+}
+
+// String is a stringer for FDTable.
+func (f *FDTable) String() string {
+	var b bytes.Buffer
+	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+		n, _ := file.Dirent.FullName(nil /* root */)
+		b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+	})
+	return b.String()
+}
+
+// NewFDs allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files share flags, and
+// allocation is all-or-none: on EMFILE any installed entries are unwound.
+func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) {
+	if fd < 0 {
+		// Don't accept negative FDs.
+		return nil, syscall.EINVAL
+	}
+
+	// Default limit; also the cap when the rlimit does not fit in an int32.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity && lim.Cur < uint64(math.MaxInt32) {
+			end = int32(lim.Cur) // Safe: the value fits, so no truncation.
+		}
+		if fd >= end {
+			return nil, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// Install all entries.
+	for i := fd; i < end && len(fds) < len(files); i++ {
+		if d, _, _ := f.get(i); d == nil {
+			f.set(i, files[len(fds)], flags) // Set the descriptor.
+			fds = append(fds, i)             // Record the file descriptor.
+		}
+	}
+
+	// Failure? Unwind existing FDs.
+	if len(fds) < len(files) {
+		for _, i := range fds {
+			f.set(i, nil, FDFlags{}) // Zap entry.
+		}
+		return nil, syscall.EMFILE
+	}
+
+	return fds, nil
+}
+
+// NewFDAt sets the file reference for the given FD. If there is an active
+// reference for that FD, the ref count for that existing reference is
+// decremented.
+func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
+	if fd < 0 {
+		// Don't accept negative FDs.
+		return syscall.EBADF
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// Check the limit for the provided file.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
+			return syscall.EMFILE
+		}
+	}
+
+	// Install the entry.
+	f.set(fd, file, flags)
+	return nil
+}
+
+// SetFlags sets the flags for the given file descriptor.
+//
+// EBADF is returned if fd is negative or has no file; otherwise nil.
+func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+	if fd < 0 {
+		// Don't accept negative FDs.
+		return syscall.EBADF
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	file, _, _ := f.get(fd)
+	if file == nil {
+		// No file found.
+		return syscall.EBADF
+	}
+
+	// Update the flags.
+	f.set(fd, file, flags)
+	return nil
+}
+
+// Get returns a reference to the file and the flags for the FD or nil if no
+// file is defined for the given fd.
+//
+// N.B. Callers are required to use DecRef when they are done.
+//
+//go:nosplit
+func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
+	if fd < 0 {
+		return nil, FDFlags{}
+	}
+
+	for {
+		file, flags, _ := f.get(fd)
+		if file != nil {
+			if !file.TryIncRef() {
+				continue // Race caught.
+			}
+			// Reference acquired.
+			return file, flags
+		}
+		// No file available.
+		return nil, FDFlags{}
+	}
+}
+
+// GetFDs returns a list of valid fds; Size is only a capacity hint.
+func (f *FDTable) GetFDs() []int32 {
+	fds := make([]int32, 0, f.Size())
+	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+		fds = append(fds, fd)
+	})
+	return fds
+}
+
+// GetRefs returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDTable) GetRefs() []*fs.File {
+	files := make([]*fs.File, 0, f.Size())
+	f.forEach(func(_ int32, file *fs.File, flags FDFlags) {
+		file.IncRef() // Acquire a reference for caller.
+		files = append(files, file)
+	})
+	return files
+}
+
+// Fork returns an independent FDTable.
+func (f *FDTable) Fork() *FDTable {
+	clone := f.k.NewFDTable()
+
+	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+		// The set function here will acquire an appropriate table
+		// reference for the clone. We don't need anything else.
+		clone.set(fd, file, flags)
+	})
+	return clone
+}
+
+// Remove removes an FD from the table and returns a non-nil file iff successful.
+//
+// N.B. Callers are required to use DecRef when they are done.
+func (f *FDTable) Remove(fd int32) *fs.File {
+	if fd < 0 {
+		return nil
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	orig, _, _ := f.get(fd)
+	if orig != nil {
+		orig.IncRef()             // Reference for caller.
+		f.set(fd, nil, FDFlags{}) // Zap entry.
+	}
+	return orig
+}
+
+// RemoveIf removes all FDs where cond is true.
+func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+		if cond(file, flags) {
+			f.set(fd, nil, FDFlags{}) // Clear from table.
+		}
+	})
+}
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
new file mode 100644
index 000000000..2413788e7
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -0,0 +1,192 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"runtime"
+	"sync"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/filetest"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+)
+
+const (
+	// maxFD is the maximum FD to try to create in the map.
+	//
+	// This number of open files has been seen in the wild.
+	maxFD = 2 * 1024
+)
+
+func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) {
+	t.Helper() // Don't show in stacks.
+
+	// Create the limits and context.
+	limitSet := limits.NewLimitSet()
+	limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true)
+	ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet)
+
+	// Create a test file.
+	file := filetest.NewTestFile(t)
+
+	// Create the table.
+	fdTable := new(FDTable)
+	fdTable.init()
+
+	// Run the test.
+	fn(ctx, fdTable, file, limitSet)
+}
+
+// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there
+// is no room, then makes sure that a further NewFDs call fails and that
+// NewFDAt on an already-allocated FD still works.
+func TestFDTableMany(t *testing.T) {
+	runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+		for i := 0; i < maxFD; i++ {
+			if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+				t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD)
+			}
+		}
+
+		if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil {
+			t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error")
+		}
+
+		if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil {
+			t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+		}
+	})
+}
+
+// TestFDTable does a set of simple tests to make sure simple adds, removes,
+// GetRefs, and DecRefs work. The ordering is just weird enough that a
+// table-driven approach seemed clumsy.
+func TestFDTable(t *testing.T) {
+	runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) {
+		// Cap the limit at one.
+		limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true)
+
+		if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+			t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err)
+		}
+
+		if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil {
+			t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error")
+		}
+
+		// Remove the previous limit.
+		limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true)
+
+		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+			t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
+		} else if len(fds) != 1 || fds[0] != 1 {
+			t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds)
+		}
+
+		if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil {
+			t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+		}
+
+		if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil {
+			t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1)
+		}
+
+		if ref, _ := fdTable.Get(1); ref == nil {
+			t.Fatalf("fdTable.Get(1): got nil, wanted %v", file)
+		}
+
+		if ref, _ := fdTable.Get(2); ref != nil {
+			t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
+		}
+
+		ref := fdTable.Remove(1)
+		if ref == nil {
+			t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
+		}
+		ref.DecRef()
+
+		if ref := fdTable.Remove(1); ref != nil {
+			t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
+		}
+	})
+}
+
+func TestDescriptorFlags(t *testing.T) {
+	runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+		if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil {
+			t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{CloseOnExec: true}): got %v, wanted nil", err)
+		}
+
+		newFile, flags := fdTable.Get(2) // Get returns the file together with its descriptor flags.
+		if newFile == nil {
+			t.Fatalf("fdTable.Get(2): got nil, wanted %v", file)
+		}
+
+		if !flags.CloseOnExec {
+			t.Fatalf("fdTable.Get(2) flags: got %v, wanted CloseOnExec to be set", flags)
+		}
+	})
+}
+
+func BenchmarkFDLookupAndDecRef(b *testing.B) {
+	b.StopTimer() // Setup.
+
+	runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+		fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{})
+		if err != nil {
+			b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err)
+		}
+
+		b.StartTimer() // Benchmark.
+		for i := 0; i < b.N; i++ {
+			tf, _ := fdTable.Get(fds[i%len(fds)])
+			tf.DecRef()
+		}
+	})
+}
+
+func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) {
+	b.StopTimer() // Setup.
+
+	runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+		fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{})
+		if err != nil {
+			b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err)
+		}
+
+		concurrency := runtime.GOMAXPROCS(0)
+		if concurrency < 4 {
+			concurrency = 4
+		}
+		each := b.N / concurrency
+
+		b.StartTimer() // Benchmark.
+		var wg sync.WaitGroup
+		for i := 0; i < concurrency; i++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				for i := 0; i < each; i++ {
+					tf, _ := fdTable.Get(fds[i%len(fds)])
+					tf.DecRef()
+				}
+			}()
+		}
+		wg.Wait()
+	})
+}
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
new file mode 100644
index 000000000..e009df974
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -0,0 +1,103 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+type descriptorTable struct {
+	// slice is a *[]unsafe.Pointer, where each element is actually a
+	// *descriptor object, updated atomically.
+	//
+	// Changes to the slice itself require holding FDTable.mu.
+	slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
+}
+
+// init initializes the table.
+func (f *FDTable) init() {
+	var slice []unsafe.Pointer // Empty slice.
+	atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+}
+
+// get gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
+	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+	if fd >= int32(len(slice)) {
+		return nil, FDFlags{}, false
+	}
+	d := (*descriptor)(atomic.LoadPointer(&slice[fd]))
+	if d == nil {
+		return nil, FDFlags{}, true
+	}
+	return d.file, d.flags, true
+}
+
+// set sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
+	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+
+	// Grow the table as required.
+	if last := int32(len(slice)); fd >= last {
+		end := fd + 1
+		if end < 2*last {
+			end = 2 * last
+		}
+		slice = append(slice, make([]unsafe.Pointer, end-last)...)
+		atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+	}
+
+	// Create the new element.
+	var d *descriptor
+	if file != nil {
+		d = &descriptor{
+			file:  file,
+			flags: flags,
+		}
+	}
+
+	// Update the single element.
+	orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d)))
+
+	// Acquire a table reference.
+	if file != nil && (orig == nil || file != orig.file) {
+		file.IncRef()
+	}
+
+	// Drop the table reference.
+	if orig != nil && file != orig.file {
+		f.drop(orig.file)
+	}
+
+	// Adjust used.
+	switch {
+	case orig == nil && file != nil:
+		atomic.AddInt32(&f.used, 1)
+	case orig != nil && file == nil:
+		atomic.AddInt32(&f.used, -1)
+	}
+}
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index a08917889..ded27d668 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -51,18 +51,20 @@ type FSContext struct {
 func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 	root.IncRef()
 	cwd.IncRef()
-	return &FSContext{
+	f := FSContext{
 		root:  root,
 		cwd:   cwd,
 		umask: umask,
 	}
+	f.EnableLeakCheck("kernel.FSContext")
+	return &f
 }
 
 // destroy is the destructor for an FSContext.
 //
 // This will call DecRef on both root and cwd Dirents.  If either call to
-// DecRef returns an error, then it will be propigated.  If both calls to
-// DecRef return an error, then the one from root.DecRef will be propigated.
+// DecRef returns an error, then it will be propagated.  If both calls to
+// DecRef return an error, then the one from root.DecRef will be propagated.
 //
 // Note that there may still be calls to WorkingDirectory() or RootDirectory()
 // (that return nil).  This is because valid references may still be held via
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index a5cf1f627..6a31dc044 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -5,7 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
 
 go_template_instance(
     name = "atomicptr_bucket",
-    out = "atomicptr_bucket.go",
+    out = "atomicptr_bucket_unsafe.go",
     package = "futex",
     suffix = "Bucket",
     template = "//third_party/gvsync:generic_atomicptr",
@@ -29,7 +29,7 @@ go_template_instance(
 go_library(
     name = "futex",
     srcs = [
-        "atomicptr_bucket.go",
+        "atomicptr_bucket_unsafe.go",
         "futex.go",
         "waiter_list.go",
     ],
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 3bd5c04af..278cc8143 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -729,14 +729,14 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool
 	}
 	b := m.lockBucket(&k)
 
-	err = m.unlockPILocked(t, addr, tid, b)
+	err = m.unlockPILocked(t, addr, tid, b, &k)
 
 	k.release()
 	b.mu.Unlock()
 	return err
 }
 
-func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
+func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error {
 	cur, err := t.LoadUint32(addr)
 	if err != nil {
 		return err
@@ -746,7 +746,22 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc
 		return syserror.EPERM
 	}
 
-	if b.waiters.Empty() {
+	var next *Waiter  // Who's the next owner?
+	var next2 *Waiter // Who's the one after that?
+	for w := b.waiters.Front(); w != nil; w = w.Next() {
+		if !w.key.matches(key) {
+			continue
+		}
+
+		if next == nil {
+			next = w
+		} else {
+			next2 = w
+			break
+		}
+	}
+
+	if next == nil {
 		// It's safe to set 0 because there are no waiters, no new owner, and the
 		// executing task is the current owner (no owner died bit).
 		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
@@ -761,12 +776,10 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc
 		return nil
 	}
 
-	next := b.waiters.Front()
-
 	// Set next owner's TID, waiters if there are any. Resets owner died bit, if
 	// set, because the executing task takes over as the owner.
 	val := next.tid
-	if next.Next() != nil {
+	if next2 != nil {
 		val |= linux.FUTEX_WAITERS
 	}
 
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index f0db0838d..80a070d7e 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -40,7 +40,7 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
 	}
 }
 
-// SemaphoreRegistry returns the semanphore set registry for this namespace.
+// SemaphoreRegistry returns the semaphore set registry for this namespace.
 func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry {
 	return i.semaphores
 }
diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD
deleted file mode 100644
index 5d62f406a..000000000
--- a/pkg/sentry/kernel/kdefs/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "kdefs",
-    srcs = ["kdefs.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs",
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go
deleted file mode 100644
index 304da2032..000000000
--- a/pkg/sentry/kernel/kdefs/kdefs.go
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package kdefs defines common kernel definitions.
-//
-package kdefs
-
-// FD is a File Descriptor.
-type FD int32
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 9fe9eb914..38b49cba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -155,7 +155,7 @@ type Kernel struct {
 	// cpuClockTicker increments cpuClock.
 	cpuClockTicker *ktime.Timer `state:"nosave"`
 
-	// fdMapUids is an ever-increasing counter for generating FDMap uids.
+	// fdMapUids is an ever-increasing counter for generating FDTable uids.
 	//
 	// fdMapUids is mutable, and is accessed using atomic memory operations.
 	fdMapUids uint64
@@ -400,8 +400,8 @@ func (k *Kernel) flushMountSourceRefs() error {
 
 	// There may be some open FDs whose filesystems have been unmounted. We
 	// must flush those as well.
-	return k.tasks.forEachFDPaused(func(desc descriptor) error {
-		desc.file.Dirent.Inode.MountSource.FlushDirentRefs()
+	return k.tasks.forEachFDPaused(func(file *fs.File) error {
+		file.Dirent.Inode.MountSource.FlushDirentRefs()
 		return nil
 	})
 }
@@ -410,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error {
 // task.
 //
 // Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error {
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
 		// We can skip locking Task.mu here since the kernel is paused.
-		if t.fds == nil {
+		if t.fdTable == nil {
 			continue
 		}
-		for _, desc := range t.fds.files {
-			if err := f(desc); err != nil {
-				return err
+		t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+			if lastErr := f(file); lastErr != nil && err == nil {
+				err = lastErr
 			}
-		}
+		})
 	}
-	return nil
+	return err
 }
 
 func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	return ts.forEachFDPaused(func(desc descriptor) error {
-		if flags := desc.file.Flags(); !flags.Write {
+	return ts.forEachFDPaused(func(file *fs.File) error {
+		if flags := file.Flags(); !flags.Write {
 			return nil
 		}
-		if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
+		if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
 			return nil
 		}
 		// Here we need all metadata synced.
-		syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+		syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
 		if err := fs.SaveFileFsyncError(syncErr); err != nil {
-			name, _ := desc.file.Dirent.FullName(nil /* root */)
+			name, _ := file.Dirent.FullName(nil /* root */)
 			// Wrap this error in ErrSaveRejection
 			// so that it will trigger a save
 			// error, rather than a panic. This
@@ -483,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
 		// We can skip locking Task.mu here since the kernel is paused.
-		if fdmap := t.fds; fdmap != nil {
-			for _, desc := range fdmap.files {
-				if desc.file != nil {
-					if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
-						e.UnregisterEpollWaiters()
-					}
+		if t.fdTable != nil {
+			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+				if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+					e.UnregisterEpollWaiters()
 				}
-			}
+			})
 		}
 	}
 }
@@ -538,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
 	}
 	log.Infof("Memory load took [%s].", time.Since(memoryStart))
 
+	log.Infof("Overall load took [%s]", time.Since(loadStart))
+
 	// Ensure that all pending asynchronous work is complete:
 	//   - namedpipe opening
 	//   - inode file opening
@@ -602,9 +602,9 @@ type CreateProcessArgs struct {
 	// Credentials is the initial credentials.
 	Credentials *auth.Credentials
 
-	// FDMap is the initial set of file descriptors. If CreateProcess succeeds,
-	// it takes a reference on FDMap.
-	FDMap *FDMap
+	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
+	// it takes a reference on FDTable.
+	FDTable *FDTable
 
 	// Umask is the initial umask.
 	Umask uint
@@ -679,7 +679,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.args.Credentials
 	case fs.CtxRoot:
 		if ctx.args.Root != nil {
-			// Take a refernce on the root dirent that will be
+			// Take a reference on the root dirent that will be
 			// given to the caller.
 			ctx.args.Root.IncRef()
 			return ctx.args.Root
@@ -789,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		return nil, 0, errors.New(se.String())
 	}
 
-	// Take a reference on the FDMap, which will be transferred to
+	// Take a reference on the FDTable, which will be transferred to
 	// TaskSet.NewTask().
-	args.FDMap.IncRef()
+	args.FDTable.IncRef()
 
 	// Create the task.
 	config := &TaskConfig{
@@ -799,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		ThreadGroup:             tg,
 		TaskContext:             tc,
 		FSContext:               newFSContext(root, wd, args.Umask),
-		FDMap:                   args.FDMap,
+		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
@@ -871,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() {
 	}
 
 	// By precondition, nothing else can be interacting with PIDNamespace.tids
-	// or FDMap.files, so we can iterate them without synchronization. (We
+	// or FDTable.files, so we can iterate them without synchronization. (We
 	// can't hold the TaskSet mutex when pausing thread group timers because
 	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
 	// mutex, while holding the Timer mutex.)
@@ -882,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() {
 				it.PauseTimer()
 			}
 		}
-		// This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
 		// but ktime.Timer.Pause is idempotent so this is harmless.
-		if fdm := t.fds; fdm != nil {
-			for _, desc := range fdm.files {
-				if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+		if t.fdTable != nil {
+			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
 					tfd.PauseTimer()
 				}
-			}
+			})
 		}
 	}
 	k.timekeeper.PauseUpdates()
@@ -914,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() {
 				it.ResumeTimer()
 			}
 		}
-		if fdm := t.fds; fdm != nil {
-			for _, desc := range fdm.files {
-				if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+		if t.fdTable != nil {
+			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
 					tfd.ResumeTimer()
 				}
-			}
+			})
 		}
 	}
 }
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 8e49070a9..247e2928e 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -39,19 +39,6 @@ const (
 	MaximumPipeSize = 8 << 20
 )
 
-// Sizer is an interface for setting and getting the size of a pipe.
-//
-// It is implemented by Pipe and, through embedding, all other types.
-type Sizer interface {
-	// PipeSize returns the pipe capacity in bytes.
-	PipeSize() int64
-
-	// SetPipeSize sets the new pipe capacity in bytes.
-	//
-	// The new size is returned (which may be capped).
-	SetPipeSize(int64) (int64, error)
-}
-
 // Pipe is an encapsulation of a platform-independent pipe.
 // It manages a buffered byte queue shared between a reader/writer
 // pair.
@@ -399,15 +386,15 @@ func (p *Pipe) queued() int64 {
 	return p.size
 }
 
-// PipeSize implements PipeSizer.PipeSize.
-func (p *Pipe) PipeSize() int64 {
+// FifoSize implements fs.FifoSizer.FifoSize.
+func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	return p.max
+	return p.max, nil
 }
 
-// SetPipeSize implements PipeSize.SetPipeSize.
-func (p *Pipe) SetPipeSize(size int64) (int64, error) {
+// SetFifoSize implements fs.FifoSizer.SetFifoSize.
+func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	if size < 0 {
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go
index fedfcd921..f69dbf27b 100644
--- a/pkg/sentry/kernel/pipe/reader_writer.go
+++ b/pkg/sentry/kernel/pipe/reader_writer.go
@@ -77,7 +77,7 @@ func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // Ioctl implements fs.FileOperations.Ioctl.
-func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	// Switch on ioctl request.
 	switch int(args[1].Int()) {
 	case linux.FIONREAD:
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index fb4a0e1e0..93fe68a3e 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -86,7 +86,7 @@ type Set struct {
 	dead bool
 }
 
-// sem represents a single semanphore from a set.
+// sem represents a single semaphore from a set.
 //
 // +stateify savable
 type sem struct {
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 355984140..81fcd8258 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -294,6 +294,7 @@ func (tg *ThreadGroup) createSession() error {
 		id:     SessionID(id),
 		leader: tg,
 	}
+	s.refs.EnableLeakCheck("kernel.Session")
 
 	// Create a new ProcessGroup, belonging to that Session.
 	// This also has a single reference (assigned below).
@@ -307,6 +308,7 @@ func (tg *ThreadGroup) createSession() error {
 		session:    s,
 		ancestors:  0,
 	}
+	pg.refs.EnableLeakCheck("kernel.ProcessGroup")
 
 	// Tie them and return the result.
 	s.processGroups.PushBack(pg)
@@ -378,11 +380,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 	// We manually adjust the ancestors if the parent is in the same
 	// session.
 	tg.processGroup.session.incRef()
-	pg := &ProcessGroup{
+	pg := ProcessGroup{
 		id:         ProcessGroupID(id),
 		originator: tg,
 		session:    tg.processGroup.session,
 	}
+	pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+
 	if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
 		pg.ancestors++
 	}
@@ -390,20 +394,20 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 	// Assign the new process group; adjust children.
 	oldParentPG := tg.parentPG()
 	tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
-		childTG.processGroup.incRefWithParent(pg)
+		childTG.processGroup.incRefWithParent(&pg)
 		childTG.processGroup.decRefWithParent(oldParentPG)
 	})
 	tg.processGroup.decRefWithParent(oldParentPG)
-	tg.processGroup = pg
+	tg.processGroup = &pg
 
 	// Add the new process group to the session.
-	pg.session.processGroups.PushBack(pg)
+	pg.session.processGroups.PushBack(&pg)
 
 	// Ensure this translation is added to all namespaces.
 	for ns := tg.pidns; ns != nil; ns = ns.parent {
 		local := ns.tgids[tg]
-		ns.pgids[pg] = ProcessGroupID(local)
-		ns.processGroups[ProcessGroupID(local)] = pg
+		ns.pgids[&pg] = ProcessGroupID(local)
+		ns.processGroups[ProcessGroupID(local)] = &pg
 	}
 
 	return nil
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 3e9fe70e2..5bd610f68 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -224,6 +224,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
 		creatorPID:    pid,
 		changeTime:    ktime.NowFromContext(ctx),
 	}
+	shm.EnableLeakCheck("kernel.Shm")
 
 	// Find the next available ID.
 	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 175d1b247..8227ecf1d 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -67,6 +67,7 @@ func (s *syslog) Log() []byte {
 		"Creating process schedule...",
 		"Generating random numbers by fair dice roll...",
 		"Rewriting operating system in Javascript...",
+		"Reticulating splines...",
 		"Consulting tar man page...",
 		"Forking spaghetti code...",
 		"Checking naughty and nice process list...",
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 7ed589a02..e91f82bb3 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -236,15 +236,15 @@ type Task struct {
 	// tc is protected by mu, and is owned by the task goroutine.
 	tc TaskContext
 
-	// fsc is the task's filesystem context.
+	// fsContext is the task's filesystem context.
 	//
-	// fsc is protected by mu, and is owned by the task goroutine.
-	fsc *FSContext
+	// fsContext is protected by mu, and is owned by the task goroutine.
+	fsContext *FSContext
 
-	// fds is the task's file descriptor table.
+	// fdTable is the task's file descriptor table.
 	//
-	// fds is protected by mu, and is owned by the task goroutine.
-	fds *FDMap
+	// fdTable is protected by mu, and is owned by the task goroutine.
+	fdTable *FDTable
 
 	// If vforkParent is not nil, it is the task that created this task with
 	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
@@ -386,10 +386,11 @@ type Task struct {
 
 	// creds is the task's credentials.
 	//
-	// creds is protected by mu, however the value itself is immutable and can
-	// only be changed by a copy. After reading the pointer, access will
-	// proceed outside the scope of mu. creds is owned by the task goroutine.
-	creds *auth.Credentials
+	// creds.Load() may be called without synchronization. creds.Store() is
+	// serialized by mu. creds is owned by the task goroutine. All
+	// auth.Credentials objects that creds may point to, or have pointed to
+	// in the past, must be treated as immutable.
+	creds auth.AtomicPtrCredentials
 
 	// utsns is the task's UTS namespace.
 	//
@@ -597,11 +598,11 @@ func (t *Task) Value(key interface{}) interface{} {
 	case CtxTask:
 		return t
 	case auth.CtxCredentials:
-		return t.creds
+		return t.Credentials()
 	case context.CtxThreadGroupID:
 		return int32(t.ThreadGroup().ID())
 	case fs.CtxRoot:
-		return t.fsc.RootDirectory()
+		return t.fsContext.RootDirectory()
 	case fs.CtxDirentCacheLimiter:
 		return t.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -667,7 +668,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
 func (t *Task) IsChrooted() bool {
 	realRoot := t.tg.mounts.Root()
 	defer realRoot.DecRef()
-	root := t.fsc.RootDirectory()
+	root := t.fsContext.RootDirectory()
 	if root != nil {
 		defer root.DecRef()
 	}
@@ -688,23 +689,62 @@ func (t *Task) TaskContext() *TaskContext {
 // Precondition: The caller must be running on the task goroutine, or t.mu must
 // be locked.
 func (t *Task) FSContext() *FSContext {
-	return t.fsc
+	return t.fsContext
 }
 
-// FDMap returns t's FDMap. FDMap does not take an additional reference on the
-// returned FDMap.
+// FDTable returns t's FDTable. FDTable does not take an additional reference
+// on the returned FDTable.
 //
 // Precondition: The caller must be running on the task goroutine, or t.mu must
 // be locked.
-func (t *Task) FDMap() *FDMap {
-	return t.fds
+func (t *Task) FDTable() *FDTable {
+	return t.fdTable
+}
+
+// GetFile is a convenience wrapper for t.FDTable().Get.
+//
+// Precondition: same as FDTable.
+func (t *Task) GetFile(fd int32) *fs.File {
+	f, _ := t.fdTable.Get(fd)
+	return f
+}
+
+// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDs(t, fd, files, flags)
+}
+
+// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
+	fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
+	if err != nil {
+		return 0, err
+	}
+	return fds[0], nil
+}
+
+// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
+	return t.fdTable.NewFDAt(t, fd, file, flags)
 }
 
 // WithMuLocked executes f with t.mu locked.
 func (t *Task) WithMuLocked(f func(*Task)) {
 	t.mu.Lock()
-	defer t.mu.Unlock()
 	f(t)
+	t.mu.Unlock()
 }
 
 // MountNamespace returns t's MountNamespace. MountNamespace does not take an
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 0e621f0d1..0916fd658 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -214,20 +214,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		}
 	}
 
-	var fsc *FSContext
+	var fsContext *FSContext
 	if opts.NewFSContext {
-		fsc = t.fsc.Fork()
+		fsContext = t.fsContext.Fork()
 	} else {
-		fsc = t.fsc
-		fsc.IncRef()
+		fsContext = t.fsContext
+		fsContext.IncRef()
 	}
 
-	var fds *FDMap
+	var fdTable *FDTable
 	if opts.NewFiles {
-		fds = t.fds.Fork()
+		fdTable = t.fdTable.Fork()
 	} else {
-		fds = t.fds
-		fds.IncRef()
+		fdTable = t.fdTable
+		fdTable.IncRef()
 	}
 
 	pidns := t.tg.pidns
@@ -251,8 +251,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ThreadGroup:             tg,
 		SignalMask:              t.SignalMask(),
 		TaskContext:             tc,
-		FSContext:               fsc,
-		FDMap:                   fds,
+		FSContext:               fsContext,
+		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
 		NetworkNamespaced:       t.netns,
@@ -425,6 +425,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 	if opts.NewAddressSpace || opts.NewSignalHandlers {
 		return syserror.EINVAL
 	}
+	creds := t.Credentials()
 	if opts.NewThreadGroup {
 		t.tg.signalHandlers.mu.Lock()
 		if t.tg.tasksCount != 1 {
@@ -439,8 +440,6 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		if t.IsChrooted() {
 			return syserror.EPERM
 		}
-		// This temporary is needed because Go.
-		creds := t.Credentials()
 		newUserNS, err := creds.NewChildUserNamespace()
 		if err != nil {
 			return err
@@ -449,6 +448,8 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		if err != nil {
 			return err
 		}
+		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
+		creds = t.Credentials()
 	}
 	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
 	if opts.NewPIDNamespace {
@@ -473,7 +474,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		}
 		// Note that this must happen after NewUserNamespace, so the
 		// new user namespace is used if there is one.
-		t.utsns = t.utsns.Clone(t.creds.UserNamespace)
+		t.utsns = t.utsns.Clone(creds.UserNamespace)
 	}
 	if opts.NewIPCNamespace {
 		if !haveCapSysAdmin {
@@ -482,24 +483,24 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		}
 		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
 		// namespace"
-		t.ipcns = NewIPCNamespace(t.creds.UserNamespace)
+		t.ipcns = NewIPCNamespace(creds.UserNamespace)
 	}
-	var oldfds *FDMap
+	var oldFDTable *FDTable
 	if opts.NewFiles {
-		oldfds = t.fds
-		t.fds = oldfds.Fork()
+		oldFDTable = t.fdTable
+		t.fdTable = oldFDTable.Fork()
 	}
-	var oldfsc *FSContext
+	var oldFSContext *FSContext
 	if opts.NewFSContext {
-		oldfsc = t.fsc
-		t.fsc = oldfsc.Fork()
+		oldFSContext = t.fsContext
+		t.fsContext = oldFSContext.Fork()
 	}
 	t.mu.Unlock()
-	if oldfds != nil {
-		oldfds.DecRef()
+	if oldFDTable != nil {
+		oldFDTable.DecRef()
 	}
-	if oldfsc != nil {
-		oldfsc.DecRef()
+	if oldFSContext != nil {
+		oldFSContext.DecRef()
 	}
 	return nil
 }
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index cd85acaef..17a089b90 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -195,7 +195,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.tg.pidns.owner.mu.Unlock()
 
 	// Remove FDs with the CloseOnExec flag set.
-	t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+	t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool {
 		return flags.CloseOnExec
 	})
 
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b97d65185..535f03e50 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -265,8 +265,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
 	t.unstopVforkParent()
 
-	t.fsc.DecRef()
-	t.fds.DecRef()
+	t.fsContext.DecRef()
+	t.fdTable.DecRef()
 
 	// If this is the last task to exit from the thread group, release the
 	// thread group's resources.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index d77dabc05..c211b5b74 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -34,14 +34,14 @@ func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
 	})
 }
 
-// CompareAndSwapUint32 implemets futex.Target.CompareAndSwapUint32.
+// CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32.
 func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
 	return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{
 		AddressSpaceActive: true,
 	})
 }
 
-// LoadUint32 implemets futex.Target.LoadUint32.
+// LoadUint32 implements futex.Target.LoadUint32.
 func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
 	return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{
 		AddressSpaceActive: true,
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index 39c138925..78ff14b20 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -25,30 +25,22 @@ import (
 //
 // This value must be considered immutable.
 func (t *Task) Credentials() *auth.Credentials {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds
+	return t.creds.Load()
 }
 
 // UserNamespace returns the user namespace associated with the task.
 func (t *Task) UserNamespace() *auth.UserNamespace {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds.UserNamespace
+	return t.Credentials().UserNamespace
 }
 
 // HasCapabilityIn checks if the task has capability cp in user namespace ns.
 func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds.HasCapabilityIn(cp, ns)
+	return t.Credentials().HasCapabilityIn(cp, ns)
 }
 
 // HasCapability checks if the task has capability cp in its user namespace.
 func (t *Task) HasCapability(cp linux.Capability) bool {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds.HasCapability(cp)
+	return t.Credentials().HasCapability(cp)
 }
 
 // SetUID implements the semantics of setuid(2).
@@ -57,9 +49,12 @@ func (t *Task) SetUID(uid auth.UID) error {
 	if !uid.Ok() {
 		return syserror.EINVAL
 	}
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	kuid := t.creds.UserNamespace.MapToKUID(uid)
+
+	creds := t.Credentials()
+	kuid := creds.UserNamespace.MapToKUID(uid)
 	if !kuid.Ok() {
 		return syserror.EINVAL
 	}
@@ -67,17 +62,17 @@ func (t *Task) SetUID(uid auth.UID) error {
 	// effective UID of the caller is root (more precisely: if the caller has
 	// the CAP_SETUID capability), the real UID and saved set-user-ID are also
 	// set." - setuid(2)
-	if t.creds.HasCapability(linux.CAP_SETUID) {
+	if creds.HasCapability(linux.CAP_SETUID) {
 		t.setKUIDsUncheckedLocked(kuid, kuid, kuid)
 		return nil
 	}
 	// "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID
 	// capability) and uid does not match the real UID or saved set-user-ID of
 	// the calling process."
-	if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID {
+	if kuid != creds.RealKUID && kuid != creds.SavedKUID {
 		return syserror.EPERM
 	}
-	t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID)
+	t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID)
 	return nil
 }
 
@@ -87,37 +82,38 @@ func (t *Task) SetREUID(r, e auth.UID) error {
 	defer t.mu.Unlock()
 	// "Supplying a value of -1 for either the real or effective user ID forces
 	// the system to leave that ID unchanged." - setreuid(2)
-	newR := t.creds.RealKUID
+	creds := t.Credentials()
+	newR := creds.RealKUID
 	if r.Ok() {
-		newR = t.creds.UserNamespace.MapToKUID(r)
+		newR = creds.UserNamespace.MapToKUID(r)
 		if !newR.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	newE := t.creds.EffectiveKUID
+	newE := creds.EffectiveKUID
 	if e.Ok() {
-		newE = t.creds.UserNamespace.MapToKUID(e)
+		newE = creds.UserNamespace.MapToKUID(e)
 		if !newE.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	if !t.creds.HasCapability(linux.CAP_SETUID) {
+	if !creds.HasCapability(linux.CAP_SETUID) {
 		// "Unprivileged processes may only set the effective user ID to the
 		// real user ID, the effective user ID, or the saved set-user-ID."
-		if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID {
+		if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID {
 			return syserror.EPERM
 		}
 		// "Unprivileged users may only set the real user ID to the real user
 		// ID or the effective user ID."
-		if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID {
+		if newR != creds.RealKUID && newR != creds.EffectiveKUID {
 			return syserror.EPERM
 		}
 	}
 	// "If the real user ID is set (i.e., ruid is not -1) or the effective user
 	// ID is set to a value not equal to the previous real user ID, the saved
 	// set-user-ID will be set to the new effective user ID."
-	newS := t.creds.SavedKUID
-	if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) {
+	newS := creds.SavedKUID
+	if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) {
 		newS = newE
 	}
 	t.setKUIDsUncheckedLocked(newR, newE, newS)
@@ -136,23 +132,24 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error {
 	// arguments equals -1, the corresponding value is not changed." -
 	// setresuid(2)
 	var err error
-	newR := t.creds.RealKUID
+	creds := t.Credentials()
+	newR := creds.RealKUID
 	if r.Ok() {
-		newR, err = t.creds.UseUID(r)
+		newR, err = creds.UseUID(r)
 		if err != nil {
 			return err
 		}
 	}
-	newE := t.creds.EffectiveKUID
+	newE := creds.EffectiveKUID
 	if e.Ok() {
-		newE, err = t.creds.UseUID(e)
+		newE, err = creds.UseUID(e)
 		if err != nil {
 			return err
 		}
 	}
-	newS := t.creds.SavedKUID
+	newS := creds.SavedKUID
 	if s.Ok() {
-		newS, err = t.creds.UseUID(s)
+		newS, err = creds.UseUID(s)
 		if err != nil {
 			return err
 		}
@@ -163,10 +160,10 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error {
 
 // Preconditions: t.mu must be locked.
 func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
-	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
-	oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS
+	creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds.
+	root := creds.UserNamespace.MapToKUID(auth.RootUID)
+	oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID
+	creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS
 
 	// "1. If one or more of the real, effective or saved set user IDs was
 	// previously 0, and as a result of the UID changes all of these IDs have a
@@ -184,9 +181,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
 		// being cleared." (A thread's effective capability set is always
 		// cleared when such a credential change is made,
 		// regardless of the setting of the "keep capabilities" flag.)
-		if !t.creds.KeepCaps {
-			t.creds.PermittedCaps = 0
-			t.creds.EffectiveCaps = 0
+		if !creds.KeepCaps {
+			creds.PermittedCaps = 0
+			creds.EffectiveCaps = 0
 		}
 	}
 	// """
@@ -197,9 +194,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
 	// permitted set is copied to the effective set.
 	// """
 	if oldE == root && newE != root {
-		t.creds.EffectiveCaps = 0
+		creds.EffectiveCaps = 0
 	} else if oldE != root && newE == root {
-		t.creds.EffectiveCaps = t.creds.PermittedCaps
+		creds.EffectiveCaps = creds.PermittedCaps
 	}
 	// "4. If the filesystem user ID is changed from 0 to nonzero (see
 	// setfsuid(2)), then the following capabilities are cleared from the
@@ -220,6 +217,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
 		// Not documented, but compare Linux's kernel/cred.c:commit_creds().
 		t.parentDeathSignal = 0
 	}
+	t.creds.Store(creds)
 }
 
 // SetGID implements the semantics of setgid(2).
@@ -227,20 +225,23 @@ func (t *Task) SetGID(gid auth.GID) error {
 	if !gid.Ok() {
 		return syserror.EINVAL
 	}
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	kgid := t.creds.UserNamespace.MapToKGID(gid)
+
+	creds := t.Credentials()
+	kgid := creds.UserNamespace.MapToKGID(gid)
 	if !kgid.Ok() {
 		return syserror.EINVAL
 	}
-	if t.creds.HasCapability(linux.CAP_SETGID) {
+	if creds.HasCapability(linux.CAP_SETGID) {
 		t.setKGIDsUncheckedLocked(kgid, kgid, kgid)
 		return nil
 	}
-	if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID {
+	if kgid != creds.RealKGID && kgid != creds.SavedKGID {
 		return syserror.EPERM
 	}
-	t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID)
+	t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID)
 	return nil
 }
 
@@ -248,30 +249,32 @@ func (t *Task) SetGID(gid auth.GID) error {
 func (t *Task) SetREGID(r, e auth.GID) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	newR := t.creds.RealKGID
+
+	creds := t.Credentials()
+	newR := creds.RealKGID
 	if r.Ok() {
-		newR = t.creds.UserNamespace.MapToKGID(r)
+		newR = creds.UserNamespace.MapToKGID(r)
 		if !newR.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	newE := t.creds.EffectiveKGID
+	newE := creds.EffectiveKGID
 	if e.Ok() {
-		newE = t.creds.UserNamespace.MapToKGID(e)
+		newE = creds.UserNamespace.MapToKGID(e)
 		if !newE.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	if !t.creds.HasCapability(linux.CAP_SETGID) {
-		if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID {
+	if !creds.HasCapability(linux.CAP_SETGID) {
+		if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID {
 			return syserror.EPERM
 		}
-		if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID {
+		if newR != creds.RealKGID && newR != creds.EffectiveKGID {
 			return syserror.EPERM
 		}
 	}
-	newS := t.creds.SavedKGID
-	if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) {
+	newS := creds.SavedKGID
+	if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) {
 		newS = newE
 	}
 	t.setKGIDsUncheckedLocked(newR, newE, newS)
@@ -280,26 +283,29 @@ func (t *Task) SetREGID(r, e auth.GID) error {
 
 // SetRESGID implements the semantics of the setresgid(2) syscall.
 func (t *Task) SetRESGID(r, e, s auth.GID) error {
+	var err error
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	var err error
-	newR := t.creds.RealKGID
+
+	creds := t.Credentials()
+	newR := creds.RealKGID
 	if r.Ok() {
-		newR, err = t.creds.UseGID(r)
+		newR, err = creds.UseGID(r)
 		if err != nil {
 			return err
 		}
 	}
-	newE := t.creds.EffectiveKGID
+	newE := creds.EffectiveKGID
 	if e.Ok() {
-		newE, err = t.creds.UseGID(e)
+		newE, err = creds.UseGID(e)
 		if err != nil {
 			return err
 		}
 	}
-	newS := t.creds.SavedKGID
+	newS := creds.SavedKGID
 	if s.Ok() {
-		newS, err = t.creds.UseGID(s)
+		newS, err = creds.UseGID(s)
 		if err != nil {
 			return err
 		}
@@ -309,9 +315,9 @@ func (t *Task) SetRESGID(r, e, s auth.GID) error {
 }
 
 func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
-	oldE := t.creds.EffectiveKGID
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS
+	creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds.
+	oldE := creds.EffectiveKGID
+	creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS
 
 	if oldE != newE {
 		// "[dumpability] is reset to the current value contained in
@@ -327,6 +333,7 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
 		// kernel/cred.c:commit_creds().
 		t.parentDeathSignal = 0
 	}
+	t.creds.Store(creds)
 }
 
 // SetExtraGIDs attempts to change t's supplemental groups. All IDs are
@@ -334,19 +341,21 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
 func (t *Task) SetExtraGIDs(gids []auth.GID) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	if !t.creds.HasCapability(linux.CAP_SETGID) {
+	creds := t.Credentials()
+	if !creds.HasCapability(linux.CAP_SETGID) {
 		return syserror.EPERM
 	}
 	kgids := make([]auth.KGID, len(gids))
 	for i, gid := range gids {
-		kgid := t.creds.UserNamespace.MapToKGID(gid)
+		kgid := creds.UserNamespace.MapToKGID(gid)
 		if !kgid.Ok() {
 			return syserror.EINVAL
 		}
 		kgids[i] = kgid
 	}
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.ExtraKGIDs = kgids
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.ExtraKGIDs = kgids
+	t.creds.Store(creds)
 	return nil
 }
 
@@ -360,27 +369,29 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili
 	if effective & ^permitted != 0 {
 		return syserror.EPERM
 	}
+	creds := t.Credentials()
 	// "It is also a limiting superset for the capabilities that may be added
 	// to the inheritable set by a thread that does not have the CAP_SETPCAP
 	// capability in its effective set."
-	if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) {
+	if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) {
 		return syserror.EPERM
 	}
 	// "If a thread drops a capability from its permitted set, it can never
 	// reacquire that capability (unless it execve(2)s ..."
-	if permitted & ^t.creds.PermittedCaps != 0 {
+	if permitted & ^creds.PermittedCaps != 0 {
 		return syserror.EPERM
 	}
 	// "... if a capability is not in the bounding set, then a thread can't add
 	// this capability to its inheritable set, even if it was in its permitted
 	// capabilities ..."
-	if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 {
+	if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 {
 		return syserror.EPERM
 	}
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.PermittedCaps = permitted
-	t.creds.InheritableCaps = inheritable
-	t.creds.EffectiveCaps = effective
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.PermittedCaps = permitted
+	creds.InheritableCaps = inheritable
+	creds.EffectiveCaps = effective
+	t.creds.Store(creds)
 	return nil
 }
 
@@ -389,11 +400,13 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili
 func (t *Task) DropBoundingCapability(cp linux.Capability) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	if !t.creds.HasCapability(linux.CAP_SETPCAP) {
+	creds := t.Credentials()
+	if !creds.HasCapability(linux.CAP_SETPCAP) {
 		return syserror.EPERM
 	}
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
+	t.creds.Store(creds)
 	return nil
 }
 
@@ -402,31 +415,33 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
+	creds := t.Credentials()
 	// "A process reassociating itself with a user namespace must have the
 	// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
 	//
 	// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
 	// in ns (by rule 3 in auth.Credentials.HasCapability).
-	if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
 		return syserror.EPERM
 	}
 
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.UserNamespace = ns
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.UserNamespace = ns
 	// "The child process created by clone(2) with the CLONE_NEWUSER flag
 	// starts out with a complete set of capabilities in the new user
 	// namespace. Likewise, a process that creates a new user namespace using
 	// unshare(2) or joins an existing user namespace using setns(2) gains a
 	// full set of capabilities in that namespace."
-	t.creds.PermittedCaps = auth.AllCapabilities
-	t.creds.InheritableCaps = 0
-	t.creds.EffectiveCaps = auth.AllCapabilities
-	t.creds.BoundingCaps = auth.AllCapabilities
+	creds.PermittedCaps = auth.AllCapabilities
+	creds.InheritableCaps = 0
+	creds.EffectiveCaps = auth.AllCapabilities
+	creds.BoundingCaps = auth.AllCapabilities
 	// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
 	// flag sets the "securebits" flags (see capabilities(7)) to their default
 	// values (all flags disabled) in the child (for clone(2)) or caller (for
 	// unshare(2), or setns(2)." - user_namespaces(7)
-	t.creds.KeepCaps = false
+	creds.KeepCaps = false
+	t.creds.Store(creds)
 
 	return nil
 }
@@ -435,8 +450,9 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
 func (t *Task) SetKeepCaps(k bool) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.KeepCaps = k
+	creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds.
+	creds.KeepCaps = k
+	t.creds.Store(creds)
 }
 
 // updateCredsForExec updates t.creds to reflect an execve().
@@ -512,15 +528,16 @@ func (t *Task) updateCredsForExecLocked() {
 	// the effective user ID.
 	var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0
 	fileEffective := false
-	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
-	if t.creds.EffectiveKUID == root || t.creds.RealKUID == root {
-		newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps
-		if t.creds.EffectiveKUID == root {
+	creds := t.Credentials()
+	root := creds.UserNamespace.MapToKUID(auth.RootUID)
+	if creds.EffectiveKUID == root || creds.RealKUID == root {
+		newPermitted = creds.InheritableCaps | creds.BoundingCaps
+		if creds.EffectiveKUID == root {
 			fileEffective = true
 		}
 	}
 
-	t.creds = t.creds.Fork() // See doc for creds.
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
 
 	// Now we enter poorly-documented, somewhat confusing territory. (The
 	// accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds
@@ -562,27 +579,28 @@ func (t *Task) updateCredsForExecLocked() {
 	// But since no_new_privs is always set (A3 is always true), this becomes
 	// much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1
 	// is a no-op. So we can just do C1 and C2 unconditionally.
-	if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
-		t.creds.EffectiveKUID = t.creds.RealKUID
-		t.creds.EffectiveKGID = t.creds.RealKGID
+	if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID {
+		creds.EffectiveKUID = creds.RealKUID
+		creds.EffectiveKGID = creds.RealKGID
 		t.parentDeathSignal = 0
 	}
 	// (Saved set-user-ID is always set to the new effective user ID, and saved
 	// set-group-ID is always set to the new effective group ID, regardless of
 	// the above.)
-	t.creds.SavedKUID = t.creds.RealKUID
-	t.creds.SavedKGID = t.creds.RealKGID
-	t.creds.PermittedCaps &= newPermitted
+	creds.SavedKUID = creds.RealKUID
+	creds.SavedKGID = creds.RealKGID
+	creds.PermittedCaps &= newPermitted
 	if fileEffective {
-		t.creds.EffectiveCaps = t.creds.PermittedCaps
+		creds.EffectiveCaps = creds.PermittedCaps
 	} else {
-		t.creds.EffectiveCaps = 0
+		creds.EffectiveCaps = 0
 	}
 
 	// prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
 	// calls to execve(2).
-	t.creds.KeepCaps = false
+	creds.KeepCaps = false
 
 	// "The bounding set is inherited at fork(2) from the thread's parent, and
 	// is preserved across an execve(2)". So we're done.
+	t.creds.Store(creds)
 }
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index cf48663b6..a29e9b9eb 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() {
 	if mm := t.MemoryManager(); mm != nil {
 		t.Debugf("Mappings:\n%s", mm)
 	}
-	t.Debugf("FDMap:\n%s", t.fds)
+	t.Debugf("FDTable:\n%s", t.fdTable)
 }
 
 // debugDumpRegisters logs register state at log level debug.
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 9458f5c2a..a88bf3951 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -52,9 +52,10 @@ type TaskConfig struct {
 	// succeeds.
 	FSContext *FSContext
 
-	// FDMap is the FDMap of the new task. A reference must be held on FDMap,
-	// which is transferred to TaskSet.NewTask whether or not it succeeds.
-	FDMap *FDMap
+	// FDTable is the FDTable of the new task. A reference must be held on
+	// FDTable, which is transferred to TaskSet.NewTask whether or not it
+	// succeeds.
+	FDTable *FDTable
 
 	// Credentials is the Credentials of the new task.
 	Credentials *auth.Credentials
@@ -90,7 +91,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
 	if err != nil {
 		cfg.TaskContext.release()
 		cfg.FSContext.DecRef()
-		cfg.FDMap.DecRef()
+		cfg.FDTable.DecRef()
 		return nil, err
 	}
 	return t, nil
@@ -112,14 +113,13 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		signalMask:      cfg.SignalMask,
 		signalStack:     arch.SignalStack{Flags: arch.SignalStackFlagDisable},
 		tc:              *tc,
-		fsc:             cfg.FSContext,
-		fds:             cfg.FDMap,
+		fsContext:       cfg.FSContext,
+		fdTable:         cfg.FDTable,
 		p:               cfg.Kernel.Platform.NewContext(),
 		k:               cfg.Kernel,
 		ptraceTracees:   make(map[*Task]struct{}),
 		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
 		ioUsage:         &usage.IO{},
-		creds:           cfg.Credentials,
 		niceness:        cfg.Niceness,
 		netns:           cfg.NetworkNamespaced,
 		utsns:           cfg.UTSNamespace,
@@ -129,6 +129,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		futexWaiter:     futex.NewWaiter(),
 		containerID:     cfg.ContainerID,
 	}
+	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
 	t.ptraceTracer.Store((*Task)(nil))
 	// We don't construct t.blockingTimer until Task.run(); see that function
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index e735a5dd0..10c6e455c 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -172,7 +172,7 @@ func (t *Task) beginStopLocked() {
 	}
 }
 
-// endStopLocked decerements t.stopCount to indicate that an existing internal
+// endStopLocked decrements t.stopCount to indicate that an existing internal
 // or external stop no longer applies to t.
 //
 // Preconditions: The signal mutex must be locked.
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 3562ef179..2a97e3e8e 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -268,7 +268,7 @@ func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh
 	return tg
 }
 
-// saveRscr is invopked by stateify.
+// saveRscr is invoked by stateify.
 func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
 	return tg.rscr.Load().(*RSEQCriticalRegion)
 }
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 9c3c05239..aa6c75d25 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -142,6 +142,11 @@ func (t Time) Timeval() linux.Timeval {
 	return linux.NsecToTimeval(t.Nanoseconds())
 }
 
+// StatxTimestamp converts Time to a Linux statx_timestamp.
+func (t Time) StatxTimestamp() linux.StatxTimestamp {
+	return linux.NsecToStatxTimestamp(t.Nanoseconds())
+}
+
 // Add adds the duration of d to t.
 func (t Time) Add(d time.Duration) Time {
 	if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) {
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index eadacfea2..76417342a 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -122,7 +122,7 @@ func (t *Timekeeper) SetClocks(c sentrytime.Clocks) {
 	//
 	// In a restored sentry, monotonic time jumps forward by approximately
 	// the same amount as real time. There are no guarantees here, we are
-	// just making a best-effort attempt to to make it appear that the app
+	// just making a best-effort attempt to make it appear that the app
 	// was simply not scheduled for a long period, rather than that the
 	// real time clock was changed.
 	//