Diffstat (limited to 'pkg/sentry')
-rw-r--r--  pkg/sentry/arch/arch_state_x86.go  2
-rw-r--r--  pkg/sentry/arch/arch_x86.go  2
-rw-r--r--  pkg/sentry/arch/arch_x86_impl.go  2
-rw-r--r--  pkg/sentry/arch/signal_stack.go  2
-rw-r--r--  pkg/sentry/arch/stack.go  3
-rw-r--r--  pkg/sentry/contexttest/contexttest.go  4
-rw-r--r--  pkg/sentry/fs/copy_up.go  28
-rw-r--r--  pkg/sentry/fs/dirent.go  6
-rw-r--r--  pkg/sentry/fs/fdpipe/pipe_test.go  4
-rw-r--r--  pkg/sentry/fs/gofer/file_state.go  1
-rw-r--r--  pkg/sentry/fs/gofer/handles.go  1
-rw-r--r--  pkg/sentry/fs/gofer/inode.go  5
-rw-r--r--  pkg/sentry/fs/gofer/inode_state.go  1
-rw-r--r--  pkg/sentry/fs/gofer/session_state.go  1
-rw-r--r--  pkg/sentry/fs/gofer/util.go  16
-rw-r--r--  pkg/sentry/fs/host/inode.go  3
-rw-r--r--  pkg/sentry/fs/host/socket_test.go  6
-rw-r--r--  pkg/sentry/fs/inode.go  3
-rw-r--r--  pkg/sentry/fs/inode_overlay.go  3
-rw-r--r--  pkg/sentry/fs/proc/mounts.go  3
-rw-r--r--  pkg/sentry/fs/proc/sys_net.go  4
-rw-r--r--  pkg/sentry/fs/proc/task.go  28
-rw-r--r--  pkg/sentry/fs/tmpfs/fs.go  3
-rw-r--r--  pkg/sentry/fsbridge/vfs.go  28
-rw-r--r--  pkg/sentry/fsimpl/ext/filesystem.go  4
-rw-r--r--  pkg/sentry/fsimpl/gofer/BUILD  12
-rw-r--r--  pkg/sentry/fsimpl/gofer/directory.go  7
-rw-r--r--  pkg/sentry/fsimpl/gofer/filesystem.go  30
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer.go  118
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer_test.go  64
-rw-r--r--  pkg/sentry/fsimpl/gofer/p9file.go  14
-rw-r--r--  pkg/sentry/fsimpl/gofer/regular_file.go  7
-rw-r--r--  pkg/sentry/fsimpl/gofer/special_file.go  4
-rw-r--r--  pkg/sentry/fsimpl/gofer/symlink.go  2
-rw-r--r--  pkg/sentry/fsimpl/gofer/time.go  39
-rw-r--r--  pkg/sentry/fsimpl/host/host.go  42
-rw-r--r--  pkg/sentry/fsimpl/kernfs/filesystem.go  51
-rw-r--r--  pkg/sentry/fsimpl/kernfs/inode_impl_util.go  5
-rw-r--r--  pkg/sentry/fsimpl/kernfs/kernfs.go  16
-rw-r--r--  pkg/sentry/fsimpl/kernfs/symlink.go  5
-rw-r--r--  pkg/sentry/fsimpl/proc/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/proc/task.go  16
-rw-r--r--  pkg/sentry/fsimpl/proc/task_fds.go  125
-rw-r--r--  pkg/sentry/fsimpl/proc/task_files.go  256
-rw-r--r--  pkg/sentry/fsimpl/proc/task_net.go  92
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_files.go  10
-rw-r--r--  pkg/sentry/fsimpl/sockfs/BUILD  17
-rw-r--r--  pkg/sentry/fsimpl/sockfs/sockfs.go  102
-rw-r--r--  pkg/sentry/fsimpl/testutil/testutil.go  3
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/BUILD  2
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/directory.go  2
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/filesystem.go  75
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/regular_file.go  4
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/socket_file.go  34
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/stat_test.go  12
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/tmpfs.go  186
-rw-r--r--  pkg/sentry/kernel/BUILD  2
-rw-r--r--  pkg/sentry/kernel/fd_table.go  55
-rw-r--r--  pkg/sentry/kernel/kernel.go  104
-rw-r--r--  pkg/sentry/kernel/pipe/pipe.go  13
-rw-r--r--  pkg/sentry/kernel/ptrace.go  1
-rw-r--r--  pkg/sentry/kernel/rseq.go  2
-rw-r--r--  pkg/sentry/kernel/shm/shm.go  2
-rw-r--r--  pkg/sentry/kernel/syscalls.go  36
-rw-r--r--  pkg/sentry/kernel/syscalls_state.go  36
-rw-r--r--  pkg/sentry/kernel/task.go  9
-rw-r--r--  pkg/sentry/kernel/task_context.go  3
-rw-r--r--  pkg/sentry/kernel/task_identity.go  2
-rw-r--r--  pkg/sentry/kernel/task_signals.go  2
-rw-r--r--  pkg/sentry/kernel/time/time.go  10
-rw-r--r--  pkg/sentry/mm/address_space.go  6
-rw-r--r--  pkg/sentry/mm/aio_context.go  101
-rw-r--r--  pkg/sentry/mm/aio_context_state.go  2
-rw-r--r--  pkg/sentry/platform/kvm/BUILD  1
-rw-r--r--  pkg/sentry/platform/kvm/bluepill.go  12
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_unsafe.go  97
-rw-r--r--  pkg/sentry/platform/kvm/kvm_arm64.go  4
-rw-r--r--  pkg/sentry/platform/kvm/kvm_const.go  1
-rw-r--r--  pkg/sentry/platform/kvm/kvm_test.go  64
-rw-r--r--  pkg/sentry/platform/kvm/machine.go  9
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64_unsafe.go  25
-rw-r--r--  pkg/sentry/platform/kvm/machine_arm64_unsafe.go  89
-rw-r--r--  pkg/sentry/platform/kvm/machine_unsafe.go  41
-rw-r--r--  pkg/sentry/platform/ring0/BUILD  2
-rw-r--r--  pkg/sentry/platform/ring0/lib_arm64.go  7
-rw-r--r--  pkg/sentry/platform/ring0/lib_arm64_unsafe.go  108
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_x86.go  2
-rw-r--r--  pkg/sentry/platform/ring0/x86.go  2
-rw-r--r--  pkg/sentry/socket/BUILD  1
-rw-r--r--  pkg/sentry/socket/netstack/netstack.go  169
-rw-r--r--  pkg/sentry/socket/netstack/provider.go  8
-rw-r--r--  pkg/sentry/socket/socket.go  89
-rw-r--r--  pkg/sentry/socket/unix/BUILD  4
-rw-r--r--  pkg/sentry/socket/unix/transport/BUILD  1
-rw-r--r--  pkg/sentry/socket/unix/transport/unix.go  50
-rw-r--r--  pkg/sentry/socket/unix/unix.go  89
-rw-r--r--  pkg/sentry/socket/unix/unix_vfs2.go  348
-rw-r--r--  pkg/sentry/strace/strace.go  3
-rw-r--r--  pkg/sentry/syscalls/linux/sys_aio.go  34
-rw-r--r--  pkg/sentry/syscalls/linux/sys_prctl.go  4
-rw-r--r--  pkg/sentry/syscalls/linux/sys_socket.go  9
-rw-r--r--  pkg/sentry/syscalls/linux/sys_splice.go  4
-rw-r--r--  pkg/sentry/syscalls/linux/sys_stat.go  6
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/BUILD  7
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/epoll.go  4
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/getdents.go  20
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go  46
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/poll.go  14
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/setstat.go  2
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/socket.go  1138
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/stat.go  23
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go  123
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/xattr.go  13
-rw-r--r--  pkg/sentry/vfs/BUILD  2
-rw-r--r--  pkg/sentry/vfs/anonfs.go  4
-rw-r--r--  pkg/sentry/vfs/epoll.go  2
-rw-r--r--  pkg/sentry/vfs/file_description.go  60
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util.go  4
-rw-r--r--  pkg/sentry/vfs/filesystem.go  26
-rw-r--r--  pkg/sentry/vfs/memxattr/BUILD  15
-rw-r--r--  pkg/sentry/vfs/memxattr/xattr.go  102
-rw-r--r--  pkg/sentry/vfs/mount.go  205
-rw-r--r--  pkg/sentry/vfs/mount_test.go  2
-rw-r--r--  pkg/sentry/vfs/options.go  19
-rw-r--r--  pkg/sentry/vfs/timerfd.go  142
-rw-r--r--  pkg/sentry/vfs/vfs.go  8
126 files changed, 4092 insertions, 1074 deletions
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index e35c9214a..aa31169e0 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build amd64 i386
+// +build amd64 386
package arch
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 88b40a9d1..7fc4c0473 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build amd64 i386
+// +build amd64 386
package arch
diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go
index 04ac283c6..3edf40764 100644
--- a/pkg/sentry/arch/arch_x86_impl.go
+++ b/pkg/sentry/arch/arch_x86_impl.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build amd64 i386
+// +build amd64 386
package arch
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 1a6056171..e58f055c7 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build i386 amd64 arm64
+// +build 386 amd64 arm64
package arch
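
Editor's note on the build-tag fixes above, with a minimal illustration (not part of the diff): Go's build constraints identify 32-bit x86 by the GOARCH value "386", never "i386", so the old "+build amd64 i386" line could never match and silently excluded these files from 386 builds.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Go names 32-bit x86 "386", so "+build amd64 386" is the constraint
	// that actually matches; "i386" is not a recognized GOARCH value.
	fmt.Println(runtime.GOARCH) // "amd64", "386", "arm64", ...
}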
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 09bceabc9..1108fa0bd 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -97,7 +97,6 @@ func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) {
if c < 0 {
return 0, fmt.Errorf("bad binary.Size for %T", v)
}
- // TODO(b/38173783): Use a real context.Context.
n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{})
if err != nil || c != n {
return 0, err
@@ -121,11 +120,9 @@ func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) {
var err error
if isVaddr {
value := s.Arch.Native(uintptr(0))
- // TODO(b/38173783): Use a real context.Context.
n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{})
*vaddr = usermem.Addr(s.Arch.Value(value))
} else {
- // TODO(b/38173783): Use a real context.Context.
n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{})
}
if err != nil {
diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
index 031fc64ec..8e5658c7a 100644
--- a/pkg/sentry/contexttest/contexttest.go
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -97,7 +97,7 @@ type hostClock struct {
}
// Now implements ktime.Clock.Now.
-func (hostClock) Now() ktime.Time {
+func (*hostClock) Now() ktime.Time {
return ktime.FromNanoseconds(time.Now().UnixNano())
}
@@ -127,7 +127,7 @@ func (t *TestContext) Value(key interface{}) interface{} {
case uniqueid.CtxInotifyCookie:
return atomic.AddUint32(&lastInotifyCookie, 1)
case ktime.CtxRealtimeClock:
- return hostClock{}
+ return &hostClock{}
default:
if val, ok := t.otherValues[key]; ok {
return val
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index b060a12ff..ab1424c95 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -222,8 +222,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
}
childUpper, err := parentUpper.Lookup(ctx, next.name)
if err != nil {
- log.Warningf("copy up failed to lookup directory: %v", err)
- cleanupUpper(ctx, parentUpper, next.name)
+ werr := fmt.Errorf("copy up failed to lookup directory: %v", err)
+ cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
defer childUpper.DecRef()
@@ -242,8 +242,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
}
childUpper, err := parentUpper.Lookup(ctx, next.name)
if err != nil {
- log.Warningf("copy up failed to lookup symlink: %v", err)
- cleanupUpper(ctx, parentUpper, next.name)
+ werr := fmt.Errorf("copy up failed to lookup symlink: %v", err)
+ cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
defer childUpper.DecRef()
@@ -256,23 +256,23 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
// Bring file attributes up to date. This does not include size, which will be
// brought up to date with copyContentsLocked.
if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil {
- log.Warningf("copy up failed to copy up attributes: %v", err)
- cleanupUpper(ctx, parentUpper, next.name)
+ werr := fmt.Errorf("copy up failed to copy up attributes: %v", err)
+ cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
// Copy the entire file.
if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil {
- log.Warningf("copy up failed to copy up contents: %v", err)
- cleanupUpper(ctx, parentUpper, next.name)
+ werr := fmt.Errorf("copy up failed to copy up contents: %v", err)
+ cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
lowerMappable := next.Inode.overlay.lower.Mappable()
upperMappable := childUpperInode.Mappable()
if lowerMappable != nil && upperMappable == nil {
- log.Warningf("copy up failed: cannot ensure memory mapping coherence")
- cleanupUpper(ctx, parentUpper, next.name)
+ werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence")
+ cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
@@ -324,12 +324,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
return nil
}
-// cleanupUpper removes name from parent, and panics if it is unsuccessful.
-func cleanupUpper(ctx context.Context, parent *Inode, name string) {
+// cleanupUpper is called when copy-up fails. It logs the copy-up error and
+// attempts to remove name from parent. If that fails, then it panics.
+func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) {
+ log.Warningf(copyUpErr.Error())
if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil {
// Unfortunately we don't have much choice. We shouldn't
// willingly give the caller access to a nonsense filesystem.
- panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err))
+ panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err))
}
}
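
The shape of the new cleanupUpper contract, restated as a small sketch (hypothetical names, not the overlay code): the error that triggered the rollback is logged up front and folded into the panic message if the rollback itself fails, so neither failure is lost.

package main

import (
	"errors"
	"fmt"
	"log"
)

// cleanup logs the error that caused the rollback and panics with both
// errors if the rollback step itself fails.
func cleanup(remove func() error, cause error) {
	log.Printf("%v", cause)
	if err := remove(); err != nil {
		panic(fmt.Sprintf("rollback after %q also failed: %v", cause, err))
	}
}

func main() {
	cause := errors.New("copy up failed to lookup directory: example")
	cleanup(func() error { return nil }, cause) // logs cause, no panic
}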
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 0266a5287..65be12175 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -312,9 +312,9 @@ func (d *Dirent) SyncAll(ctx context.Context) {
// There is nothing to sync for a read-only filesystem.
if !d.Inode.MountSource.Flags.ReadOnly {
- // FIXME(b/34856369): This should be a mount traversal, not a
- // Dirent traversal, because some Inodes that need to be synced
- // may no longer be reachable by name (after sys_unlink).
+ // NOTE(b/34856369): This should be a mount traversal, not a Dirent
+ // traversal, because some Inodes that need to be synced may no longer
+ // be reachable by name (after sys_unlink).
//
// Write out metadata, dirty page cached pages, and sync disk/remote
// caches.
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go
index 5aff0cc95..a0082ecca 100644
--- a/pkg/sentry/fs/fdpipe/pipe_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_test.go
@@ -119,7 +119,7 @@ func TestNewPipe(t *testing.T) {
continue
}
if flags := p.flags; test.flags != flags {
- t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags)
+ t.Errorf("%s: got file flags %v, want %v", test.desc, flags, test.flags)
continue
}
if len(test.readAheadBuffer) != len(p.readAheadBuffer) {
@@ -136,7 +136,7 @@ func TestNewPipe(t *testing.T) {
continue
}
if !fdnotifier.HasFD(int32(f.FD())) {
- t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD)
+ t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD())
}
}
}
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go
index ff96b28ba..edd6576aa 100644
--- a/pkg/sentry/fs/gofer/file_state.go
+++ b/pkg/sentry/fs/gofer/file_state.go
@@ -34,7 +34,6 @@ func (f *fileOperations) afterLoad() {
flags := f.flags
flags.Truncate = false
- // TODO(b/38173783): Context is not plumbed to save/restore.
f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps)
if err != nil {
return fmt.Errorf("failed to re-open handle: %v", err)
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index 9f7c3e89f..fc14249be 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -57,7 +57,6 @@ func (h *handles) DecRef() {
}
}
}
- // FIXME(b/38173783): Context is not plumbed here.
if err := h.File.close(context.Background()); err != nil {
log.Warningf("error closing p9 file: %v", err)
}
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 1c934981b..a016c896e 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -273,7 +273,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle
// operations on the old will see the new data. Then, make the new handle take
// ownership of the old FD and mark the old readHandle to not close the FD
// when done.
- if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil {
+ if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), syscall.O_CLOEXEC); err != nil {
return err
}
@@ -710,13 +710,10 @@ func init() {
}
// AddLink implements InodeOperations.AddLink, but is currently a noop.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
func (*inodeOperations) AddLink() {}
// DropLink implements InodeOperations.DropLink, but is currently a noop.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
func (*inodeOperations) DropLink() {}
// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
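
A small, Linux-only sketch of the dup3(2) change above: passing O_CLOEXEC as the flags argument makes the duplicated descriptor close-on-exec, so it is not leaked to any process the sandbox later execs.

package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	f, err := os.Open("/dev/null")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Duplicate f's descriptor onto fd 100 with close-on-exec set, as the
	// gofer handle code now does when replacing its read handles.
	if err := syscall.Dup3(int(f.Fd()), 100, syscall.O_CLOEXEC); err != nil {
		fmt.Println("dup3:", err)
		return
	}
	defer syscall.Close(100)
	fmt.Println("duplicated", f.Fd(), "-> 100 with O_CLOEXEC")
}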
diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go
index 238f7804c..a3402e343 100644
--- a/pkg/sentry/fs/gofer/inode_state.go
+++ b/pkg/sentry/fs/gofer/inode_state.go
@@ -123,7 +123,6 @@ func (i *inodeFileState) afterLoad() {
// beforeSave.
return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))
}
- // TODO(b/38173783): Context is not plumbed to save/restore.
ctx := &dummyClockContext{context.Background()}
_, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name))
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 111da59f9..2d398b753 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -104,7 +104,6 @@ func (s *session) afterLoad() {
// If private unix sockets are enabled, create and fill the session's endpoint
// maps.
if opts.privateunixsocket {
- // TODO(b/38173783): Context is not plumbed to save/restore.
ctx := &dummyClockContext{context.Background()}
if err = s.restoreEndpointMaps(ctx); err != nil {
diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go
index 2d8d3a2ea..47a6c69bf 100644
--- a/pkg/sentry/fs/gofer/util.go
+++ b/pkg/sentry/fs/gofer/util.go
@@ -20,17 +20,29 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
)
func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error {
if ts.ATimeOmit && ts.MTimeOmit {
return nil
}
+
+ // Replace requests to use the "system time" with the current time to
+ // ensure that timestamps remain consistent with the remote
+ // filesystem.
+ now := ktime.NowFromContext(ctx)
+ if ts.ATimeSetSystemTime {
+ ts.ATime = now
+ }
+ if ts.MTimeSetSystemTime {
+ ts.MTime = now
+ }
mask := p9.SetAttrMask{
ATime: !ts.ATimeOmit,
- ATimeNotSystemTime: !ts.ATimeSetSystemTime,
+ ATimeNotSystemTime: true,
MTime: !ts.MTimeOmit,
- MTimeNotSystemTime: !ts.MTimeSetSystemTime,
+ MTimeNotSystemTime: true,
}
as, ans := ts.ATime.Unix()
ms, mns := ts.MTime.Unix()
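
The intent of the utimes change, as a simplified sketch (the types are stand-ins, not the fs API): "set to the system time" requests are resolved in the sentry from the context clock, so the value sent to the remote filesystem is always an explicit timestamp and can never drift from the sandbox's own clock.

package main

import (
	"fmt"
	"time"
)

// timeSpec is a stand-in for fs.TimeSpec in this sketch.
type timeSpec struct {
	atime, mtime       time.Time
	atimeNow, mtimeNow bool // "set to the system time" requests
}

// resolve replaces "system time" requests with an explicit timestamp taken
// from the caller's clock before anything is sent to the remote side.
func resolve(ts timeSpec, now time.Time) timeSpec {
	if ts.atimeNow {
		ts.atime = now
	}
	if ts.mtimeNow {
		ts.mtime = now
	}
	return ts
}

func main() {
	ts := resolve(timeSpec{atimeNow: true}, time.Now())
	fmt.Println("explicit atime to send:", ts.atime.UnixNano())
}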
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 1da3c0a17..62f1246aa 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -397,15 +397,12 @@ func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
}
// AddLink implements fs.InodeOperations.AddLink.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
func (i *inodeOperations) AddLink() {}
// DropLink implements fs.InodeOperations.DropLink.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
func (i *inodeOperations) DropLink() {}
// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
// readdirAll returns all of the directory entries in i.
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index eb4afe520..affdbcacb 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -199,14 +199,14 @@ func TestListen(t *testing.T) {
}
func TestPasscred(t *testing.T) {
- e := ConnectedEndpoint{}
+ e := &ConnectedEndpoint{}
if got, want := e.Passcred(), false; got != want {
t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want)
}
}
func TestGetLocalAddress(t *testing.T) {
- e := ConnectedEndpoint{path: "foo"}
+ e := &ConnectedEndpoint{path: "foo"}
want := tcpip.FullAddress{Addr: tcpip.Address("foo")}
if got, err := e.GetLocalAddress(); err != nil || got != want {
t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil)
@@ -214,7 +214,7 @@ func TestGetLocalAddress(t *testing.T) {
}
func TestQueuedSize(t *testing.T) {
- e := ConnectedEndpoint{}
+ e := &ConnectedEndpoint{}
tests := []struct {
name string
f func() int64
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 55fb71c16..a34fbc946 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -102,7 +102,6 @@ func (i *Inode) DecRef() {
// destroy releases the Inode and releases the msrc reference taken.
func (i *Inode) destroy() {
- // FIXME(b/38173783): Context is not plumbed here.
ctx := context.Background()
if err := i.WriteOut(ctx); err != nil {
// FIXME(b/65209558): Mark as warning again once noatime is
@@ -397,8 +396,6 @@ func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) {
// AddLink calls i.InodeOperations.AddLink.
func (i *Inode) AddLink() {
if i.overlay != nil {
- // FIXME(b/63117438): Remove this from InodeOperations altogether.
- //
// This interface is only used by ramfs to update metadata of
// children. These filesystems should _never_ have overlay
// Inodes cached as children. So explicitly disallow this
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 5ada33a32..537c8d257 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -231,7 +231,8 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
upperFile.Dirent.Inode.IncRef()
entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false)
if err != nil {
- cleanupUpper(ctx, o.upper, name)
+ werr := fmt.Errorf("newOverlayEntry failed: %v", err)
+ cleanupUpper(ctx, o.upper, name, werr)
return nil, err
}
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index 94deb553b..1fc9c703c 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -170,7 +170,8 @@ func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
// NOTE(b/147673608): If the mount is a cgroup, we also need to include
// the cgroup name in the options. For now we just read that from the
// path.
- // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+ //
+ // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
// should get this value from the cgroup itself, and not rely on the
// path.
if msrc.FilesystemType == "cgroup" {
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index d4c4b533d..702fdd392 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -80,7 +80,7 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir
}
// Truncate implements fs.InodeOperations.Truncate.
-func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
+func (*tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
return nil
}
@@ -196,7 +196,7 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f
}
// Truncate implements fs.InodeOperations.Truncate.
-func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
+func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
return nil
}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index d6c5dd2c1..4d42eac83 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -57,6 +57,16 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) {
return m, nil
}
+func checkTaskState(t *kernel.Task) error {
+ switch t.ExitState() {
+ case kernel.TaskExitZombie:
+ return syserror.EACCES
+ case kernel.TaskExitDead:
+ return syserror.ESRCH
+ }
+ return nil
+}
+
// taskDir represents a task-level directory.
//
// +stateify savable
@@ -254,11 +264,12 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
}
func (e *exe) executable() (file fsbridge.File, err error) {
+ if err := checkTaskState(e.t); err != nil {
+ return nil, err
+ }
e.t.WithMuLocked(func(t *kernel.Task) {
mm := t.MemoryManager()
if mm == nil {
- // TODO(b/34851096): Check shouldn't allow Readlink once the
- // Task is zombied.
err = syserror.EACCES
return
}
@@ -268,7 +279,7 @@ func (e *exe) executable() (file fsbridge.File, err error) {
// (with locks held).
file = mm.Executable()
if file == nil {
- err = syserror.ENOENT
+ err = syserror.ESRCH
}
})
return
@@ -313,11 +324,22 @@ func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.
return newProcInode(t, n, msrc, fs.Symlink, t)
}
+// Readlink reads the symlink value.
+func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+ if err := checkTaskState(n.t); err != nil {
+ return "", err
+ }
+ return n.Symlink.Readlink(ctx, inode)
+}
+
// Getlink implements fs.InodeOperations.Getlink.
func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) {
if !kernel.ContextCanTrace(ctx, n.t, false) {
return nil, syserror.EACCES
}
+ if err := checkTaskState(n.t); err != nil {
+ return nil, err
+ }
// Create a new regular file to fake the namespace file.
iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC)
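
The exit-state checks added above, restated as a tiny standalone sketch (the error values are placeholders): a zombie task still exists but its address space is gone, so proc reports EACCES; a fully exited task is reported as a missing process with ESRCH, which is also what exe's "no executable" case now returns instead of ENOENT.

package main

import (
	"errors"
	"fmt"
)

type exitState int

const (
	taskAlive exitState = iota
	taskZombie
	taskDead
)

var (
	errEACCES = errors.New("EACCES")
	errESRCH  = errors.New("ESRCH")
)

// checkTaskState mirrors the helper added to fs/proc/task.go.
func checkTaskState(s exitState) error {
	switch s {
	case taskZombie:
		return errEACCES
	case taskDead:
		return errESRCH
	}
	return nil
}

func main() {
	for _, s := range []exitState{taskAlive, taskZombie, taskDead} {
		fmt.Printf("state %d: %v\n", s, checkTaskState(s))
	}
}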
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index d5be56c3f..bc117ca6a 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -44,9 +44,6 @@ const (
// lookup.
cacheRevalidate = "revalidate"
- // TODO(edahlgren/mpratt): support a tmpfs size limit.
- // size = "size"
-
// Permissions that exceed modeMask will be rejected.
modeMask = 01777
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index 79b808359..89168220a 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -26,22 +26,22 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// fsFile implements File interface over vfs.FileDescription.
+// VFSFile implements File interface over vfs.FileDescription.
//
// +stateify savable
-type vfsFile struct {
+type VFSFile struct {
file *vfs.FileDescription
}
-var _ File = (*vfsFile)(nil)
+var _ File = (*VFSFile)(nil)
// NewVFSFile creates a new File over fs.File.
func NewVFSFile(file *vfs.FileDescription) File {
- return &vfsFile{file: file}
+ return &VFSFile{file: file}
}
// PathnameWithDeleted implements File.
-func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
+func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string {
root := vfs.RootFromContext(ctx)
defer root.DecRef()
@@ -51,7 +51,7 @@ func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
}
// ReadFull implements File.
-func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+func (f *VFSFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
var total int64
for dst.NumBytes() > 0 {
n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{})
@@ -67,12 +67,12 @@ func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset i
}
// ConfigureMMap implements File.
-func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+func (f *VFSFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
return f.file.ConfigureMMap(ctx, opts)
}
// Type implements File.
-func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
+func (f *VFSFile) Type(ctx context.Context) (linux.FileMode, error) {
stat, err := f.file.Stat(ctx, vfs.StatOptions{})
if err != nil {
return 0, err
@@ -81,15 +81,21 @@ func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
}
// IncRef implements File.
-func (f *vfsFile) IncRef() {
+func (f *VFSFile) IncRef() {
f.file.IncRef()
}
// DecRef implements File.
-func (f *vfsFile) DecRef() {
+func (f *VFSFile) DecRef() {
f.file.DecRef()
}
+// FileDescription returns the FileDescription represented by f. It does not
+// take an additional reference on the returned FileDescription.
+func (f *VFSFile) FileDescription() *vfs.FileDescription {
+ return f.file
+}
+
// fsLookup implements Lookup interface using fs.File.
//
// +stateify savable
@@ -132,5 +138,5 @@ func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.Open
if err != nil {
return nil, err
}
- return &vfsFile{file: fd}, nil
+ return &VFSFile{file: fd}, nil
}
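
Why export vfsFile as VFSFile, shown as a generic sketch (names are illustrative, not the fsbridge API): with the concrete type exported, callers that hold the File interface can type-assert back to *VFSFile and call its accessor to reach the wrapped object, without widening the interface itself.

package main

import "fmt"

// File is a stand-in for the fsbridge.File interface.
type File interface {
	Name() string
}

// VFSFile is a stand-in for the exported wrapper: it implements File and
// additionally exposes the wrapped value through an extra method.
type VFSFile struct {
	name string
}

func (f *VFSFile) Name() string { return f.name }

// Description is the analogue of VFSFile.FileDescription; it returns the
// wrapped object without any extra reference counting.
func (f *VFSFile) Description() string { return "fd for " + f.name }

func main() {
	var f File = &VFSFile{name: "exe"}
	// Cross-package callers can only do this assertion because the type is exported.
	if vf, ok := f.(*VFSFile); ok {
		fmt.Println(vf.Description())
	}
}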
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 48eaccdbc..afea58f65 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -476,7 +476,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
}
// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
_, _, err := fs.walk(rp, false)
if err != nil {
return nil, err
@@ -485,7 +485,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
}
// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
_, _, err := fs.walk(rp, false)
if err != nil {
return "", err
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index d15a36709..99d1e3f8f 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
@@ -54,3 +54,13 @@ go_library(
"//pkg/usermem",
],
)
+
+go_test(
+ name = "gofer_test",
+ srcs = ["gofer_test.go"],
+ library = ":gofer",
+ deps = [
+ "//pkg/p9",
+ "//pkg/sentry/contexttest",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 5dbfc6250..49d9f859b 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -56,14 +56,19 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.mu.Lock()
defer fd.mu.Unlock()
+ d := fd.dentry()
if fd.dirents == nil {
- ds, err := fd.dentry().getDirents(ctx)
+ ds, err := d.getDirents(ctx)
if err != nil {
return err
}
fd.dirents = ds
}
+ if d.fs.opts.interop != InteropModeShared {
+ d.touchAtime(fd.vfsfd.Mount())
+ }
+
for fd.off < int64(len(fd.dirents)) {
if err := cb.Handle(fd.dirents[fd.off]); err != nil {
return err
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 269624362..cd744bf5e 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -356,7 +356,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if err := create(parent, name); err != nil {
return err
}
- parent.touchCMtime(ctx)
+ if fs.opts.interop != InteropModeShared {
+ parent.touchCMtime()
+ }
delete(parent.negativeChildren, name)
parent.dirents = nil
return nil
@@ -435,14 +437,19 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
flags := uint32(0)
if dir {
if child != nil && !child.isDir() {
+ vfsObj.AbortDeleteDentry(childVFSD)
return syserror.ENOTDIR
}
flags = linux.AT_REMOVEDIR
} else {
if child != nil && child.isDir() {
+ vfsObj.AbortDeleteDentry(childVFSD)
return syserror.EISDIR
}
if rp.MustBeDir() {
+ if childVFSD != nil {
+ vfsObj.AbortDeleteDentry(childVFSD)
+ }
return syserror.ENOTDIR
}
}
@@ -454,7 +461,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
return err
}
if fs.opts.interop != InteropModeShared {
- parent.touchCMtime(ctx)
+ parent.touchCMtime()
if dir {
parent.decLinks()
}
@@ -802,7 +809,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
d.IncRef() // reference held by child on its parent d
d.vfsd.InsertChild(&child.vfsd, name)
if d.fs.opts.interop != InteropModeShared {
- d.touchCMtime(ctx)
delete(d.negativeChildren, name)
d.dirents = nil
}
@@ -834,6 +840,9 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
childVFSFD = &fd.vfsfd
}
+ if d.fs.opts.interop != InteropModeShared {
+ d.touchCMtime()
+ }
return childVFSFD, nil
}
@@ -975,6 +984,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
oldParent.decLinks()
newParent.incLinks()
}
+ oldParent.touchCMtime()
+ newParent.touchCMtime()
+ renamed.touchCtime()
}
vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
return nil
@@ -1068,7 +1080,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
}
// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1076,11 +1088,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
if err != nil {
return nil, err
}
- return d.listxattr(ctx)
+ return d.listxattr(ctx, rp.Credentials(), size)
}
// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1088,7 +1100,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam
if err != nil {
return "", err
}
- return d.getxattr(ctx, name)
+ return d.getxattr(ctx, rp.Credentials(), &opts)
}
// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
@@ -1100,7 +1112,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
if err != nil {
return err
}
- return d.setxattr(ctx, &opts)
+ return d.setxattr(ctx, rp.Credentials(), &opts)
}
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -1112,7 +1124,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
if err != nil {
return err
}
- return d.removexattr(ctx, name)
+ return d.removexattr(ctx, rp.Credentials(), name)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 8e41b6b1c..2485cdb53 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -34,6 +34,7 @@ package gofer
import (
"fmt"
"strconv"
+ "strings"
"sync"
"sync/atomic"
"syscall"
@@ -44,6 +45,7 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -72,6 +74,9 @@ type filesystem struct {
// client is the client used by this filesystem. client is immutable.
client *p9.Client
+ // clock is a realtime clock used to set timestamps in file operations.
+ clock ktime.Clock
+
// uid and gid are the effective KUID and KGID of the filesystem's creator,
// and are used as the owner and group for files that don't specify one.
// uid and gid are immutable.
@@ -376,6 +381,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
uid: creds.EffectiveKUID,
gid: creds.EffectiveKGID,
client: client,
+ clock: ktime.RealtimeClockFromContext(ctx),
dentries: make(map[*dentry]struct{}),
specialFileFDs: make(map[*specialFileFD]struct{}),
}
@@ -439,7 +445,8 @@ type dentry struct {
// refs is the reference count. Each dentry holds a reference on its
// parent, even if disowned. refs is accessed using atomic memory
- // operations.
+ // operations. When refs reaches 0, the dentry may be added to the cache or
+ // destroyed. If refs==-1 the dentry has already been destroyed.
refs int64
// fs is the owning filesystem. fs is immutable.
@@ -779,10 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
// data, so there's no cache to truncate either.)
return nil
}
- now, haveNow := nowFromContext(ctx)
- if !haveNow {
- ctx.Warningf("gofer.dentry.setStat: current time not available")
- }
+ now := d.fs.clock.Now().Nanoseconds()
if stat.Mask&linux.STATX_MODE != 0 {
atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
}
@@ -794,25 +798,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
}
if setLocalAtime {
if stat.Atime.Nsec == linux.UTIME_NOW {
- if haveNow {
- atomic.StoreInt64(&d.atime, now)
- }
+ atomic.StoreInt64(&d.atime, now)
} else {
atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
}
}
if setLocalMtime {
if stat.Mtime.Nsec == linux.UTIME_NOW {
- if haveNow {
- atomic.StoreInt64(&d.mtime, now)
- }
+ atomic.StoreInt64(&d.mtime, now)
} else {
atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
}
}
- if haveNow {
- atomic.StoreInt64(&d.ctime, now)
- }
+ atomic.StoreInt64(&d.ctime, now)
if stat.Mask&linux.STATX_SIZE != 0 {
d.dataMu.Lock()
oldSize := d.size
@@ -864,7 +862,7 @@ func (d *dentry) IncRef() {
func (d *dentry) TryIncRef() bool {
for {
refs := atomic.LoadInt64(&d.refs)
- if refs == 0 {
+ if refs <= 0 {
return false
}
if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
@@ -887,13 +885,20 @@ func (d *dentry) DecRef() {
// checkCachingLocked should be called after d's reference count becomes 0 or it
// becomes disowned.
//
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
// Preconditions: d.fs.renameMu must be locked for writing.
func (d *dentry) checkCachingLocked() {
// Dentries with a non-zero reference count must be retained. (The only way
// to obtain a reference on a dentry with zero references is via path
// resolution, which requires renameMu, so if d.refs is zero then it will
// remain zero while we hold renameMu for writing.)
- if atomic.LoadInt64(&d.refs) != 0 {
+ refs := atomic.LoadInt64(&d.refs)
+ if refs > 0 {
if d.cached {
d.fs.cachedDentries.Remove(d)
d.fs.cachedDentriesLen--
@@ -901,6 +906,10 @@ func (d *dentry) checkCachingLocked() {
}
return
}
+ if refs == -1 {
+ // Dentry has already been destroyed.
+ return
+ }
// Non-child dentries with zero references are no longer reachable by path
// resolution and should be dropped immediately.
if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
@@ -953,9 +962,22 @@ func (d *dentry) checkCachingLocked() {
}
}
+// destroyLocked destroys the dentry. It may flush dirty pages from the
+// cache, close the p9 file, and drop its reference on the parent dentry.
+//
// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
// not a child dentry.
func (d *dentry) destroyLocked() {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("dentry.destroyLocked() called with references on the dentry")
+ }
+
ctx := context.Background()
d.handleMu.Lock()
if !d.handle.file.isNil() {
@@ -975,7 +997,10 @@ func (d *dentry) destroyLocked() {
d.handle.close(ctx)
}
d.handleMu.Unlock()
- d.file.close(ctx)
+ if !d.file.isNil() {
+ d.file.close(ctx)
+ d.file = p9file{}
+ }
// Remove d from the set of all dentries.
d.fs.syncMu.Lock()
delete(d.fs.dentries, d)
@@ -1000,21 +1025,50 @@ func (d *dentry) setDeleted() {
atomic.StoreUint32(&d.deleted, 1)
}
-func (d *dentry) listxattr(ctx context.Context) ([]string, error) {
- return nil, syserror.ENOTSUP
+// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+// there is no need to expose any other xattrs through a gofer.
+func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+ xattrMap, err := d.file.listXattr(ctx, size)
+ if err != nil {
+ return nil, err
+ }
+ xattrs := make([]string, 0, len(xattrMap))
+ for x := range xattrMap {
+ if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+ xattrs = append(xattrs, x)
+ }
+ }
+ return xattrs, nil
}
-func (d *dentry) getxattr(ctx context.Context, name string) (string, error) {
- // TODO(jamieliu): add vfs.GetxattrOptions.Size
- return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX)
+func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+ if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+ return "", err
+ }
+ if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+ return "", syserror.EOPNOTSUPP
+ }
+ return d.file.getXattr(ctx, opts.Name, opts.Size)
}
-func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+ if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}
-func (d *dentry) removexattr(ctx context.Context, name string) error {
- return syserror.ENOTSUP
+func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+ if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ return d.file.removeXattr(ctx, name)
}
// Preconditions: d.isRegularFile() || d.isDirectory().
@@ -1065,7 +1119,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
// description, but this doesn't matter since they refer to the
// same file (unless d.fs.opts.overlayfsStaleRead is true,
// which we handle separately).
- if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil {
+ if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
d.handleMu.Unlock()
ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
h.close(ctx)
@@ -1165,21 +1219,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
}
// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) {
- return fd.dentry().listxattr(ctx)
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
}
// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) {
- return fd.dentry().getxattr(ctx, name)
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+ return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
}
// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
- return fd.dentry().setxattr(ctx, &opts)
+ return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
}
// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
- return fd.dentry().removexattr(ctx, name)
+ return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
}
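
The destroy/caching interaction above, reduced to a sketch (a hypothetical node type, not the gofer dentry): a reference count of 0 means "eligible for destruction", and destruction stores -1 as a sentinel so that later cache checks on the same object become no-ops instead of destroying it twice.

package main

import (
	"fmt"
	"sync/atomic"
)

type node struct {
	refs int64 // >0 live, 0 destroyable, -1 already destroyed
}

// checkCaching is the idempotent cleanup: it destroys the node the first
// time it sees refs == 0 and does nothing on later calls.
func (n *node) checkCaching() {
	switch atomic.LoadInt64(&n.refs) {
	case 0:
		atomic.StoreInt64(&n.refs, -1) // mark destroyed exactly once
		fmt.Println("destroyed")
	case -1:
		fmt.Println("already destroyed; no-op")
	default:
		fmt.Println("still referenced; kept")
	}
}

func main() {
	n := &node{}
	n.checkCaching()
	n.checkCaching() // safe to call again, as in TestDestroyIdempotent
}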
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
new file mode 100644
index 000000000..82bc239db
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "sync/atomic"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
+)
+
+func TestDestroyIdempotent(t *testing.T) {
+ fs := filesystem{
+ dentries: make(map[*dentry]struct{}),
+ opts: filesystemOptions{
+ // Test relies on no dentry being held in the cache.
+ maxCachedDentries: 0,
+ },
+ }
+
+ ctx := contexttest.Context(t)
+ attr := &p9.Attr{
+ Mode: p9.ModeRegular,
+ }
+ mask := p9.AttrMask{
+ Mode: true,
+ Size: true,
+ }
+ parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+ if err != nil {
+ t.Fatalf("fs.newDentry(): %v", err)
+ }
+
+ child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+ if err != nil {
+ t.Fatalf("fs.newDentry(): %v", err)
+ }
+ parent.IncRef() // reference held by child on its parent.
+ parent.vfsd.InsertChild(&child.vfsd, "child")
+
+ child.checkCachingLocked()
+ if got := atomic.LoadInt64(&child.refs); got != -1 {
+ t.Fatalf("child.refs=%d, want: -1", got)
+ }
+ // Parent will also be destroyed when child reference is removed.
+ if got := atomic.LoadInt64(&parent.refs); got != -1 {
+ t.Fatalf("parent.refs=%d, want: -1", got)
+ }
+ child.checkCachingLocked()
+ child.checkCachingLocked()
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 755ac2985..87f0b877f 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt
return err
}
+func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+ ctx.UninterruptibleSleepStart(false)
+ xattrs, err := f.file.ListXattr(size)
+ ctx.UninterruptibleSleepFinish(false)
+ return xattrs, err
+}
+
func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
ctx.UninterruptibleSleepStart(false)
val, err := f.file.GetXattr(name, size)
@@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32)
return err
}
+func (f p9file) removeXattr(ctx context.Context, name string) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.RemoveXattr(name)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
ctx.UninterruptibleSleepStart(false)
err := f.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 3593eb1d5..857f7c74e 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
putDentryReadWriter(rw)
if d.fs.opts.interop != InteropModeShared {
// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
- d.touchAtime(ctx, fd.vfsfd.Mount())
+ d.touchAtime(fd.vfsfd.Mount())
}
return n, err
}
@@ -139,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
// file_update_time(). This is d.touchCMtime(), but without locking
// d.metadataMu (recursively).
- if now, ok := nowFromContext(ctx); ok {
- atomic.StoreInt64(&d.mtime, now)
- atomic.StoreInt64(&d.ctime, now)
- }
+ d.touchCMtimeLocked()
}
if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
// Write dirty cached pages that will be touched by the write back to
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 274f7346f..507e0e276 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -76,7 +76,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
// hold here since specialFileFD doesn't client-cache data. Just buffer the
// read instead.
if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
- d.touchAtime(ctx, fd.vfsfd.Mount())
+ d.touchAtime(fd.vfsfd.Mount())
}
buf := make([]byte, dst.NumBytes())
n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
@@ -117,7 +117,7 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
// Do a buffered write. See rationale in PRead.
if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
- d.touchCMtime(ctx)
+ d.touchCMtime()
}
buf := make([]byte, src.NumBytes())
// Don't do partial writes if we get a partial read from src.
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
index adf43be60..2ec819f86 100644
--- a/pkg/sentry/fsimpl/gofer/symlink.go
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool {
// Precondition: d.isSymlink().
func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
if d.fs.opts.interop != InteropModeShared {
- d.touchAtime(ctx, mnt)
+ d.touchAtime(mnt)
d.dataMu.Lock()
if d.haveTarget {
target := d.target
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 7598ec6a8..2608e7e1d 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -18,8 +18,6 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -38,23 +36,12 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
}
}
-func nowFromContext(ctx context.Context) (int64, bool) {
- if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
- return clock.Now().Nanoseconds(), true
- }
- return 0, false
-}
-
// Preconditions: fs.interop != InteropModeShared.
-func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
+func (d *dentry) touchAtime(mnt *vfs.Mount) {
if err := mnt.CheckBeginWrite(); err != nil {
return
}
- now, ok := nowFromContext(ctx)
- if !ok {
- mnt.EndWrite()
- return
- }
+ now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.atime, now)
d.metadataMu.Unlock()
@@ -63,13 +50,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
// Preconditions: fs.interop != InteropModeShared. The caller has successfully
// called vfs.Mount.CheckBeginWrite().
-func (d *dentry) touchCMtime(ctx context.Context) {
- now, ok := nowFromContext(ctx)
- if !ok {
- return
- }
+func (d *dentry) touchCtime() {
+ now := d.fs.clock.Now().Nanoseconds()
+ d.metadataMu.Lock()
+ atomic.StoreInt64(&d.ctime, now)
+ d.metadataMu.Unlock()
+}
+
+// Preconditions: fs.interop != InteropModeShared. The caller has successfully
+// called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime() {
+ now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
d.metadataMu.Unlock()
}
+
+func (d *dentry) touchCMtimeLocked() {
+ now := d.fs.clock.Now().Nanoseconds()
+ atomic.StoreInt64(&d.mtime, now)
+ atomic.StoreInt64(&d.ctime, now)
+}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 7d9dcd4c9..97fa7f7ab 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -74,31 +74,34 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err
}
// Retrieve metadata.
- var s syscall.Stat_t
- if err := syscall.Fstat(hostFD, &s); err != nil {
+ var s unix.Stat_t
+ if err := unix.Fstat(hostFD, &s); err != nil {
return nil, err
}
fileMode := linux.FileMode(s.Mode)
fileType := fileMode.FileType()
- // Pipes, character devices, and sockets.
- isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+
+ // Determine if hostFD is seekable. If not, this syscall will return ESPIPE
+ // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
+ // devices.
+ _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+ seekable := err != syserror.ESPIPE
i := &inode{
hostFD: hostFD,
- isStream: isStream,
+ seekable: seekable,
isTTY: isTTY,
canMap: canMap(uint32(fileType)),
ino: fs.NextIno(),
mode: fileMode,
- // For simplicity, set offset to 0. Technically, we should
- // only set to 0 on files that are not seekable (sockets, pipes, etc.),
- // and use the offset from the host fd otherwise.
+ // For simplicity, set offset to 0. Technically, we should use the existing
+ // offset on the host if the file is seekable.
offset: 0,
}
- // These files can't be memory mapped, assert this.
- if i.isStream && i.canMap {
+ // Non-seekable files can't be memory mapped, assert this.
+ if !i.seekable && i.canMap {
panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
}
@@ -124,12 +127,12 @@ type inode struct {
// This field is initialized at creation time and is immutable.
hostFD int
- // isStream is true if the host fd points to a file representing a stream,
+ // seekable is false if the host fd points to a file representing a stream,
// e.g. a socket or a pipe. Such files are not seekable and can return
// EWOULDBLOCK for I/O operations.
//
// This field is initialized at creation time and is immutable.
- isStream bool
+ seekable bool
// isTTY is true if this file represents a TTY.
//
@@ -481,8 +484,7 @@ func (f *fileDescription) Release() {
// PRead implements FileDescriptionImpl.
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
i := f.inode
- // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
- if i.isStream {
+ if !i.seekable {
return 0, syserror.ESPIPE
}
@@ -492,8 +494,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
// Read implements FileDescriptionImpl.
func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
i := f.inode
- // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
- if i.isStream {
+ if !i.seekable {
n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
if isBlockError(err) {
// If we got any data at all, return it as a "completed" partial read
@@ -538,8 +539,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
// PWrite implements FileDescriptionImpl.
func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
i := f.inode
- // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
- if i.isStream {
+ if !i.seekable {
return 0, syserror.ESPIPE
}
@@ -549,8 +549,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
// Write implements FileDescriptionImpl.
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
i := f.inode
- // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
- if i.isStream {
+ if !i.seekable {
n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
if isBlockError(err) {
err = syserror.ErrWouldBlock
@@ -593,8 +592,7 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs
// allow directory fds to be imported at all.
func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
i := f.inode
- // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
- if i.isStream {
+ if !i.seekable {
return 0, syserror.ESPIPE
}
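
The seekability probe above, as a runnable sketch: lseek(fd, 0, SEEK_CUR) fails with ESPIPE exactly on non-seekable files (pipes, sockets, some character devices), which is a more direct test than inspecting the file type.

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// isSeekable mirrors the probe in ImportFD: a zero-offset lseek on a
// non-seekable file returns ESPIPE.
func isSeekable(fd int) bool {
	_, err := unix.Seek(fd, 0, unix.SEEK_CUR)
	return err != unix.ESPIPE
}

func main() {
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}
	defer r.Close()
	defer w.Close()
	fmt.Println("pipe seekable:", isSeekable(int(r.Fd()))) // false

	f, err := os.Open("/dev/null")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	fmt.Println("/dev/null seekable:", isSeekable(int(f.Fd()))) // true
}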
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index a429fa23d..baf81b4db 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -63,6 +63,9 @@ afterSymlink:
rp.Advance()
return nextVFSD, nil
}
+ if len(name) > linux.NAME_MAX {
+ return nil, syserror.ENAMETOOLONG
+ }
d.dirMu.Lock()
nextVFSD, err := rp.ResolveChild(vfsd, name)
if err != nil {
@@ -76,16 +79,22 @@ afterSymlink:
}
// Resolve any symlink at current path component.
if rp.ShouldFollowSymlink() && next.isSymlink() {
- // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
- target, err := next.inode.Readlink(ctx)
+ targetVD, targetPathname, err := next.inode.Getlink(ctx)
if err != nil {
return nil, err
}
- if err := rp.HandleSymlink(target); err != nil {
- return nil, err
+ if targetVD.Ok() {
+ err := rp.HandleJump(targetVD)
+ targetVD.DecRef()
+ if err != nil {
+ return nil, err
+ }
+ } else {
+ if err := rp.HandleSymlink(targetPathname); err != nil {
+ return nil, err
+ }
}
goto afterSymlink
-
}
rp.Advance()
return &next.vfsd, nil
@@ -191,6 +200,9 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
if pc == "." || pc == ".." {
return "", syserror.EEXIST
}
+ if len(pc) > linux.NAME_MAX {
+ return "", syserror.ENAMETOOLONG
+ }
childVFSD, err := rp.ResolveChild(parentVFSD, pc)
if err != nil {
return "", err
@@ -433,6 +445,9 @@ afterTrailingSymlink:
if pc == "." || pc == ".." {
return nil, syserror.EISDIR
}
+ if len(pc) > linux.NAME_MAX {
+ return nil, syserror.ENAMETOOLONG
+ }
// Determine whether or not we need to create a file.
childVFSD, err := rp.ResolveChild(parentVFSD, pc)
if err != nil {
@@ -461,19 +476,25 @@ afterTrailingSymlink:
}
childDentry := childVFSD.Impl().(*Dentry)
childInode := childDentry.inode
- if rp.ShouldFollowSymlink() {
- if childDentry.isSymlink() {
- target, err := childInode.Readlink(ctx)
+ if rp.ShouldFollowSymlink() && childDentry.isSymlink() {
+ targetVD, targetPathname, err := childInode.Getlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if targetVD.Ok() {
+ err := rp.HandleJump(targetVD)
+ targetVD.DecRef()
if err != nil {
return nil, err
}
- if err := rp.HandleSymlink(target); err != nil {
+ } else {
+ if err := rp.HandleSymlink(targetPathname); err != nil {
return nil, err
}
- // rp.Final() may no longer be true since we now need to resolve the
- // symlink target.
- goto afterTrailingSymlink
}
+ // rp.Final() may no longer be true since we now need to resolve the
+ // symlink target.
+ goto afterTrailingSymlink
}
if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
@@ -661,7 +682,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
if err != nil {
return linux.Statfs{}, err
}
- // TODO: actually implement statfs
+ // TODO(gvisor.dev/issue/1193): actually implement statfs.
return linux.Statfs{}, syserror.ENOSYS
}
@@ -742,7 +763,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
}
// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
@@ -755,7 +776,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
}
// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 5c84b10c9..65f09af5d 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -181,6 +181,11 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) {
return "", syserror.EINVAL
}
+// Getlink implements Inode.Getlink.
+func (InodeNotSymlink) Getlink(context.Context) (vfs.VirtualDentry, string, error) {
+ return vfs.VirtualDentry{}, "", syserror.EINVAL
+}
+
// InodeAttrs partially implements the Inode interface, specifically the
// inodeMetadata sub interface. InodeAttrs provides functionality related to
// inode attributes.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 2cefef020..ad76b9f64 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -414,7 +414,21 @@ type inodeDynamicLookup interface {
}
type inodeSymlink interface {
- // Readlink resolves the target of a symbolic link. If an inode is not a
+ // Readlink returns the target of a symbolic link. If an inode is not a
// symlink, the implementation should return EINVAL.
Readlink(ctx context.Context) (string, error)
+
+ // Getlink returns the target of a symbolic link, as used by path
+ // resolution:
+ //
+ // - If the inode is a "magic link" (a link whose target is most accurately
+ // represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
+ // nil). A reference is taken on the returned VirtualDentry.
+ //
+ // - If the inode is an ordinary symlink, Getlink returns (zero-value
+ // VirtualDentry, symlink target, nil).
+ //
+ // - If the inode is not a symlink, Getlink returns (zero-value
+ // VirtualDentry, "", EINVAL).
+ Getlink(ctx context.Context) (vfs.VirtualDentry, string, error)
}
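
The Getlink contract documented above is what lets "magic links" such as /proc/[pid]/fd/N hand the resolver an already-resolved VirtualDentry instead of a textual target. A standalone sketch of a resolver consuming that contract, with a hypothetical dentry type in place of vfs.VirtualDentry and vfs.ResolvingPath:

package main

import "fmt"

// dentry is a hypothetical stand-in for vfs.VirtualDentry.
type dentry struct{ name string }

// link models a Getlink result: either a direct dentry target
// ("magic link") or a textual symlink target.
type link struct {
	target *dentry // non-nil for magic links
	path   string  // textual target for ordinary symlinks
}

// follow mimics the branching added in the walk: jump directly for magic
// links, otherwise splice the textual target back into path resolution.
func follow(l link) string {
	if l.target != nil {
		// Equivalent of rp.HandleJump(targetVD): resolution continues
		// from the target dentry, with no string parsing involved.
		return "jumped to " + l.target.name
	}
	// Equivalent of rp.HandleSymlink(targetPathname).
	return "re-resolving path " + l.path
}

func main() {
	fmt.Println(follow(link{target: &dentry{name: "anon inode for fd 3"}}))
	fmt.Println(follow(link{path: "../proc/self"}))
}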
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 5918d3309..018aa503c 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -55,6 +55,11 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
return s.target, nil
}
+// Getlink implements Inode.Getlink.
+func (s *StaticSymlink) Getlink(_ context.Context) (vfs.VirtualDentry, string, error) {
+ return vfs.VirtualDentry{}, s.target, nil
+}
+
// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 8156984eb..17c1342b5 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -22,7 +22,6 @@ go_library(
"//pkg/log",
"//pkg/refs",
"//pkg/safemem",
- "//pkg/sentry/fs",
"//pkg/sentry/fsbridge",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/inet",
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index aee2a4392..888afc0fd 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -214,22 +214,6 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
return &ioData{ioUsage: t}
}
-func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
- // Namespace symlinks should contain the namespace name and the inode number
- // for the namespace instance, so for example user:[123456]. We currently fake
- // the inode number by sticking the symlink inode in its place.
- target := fmt.Sprintf("%s:[%d]", ns, ino)
-
- inode := &kernfs.StaticSymlink{}
- // Note: credentials are overridden by taskOwnedInode.
- inode.Init(task.Credentials(), ino, target)
-
- taskInode := &taskOwnedInode{Inode: inode, owner: task}
- d := &kernfs.Dentry{}
- d.Init(taskInode)
- return d
-}
-
// newCgroupData creates inode that shows cgroup information.
// From man 7 cgroups: "For each cgroup hierarchy of which the process is a
// member, there is one entry containing three colon-separated fields:
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 76bfc5307..046265eca 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -30,34 +30,35 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-type fdDir struct {
- inoGen InoGenerator
- task *kernel.Task
-
-	// When produceSymlink is set, dirents produced for the FDs are reported
-	// as symlinks. Otherwise, they are reported as regular files.
- produceSymlink bool
-}
-
-func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) {
- fd, err := strconv.ParseUint(name, 10, 64)
- if err != nil {
- return nil, kernel.FDFlags{}, syserror.ENOENT
- }
-
+func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) {
var (
file *vfs.FileDescription
flags kernel.FDFlags
)
- i.task.WithMuLocked(func(t *kernel.Task) {
- if fdTable := t.FDTable(); fdTable != nil {
- file, flags = fdTable.GetVFS2(int32(fd))
+ t.WithMuLocked(func(t *kernel.Task) {
+ if fdt := t.FDTable(); fdt != nil {
+ file, flags = fdt.GetVFS2(fd)
}
})
+ return file, flags
+}
+
+func taskFDExists(t *kernel.Task, fd int32) bool {
+ file, _ := getTaskFD(t, fd)
if file == nil {
- return nil, kernel.FDFlags{}, syserror.ENOENT
+ return false
}
- return file, flags, nil
+ file.DecRef()
+ return true
+}
+
+type fdDir struct {
+ inoGen InoGenerator
+ task *kernel.Task
+
+	// When produceSymlink is set, dirents produced for the FDs are reported
+	// as symlinks. Otherwise, they are reported as regular files.
+ produceSymlink bool
}
// IterDirents implements kernfs.inodeDynamicLookup.
@@ -128,11 +129,15 @@ func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
// Lookup implements kernfs.inodeDynamicLookup.
func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
- file, _, err := i.lookup(name)
+ fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
- return nil, err
+ return nil, syserror.ENOENT
+ }
+ fd := int32(fdInt)
+ if !taskFDExists(i.task, fd) {
+ return nil, syserror.ENOENT
}
- taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno())
+ taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno())
return taskDentry.VFSDentry(), nil
}
@@ -169,19 +174,22 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
//
// +stateify savable
type fdSymlink struct {
- refs.AtomicRefCount
kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
kernfs.InodeSymlink
- file *vfs.FileDescription
+ task *kernel.Task
+ fd int32
}
var _ kernfs.Inode = (*fdSymlink)(nil)
-func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry {
- file.IncRef()
- inode := &fdSymlink{file: file}
- inode.Init(creds, ino, linux.ModeSymlink|0777)
+func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+ inode := &fdSymlink{
+ task: task,
+ fd: fd,
+ }
+ inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777)
d := &kernfs.Dentry{}
d.Init(inode)
@@ -189,21 +197,25 @@ func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64
}
func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+ file, _ := getTaskFD(s.task, s.fd)
+ if file == nil {
+ return "", syserror.ENOENT
+ }
+ defer file.DecRef()
root := vfs.RootFromContext(ctx)
defer root.DecRef()
-
- vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
- return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry())
-}
-
-func (s *fdSymlink) DecRef() {
- s.AtomicRefCount.DecRefWithDestructor(func() {
- s.Destroy()
- })
+ return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
}
-func (s *fdSymlink) Destroy() {
- s.file.DecRef()
+func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+ file, _ := getTaskFD(s.task, s.fd)
+ if file == nil {
+ return vfs.VirtualDentry{}, "", syserror.ENOENT
+ }
+ defer file.DecRef()
+ vd := file.VirtualDentry()
+ vd.IncRef()
+ return vd, "", nil
}
// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
@@ -238,12 +250,18 @@ func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
// Lookup implements kernfs.inodeDynamicLookup.
func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
- file, flags, err := i.lookup(name)
+ fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
- return nil, err
+ return nil, syserror.ENOENT
+ }
+ fd := int32(fdInt)
+ if !taskFDExists(i.task, fd) {
+ return nil, syserror.ENOENT
+ }
+ data := &fdInfoData{
+ task: i.task,
+ fd: fd,
}
-
- data := &fdInfoData{file: file, flags: flags}
dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data)
return dentry.VFSDentry(), nil
}
@@ -262,26 +280,23 @@ type fdInfoData struct {
kernfs.DynamicBytesFile
refs.AtomicRefCount
- file *vfs.FileDescription
- flags kernel.FDFlags
+ task *kernel.Task
+ fd int32
}
var _ dynamicInode = (*fdInfoData)(nil)
-func (d *fdInfoData) DecRef() {
- d.AtomicRefCount.DecRefWithDestructor(d.destroy)
-}
-
-func (d *fdInfoData) destroy() {
- d.file.DecRef()
-}
-
// Generate implements vfs.DynamicBytesSource.Generate.
func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ file, descriptorFlags := getTaskFD(d.task, d.fd)
+ if file == nil {
+ return syserror.ENOENT
+ }
+ defer file.DecRef()
// TODO(b/121266871): Include pos, locks, and other data. For now we only
// have flags.
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
- flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags()
+ flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags()
fmt.Fprintf(buf, "flags:\t0%o\n", flags)
return nil
}
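
The net effect of this hunk is that /proc/[pid]/fd and /proc/[pid]/fdinfo entries no longer pin a FileDescription reference for their lifetime; they store only (task, fd) and re-resolve the descriptor on each Readlink, Getlink, or Generate call, so a closed descriptor simply yields ENOENT. The pattern reduces to roughly this standalone sketch (hypothetical table and file types):

package main

import (
	"fmt"
	"sync"
	"syscall"
)

// file is a hypothetical refcounted file description.
type file struct{ path string }

// fdTable is a hypothetical per-task descriptor table.
type fdTable struct {
	mu    sync.Mutex
	files map[int32]*file
}

func (t *fdTable) get(fd int32) *file {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.files[fd] // the real code also takes a reference here
}

// fdSymlink stores only the table and fd number; the target is looked up
// lazily, so a closed fd yields ENOENT instead of a stale target.
type fdSymlink struct {
	table *fdTable
	fd    int32
}

func (s *fdSymlink) Readlink() (string, error) {
	f := s.table.get(s.fd)
	if f == nil {
		return "", syscall.ENOENT
	}
	return f.path, nil
}

func main() {
	tbl := &fdTable{files: map[int32]*file{3: {path: "/tmp/log"}}}
	s := &fdSymlink{table: tbl, fd: 3}
	fmt.Println(s.Readlink()) // /tmp/log <nil>
	delete(tbl.files, 3)      // fd closed
	fmt.Println(s.Readlink()) // "" no such file or directory
}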
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 8c743df8d..2c6f8bdfc 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -18,13 +18,10 @@ import (
"bytes"
"fmt"
"io"
- "sort"
- "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
- "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -67,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
return m, nil
}
+func checkTaskState(t *kernel.Task) error {
+ switch t.ExitState() {
+ case kernel.TaskExitZombie:
+ return syserror.EACCES
+ case kernel.TaskExitDead:
+ return syserror.ESRCH
+ }
+ return nil
+}
+
type bufferWriter struct {
buf *bytes.Buffer
}
@@ -613,12 +620,31 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
return exec.PathnameWithDeleted(ctx), nil
}
+// Getlink implements kernfs.Inode.Getlink.
+func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+ if !kernel.ContextCanTrace(ctx, s.task, false) {
+ return vfs.VirtualDentry{}, "", syserror.EACCES
+ }
+
+ exec, err := s.executable()
+ if err != nil {
+ return vfs.VirtualDentry{}, "", err
+ }
+ defer exec.DecRef()
+
+ vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+ vd.IncRef()
+ return vd, "", nil
+}
+
func (s *exeSymlink) executable() (file fsbridge.File, err error) {
+ if err := checkTaskState(s.task); err != nil {
+ return nil, err
+ }
+
s.task.WithMuLocked(func(t *kernel.Task) {
mm := t.MemoryManager()
if mm == nil {
- // TODO(b/34851096): Check shouldn't allow Readlink once the
- // Task is zombied.
err = syserror.EACCES
return
}
@@ -628,57 +654,12 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
// (with locks held).
file = mm.Executable()
if file == nil {
- err = syserror.ENOENT
+ err = syserror.ESRCH
}
})
return
}
-// forEachMountSource runs f for the process root mount and each mount that is
-// a descendant of the root.
-func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
- var fsctx *kernel.FSContext
- t.WithMuLocked(func(t *kernel.Task) {
- fsctx = t.FSContext()
- })
- if fsctx == nil {
- // The task has been destroyed. Nothing to show here.
- return
- }
-
- // All mount points must be relative to the rootDir, and mounts outside
- // will be excluded.
- rootDir := fsctx.RootDirectory()
- if rootDir == nil {
- // The task has been destroyed. Nothing to show here.
- return
- }
- defer rootDir.DecRef()
-
- mnt := t.MountNamespace().FindMount(rootDir)
- if mnt == nil {
- // Has it just been unmounted?
- return
- }
- ms := t.MountNamespace().AllMountsUnder(mnt)
- sort.Slice(ms, func(i, j int) bool {
- return ms[i].ID < ms[j].ID
- })
- for _, m := range ms {
- mroot := m.Root()
- if mroot == nil {
- continue // No longer valid.
- }
- mountPath, desc := mroot.FullName(rootDir)
- mroot.DecRef()
- if !desc {
- // MountSources that are not descendants of the chroot jail are ignored.
- continue
- }
- fn(mountPath, m)
- }
-}
-
// mountInfoData is used to implement /proc/[pid]/mountinfo.
//
// +stateify savable
@@ -692,92 +673,22 @@ var _ dynamicInode = (*mountInfoData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- forEachMount(i.task, func(mountPath string, m *fs.Mount) {
- mroot := m.Root()
- if mroot == nil {
- return // No longer valid.
- }
- defer mroot.DecRef()
-
- // Format:
- // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
- // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
-
- // (1) MountSource ID.
- fmt.Fprintf(buf, "%d ", m.ID)
-
- // (2) Parent ID (or this ID if there is no parent).
- pID := m.ID
- if !m.IsRoot() && !m.IsUndo() {
- pID = m.ParentID
- }
- fmt.Fprintf(buf, "%d ", pID)
-
- // (3) Major:Minor device ID. We don't have a superblock, so we
- // just use the root inode device number.
- sa := mroot.Inode.StableAttr
- fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
-
- // (4) Root: the pathname of the directory in the filesystem
- // which forms the root of this mount.
- //
- // NOTE(b/78135857): This will always be "/" until we implement
- // bind mounts.
- fmt.Fprintf(buf, "/ ")
-
- // (5) Mount point (relative to process root).
- fmt.Fprintf(buf, "%s ", mountPath)
-
- // (6) Mount options.
- flags := mroot.Inode.MountSource.Flags
- opts := "rw"
- if flags.ReadOnly {
- opts = "ro"
- }
- if flags.NoAtime {
- opts += ",noatime"
- }
- if flags.NoExec {
- opts += ",noexec"
- }
- fmt.Fprintf(buf, "%s ", opts)
-
- // (7) Optional fields: zero or more fields of the form "tag[:value]".
- // (8) Separator: the end of the optional fields is marked by a single hyphen.
- fmt.Fprintf(buf, "- ")
-
- // (9) Filesystem type.
- fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType)
-
- // (10) Mount source: filesystem-specific information or "none".
- fmt.Fprintf(buf, "none ")
-
- // (11) Superblock options, and final newline.
- fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource))
+ var fsctx *kernel.FSContext
+ i.task.WithMuLocked(func(t *kernel.Task) {
+ fsctx = t.FSContext()
})
- return nil
-}
-
-func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
- // gVisor doesn't (yet) have a concept of super block options, so we
- // use the ro/rw bit from the mount flag.
- opts := "rw"
- if msrc.Flags.ReadOnly {
- opts = "ro"
+ if fsctx == nil {
+ // The task has been destroyed. Nothing to show here.
+ return nil
}
-
- // NOTE(b/147673608): If the mount is a cgroup, we also need to include
- // the cgroup name in the options. For now we just read that from the
- // path.
- // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
- // should get this value from the cgroup itself, and not rely on the
- // path.
- if msrc.FilesystemType == "cgroup" {
- splitPath := strings.Split(mountPath, "/")
- cgroupType := splitPath[len(splitPath)-1]
- opts += "," + cgroupType
+ rootDir := fsctx.RootDirectoryVFS2()
+ if !rootDir.Ok() {
+ // Root has been destroyed. Don't try to read mounts.
+ return nil
}
- return opts
+ defer rootDir.DecRef()
+ i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
+ return nil
}
// mountsData is used to implement /proc/[pid]/mounts.
@@ -789,33 +700,62 @@ type mountsData struct {
task *kernel.Task
}
-var _ dynamicInode = (*mountInfoData)(nil)
+var _ dynamicInode = (*mountsData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- forEachMount(i.task, func(mountPath string, m *fs.Mount) {
- // Format:
- // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
- //
- // We use the filesystem name as the first field, since there
- // is no real block device we can point to, and we also should
- // not expose anything about the remote filesystem.
- //
- // Only ro/rw option is supported for now.
- //
- // The "needs dump"and fsck flags are always 0, which is allowed.
- root := m.Root()
- if root == nil {
- return // No longer valid.
- }
- defer root.DecRef()
-
- flags := root.Inode.MountSource.Flags
- opts := "rw"
- if flags.ReadOnly {
- opts = "ro"
- }
- fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0)
+ var fsctx *kernel.FSContext
+ i.task.WithMuLocked(func(t *kernel.Task) {
+ fsctx = t.FSContext()
})
+ if fsctx == nil {
+ // The task has been destroyed. Nothing to show here.
+ return nil
+ }
+ rootDir := fsctx.RootDirectoryVFS2()
+ if !rootDir.Ok() {
+ // Root has been destroyed. Don't try to read mounts.
+ return nil
+ }
+ defer rootDir.DecRef()
+ i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
return nil
}
+
+type namespaceSymlink struct {
+ kernfs.StaticSymlink
+
+ task *kernel.Task
+}
+
+func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+ // Namespace symlinks should contain the namespace name and the inode number
+ // for the namespace instance, so for example user:[123456]. We currently fake
+ // the inode number by sticking the symlink inode in its place.
+ target := fmt.Sprintf("%s:[%d]", ns, ino)
+
+ inode := &namespaceSymlink{task: task}
+ // Note: credentials are overridden by taskOwnedInode.
+ inode.Init(task.Credentials(), ino, target)
+
+ taskInode := &taskOwnedInode{Inode: inode, owner: task}
+ d := &kernfs.Dentry{}
+ d.Init(taskInode)
+ return d
+}
+
+// Readlink implements Inode.Readlink.
+func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+ if err := checkTaskState(s.task); err != nil {
+ return "", err
+ }
+ return s.StaticSymlink.Readlink(ctx)
+}
+
+// Getlink implements Inode.Getlink.
+func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+ if err := checkTaskState(s.task); err != nil {
+ return vfs.VirtualDentry{}, "", err
+ }
+ return s.StaticSymlink.Getlink(ctx)
+}
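
checkTaskState gives these proc symlinks Linux-like behavior for exiting tasks: a zombie still owns its entries but refuses access (EACCES), while a fully dead task behaves as if the entries were gone (ESRCH). A worked sketch of that mapping, with a hypothetical exit-state enum standing in for kernel.TaskExitState:

package main

import (
	"fmt"
	"syscall"
)

// exitState is a hypothetical copy of the relevant task exit states.
type exitState int

const (
	exitNone exitState = iota
	exitZombie
	exitDead
)

// checkTaskState mirrors the helper above: zombies yield EACCES, dead
// tasks yield ESRCH, and live tasks pass through.
func checkTaskState(s exitState) error {
	switch s {
	case exitZombie:
		return syscall.EACCES
	case exitDead:
		return syscall.ESRCH
	}
	return nil
}

func main() {
	for _, s := range []exitState{exitNone, exitZombie, exitDead} {
		fmt.Printf("state=%d err=%v\n", s, checkTaskState(s))
	}
}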
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 373a7b17d..6595fcee6 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -24,7 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -32,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/unix"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/usermem"
@@ -206,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil)
func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n")
for _, se := range n.kernel.ListSockets() {
- s := se.Sock.Get()
- if s == nil {
- log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+ s := se.SockVFS2
+ if !s.TryIncRef() {
+ log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
continue
}
- sfile := s.(*fs.File)
- if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+ if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
s.DecRef()
// Not a unix socket.
continue
}
- sops := sfile.FileOperations.(*unix.SocketOperations)
+ sops := s.Impl().(*unix.SocketVFS2)
addr, err := sops.Endpoint().GetLocalAddress()
if err != nil {
- log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+ log.Warningf("Failed to retrieve socket name from %+v: %v", s, err)
addr.Addr = "<unknown>"
}
@@ -234,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
}
}
+ // Get inode number.
+ var ino uint64
+ stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO})
+ if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+ log.Warningf("Failed to retrieve ino for socket file: %v", statErr)
+ } else {
+ ino = stat.Ino
+ }
+
// In the socket entry below, the value for the 'Num' field requires
// some consideration. Linux prints the address to the struct
// unix_sock representing a socket in the kernel, but may redact the
@@ -252,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// the definition of this struct changes over time.
//
// For now, we always redact this pointer.
- fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+ fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
- sfile.ReadRefs()-1, // RefCount, don't count our own ref.
+ s.Refs()-1, // RefCount, don't count our own ref.
0, // Protocol, always 0 for UDS.
sockFlags, // Flags.
sops.Endpoint().Type(), // Type.
sops.State(), // State.
- sfile.InodeID(), // Inode.
+ ino, // Inode.
)
// Path
@@ -341,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
t := kernel.TaskFromContext(ctx)
for _, se := range k.ListSockets() {
- s := se.Sock.Get()
- if s == nil {
- log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+ s := se.SockVFS2
+ if !s.TryIncRef() {
+ log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
continue
}
- sfile := s.(*fs.File)
- sops, ok := sfile.FileOperations.(socket.Socket)
+ sops, ok := s.Impl().(socket.SocketVFS2)
if !ok {
- panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+ panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
}
if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
s.DecRef()
@@ -398,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
// Unimplemented.
fmt.Fprintf(buf, "%08X ", 0)
+ stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
// Field: uid.
- uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
- if err != nil {
- log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+ if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+ log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
fmt.Fprintf(buf, "%5d ", 0)
} else {
creds := auth.CredentialsFromContext(ctx)
- fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+ fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
}
// Field: timeout; number of unanswered 0-window probes.
@@ -413,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
fmt.Fprintf(buf, "%8d ", 0)
// Field: inode.
- fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+ if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+ log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+ fmt.Fprintf(buf, "%8d ", 0)
+ } else {
+ fmt.Fprintf(buf, "%8d ", stat.Ino)
+ }
// Field: refcount. Don't count the ref we obtain while dereferencing
// the weakref to this socket.
- fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+ fmt.Fprintf(buf, "%d ", s.Refs()-1)
// Field: Socket struct address. Redacted due to the same reason as
// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -499,15 +512,14 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
t := kernel.TaskFromContext(ctx)
for _, se := range d.kernel.ListSockets() {
- s := se.Sock.Get()
- if s == nil {
- log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+ s := se.SockVFS2
+ if !s.TryIncRef() {
+ log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
continue
}
- sfile := s.(*fs.File)
- sops, ok := sfile.FileOperations.(socket.Socket)
+ sops, ok := s.Impl().(socket.SocketVFS2)
if !ok {
- panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+ panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
}
if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
s.DecRef()
@@ -551,25 +563,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Field: retrnsmt. Always 0 for UDP.
fmt.Fprintf(buf, "%08X ", 0)
+ stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
// Field: uid.
- uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
- if err != nil {
- log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+ if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+ log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
fmt.Fprintf(buf, "%5d ", 0)
} else {
creds := auth.CredentialsFromContext(ctx)
- fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+ fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
}
// Field: timeout. Always 0 for UDP.
fmt.Fprintf(buf, "%8d ", 0)
// Field: inode.
- fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+ if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+ log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+ fmt.Fprintf(buf, "%8d ", 0)
+ } else {
+ fmt.Fprintf(buf, "%8d ", stat.Ino)
+ }
// Field: ref; reference count on the socket inode. Don't count the ref
// we obtain while dereferencing the weakref to this socket.
- fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+ fmt.Fprintf(buf, "%d ", s.Refs()-1)
// Field: Socket struct address. Redacted due to the same reason as
// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -670,9 +688,9 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if line.prefix == "Tcp" {
tcp := stat.(*inet.StatSNMPTCP)
// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
- fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+ fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
} else {
- fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+ fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
}
}
return nil
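
With VFS2 sockets the generators above can no longer read inode attributes off a Dirent, so they ask the FileDescription for a statx-style result and must check both the error and the returned mask before trusting any field, falling back to 0 otherwise. A standalone sketch of that defensive pattern (trimmed-down, hypothetical statx struct):

package main

import "fmt"

const (
	statxIno uint32 = 0x100 // STATX_INO
	statxUID uint32 = 0x008 // STATX_UID
)

// statx is a hypothetical trimmed-down statx result.
type statx struct {
	Mask uint32
	Ino  uint64
	UID  uint32
}

// inoOrZero returns the inode number only if the stat call succeeded and
// the filesystem actually filled in STATX_INO, matching the checks in the
// generators above.
func inoOrZero(st statx, err error) uint64 {
	if err != nil || st.Mask&statxIno == 0 {
		return 0
	}
	return st.Ino
}

func main() {
	fmt.Println(inoOrZero(statx{Mask: statxIno, Ino: 42}, nil)) // 42
	fmt.Println(inoOrZero(statx{Mask: 0, Ino: 42}, nil))        // 0: mask not set
}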
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 882c1981e..4621e2de0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -63,6 +63,11 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
return strconv.FormatUint(uint64(tgid), 10), nil
}
+func (s *selfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx)
+ return vfs.VirtualDentry{}, target, err
+}
+
// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
@@ -101,6 +106,11 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
return fmt.Sprintf("%d/task/%d", tgid, tid), nil
}
+func (s *threadSelfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx)
+ return vfs.VirtualDentry{}, target, err
+}
+
// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD
new file mode 100644
index 000000000..52084ddb5
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "sockfs",
+ srcs = ["sockfs.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/sentry/fsimpl/kernfs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
new file mode 100644
index 000000000..3f7ad1d65
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sockfs provides a filesystem implementation for anonymous sockets.
+package sockfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// NewFilesystem creates a new sockfs filesystem.
+//
+// Note that there should only ever be one instance of sockfs.Filesystem,
+// backing a global socket mount.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
+ fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{})
+ if err != nil {
+ panic("failed to create sockfs filesystem")
+ }
+ return fs
+}
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ fs := &filesystem{}
+ fs.Init(vfsObj, fsType)
+ return fs.VFSFilesystem(), nil, nil
+}
+
+// Name implements FilesystemType.Name.
+//
+// Note that registering sockfs is unnecessary, except for the fact that it
+// will not show up under /proc/filesystems as a result. This is a very minor
+// discrepancy from Linux.
+func (filesystemType) Name() string {
+ return "sockfs"
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ kernfs.Filesystem
+}
+
+// inode implements kernfs.Inode.
+//
+// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are
+// not included in InodeAttrs) to store the numbers of the appropriate
+// socket device. Override InodeAttrs.Stat() accordingly.
+type inode struct {
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ return nil, syserror.ENXIO
+}
+
+// InitSocket initializes a socket FileDescription, with a corresponding
+// Dentry in mnt.
+//
+// fd should be the FileDescription associated with socketImpl, i.e. its first
+// field. mnt should be the global socket mount, Kernel.socketMount.
+func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error {
+ fsimpl := mnt.Filesystem().Impl()
+ fs := fsimpl.(*kernfs.Filesystem)
+
+ // File mode matches net/socket.c:sock_alloc.
+ filemode := linux.FileMode(linux.S_IFSOCK | 0600)
+ i := &inode{}
+ i.Init(creds, fs.NextIno(), filemode)
+
+ d := &kernfs.Dentry{}
+ d.Init(i)
+
+ opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true}
+ if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil {
+ return err
+ }
+ return nil
+}
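
sockfs mirrors the pattern Linux uses for anonymous socket inodes: a single global, never-mounted filesystem whose only job is to give each socket a dentry with stable attributes (S_IFSOCK | 0600 and a fresh inode number), while Open on the inode itself fails with ENXIO. The shape of that pattern, with hypothetical types standing in for kernfs and vfs:

package main

import (
	"fmt"
	"sync/atomic"
)

// anonFS is a hypothetical stand-in for the global sockfs instance: it
// only allocates inode numbers.
type anonFS struct{ lastIno uint64 }

func (fs *anonFS) nextIno() uint64 { return atomic.AddUint64(&fs.lastIno, 1) }

// socketInode is the metadata-only inode each socket gets; it is never
// opened through the filesystem (Open returns ENXIO in the real code).
type socketInode struct {
	ino  uint64
	mode uint32 // S_IFSOCK | 0600
}

// newSocketInode mirrors the role of InitSocket with the global mount:
// allocate an inode and attach it to the socket's file description.
func newSocketInode(fs *anonFS) *socketInode {
	return &socketInode{ino: fs.nextIno(), mode: 0o140600}
}

func main() {
	fs := &anonFS{}
	a, b := newSocketInode(fs), newSocketInode(fs)
	fmt.Printf("socket inodes: %d, %d (mode %o)\n", a.ino, b.ino, a.mode)
}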
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index e16808c63..0556af877 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -162,6 +162,9 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
// exactly the specified set of expected entries. AssertAllDirentTypes respects
// collector.skipDots, and implicitly checks for "." and ".." accordingly.
func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) {
+ if expected == nil {
+ expected = make(map[string]DirentType)
+ }
// Also implicitly check for "." and "..", if enabled.
if !collector.skipDots {
expected["."] = linux.DT_DIR
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 6ea35affb..4e6cd3491 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -24,6 +24,7 @@ go_library(
"filesystem.go",
"named_pipe.go",
"regular_file.go",
+ "socket_file.go",
"symlink.go",
"tmpfs.go",
],
@@ -50,6 +51,7 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
"//pkg/sentry/vfs/lock",
+ "//pkg/sentry/vfs/memxattr",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 37c75ab64..45712c9b9 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -68,6 +68,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fs.mu.Lock()
defer fs.mu.Unlock()
+ fd.inode().touchAtime(fd.vfsfd.Mount())
+
if fd.off == 0 {
if err := cb.Handle(vfs.Dirent{
Name: ".",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e678ecc37..f4d50d64f 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -46,6 +46,9 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
return nil, err
}
afterSymlink:
+ if len(rp.Component()) > linux.NAME_MAX {
+ return nil, syserror.ENAMETOOLONG
+ }
nextVFSD, err := rp.ResolveComponent(&d.vfsd)
if err != nil {
return nil, err
@@ -57,7 +60,7 @@ afterSymlink:
}
next := nextVFSD.Impl().(*dentry)
if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
- // TODO(gvisor.dev/issues/1197): Symlink traversals updates
+		// TODO(gvisor.dev/issue/1197): Symlink traversals update
// access time.
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
@@ -133,6 +136,9 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
if name == "." || name == ".." {
return syserror.EEXIST
}
+ if len(name) > linux.NAME_MAX {
+ return syserror.ENAMETOOLONG
+ }
// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
// because if the child exists we want to return EEXIST immediately instead
// of attempting symlink/mount traversal.
@@ -153,7 +159,11 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
return err
}
defer mnt.EndWrite()
- return create(parent, name)
+ if err := create(parent, name); err != nil {
+ return err
+ }
+ parent.inode.touchCMtime()
+ return nil
}
// AccessAt implements vfs.Filesystem.Impl.AccessAt.
@@ -251,8 +261,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
case linux.S_IFCHR:
childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
case linux.S_IFSOCK:
- // Not yet supported.
- return syserror.EPERM
+ childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
default:
return syserror.EINVAL
}
@@ -328,7 +337,12 @@ afterTrailingSymlink:
child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
parent.vfsd.InsertChild(&child.vfsd, name)
parent.inode.impl.(*directory).childList.PushBack(child)
- return child.open(ctx, rp, &opts, true)
+ fd, err := child.open(ctx, rp, &opts, true)
+ if err != nil {
+ return nil, err
+ }
+ parent.inode.touchCMtime()
+ return fd, nil
}
if err != nil {
return nil, err
@@ -381,6 +395,8 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags)
case *deviceFile:
return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
+ case *socketFile:
+ return nil, syserror.ENXIO
default:
panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
}
@@ -398,6 +414,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
if !ok {
return "", syserror.EINVAL
}
+ symlink.inode.touchAtime(rp.Mount())
return symlink.target, nil
}
@@ -515,7 +532,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
oldParent.inode.decLinksLocked()
newParent.inode.incLinksLocked()
}
- // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory
+ oldParent.inode.touchCMtime()
+ newParent.inode.touchCMtime()
+ renamed.inode.touchCtime()
+ // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory
// sizes.
vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
return nil
@@ -565,6 +585,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
parent.inode.decLinksLocked() // from child's ".."
child.inode.decLinksLocked()
vfsObj.CommitDeleteDentry(childVFSD)
+ parent.inode.touchCMtime()
return nil
}
@@ -600,7 +621,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
if err != nil {
return linux.Statfs{}, err
}
- // TODO(gvisor.dev/issues/1197): Actually implement statfs.
+ // TODO(gvisor.dev/issue/1197): Actually implement statfs.
return linux.Statfs{}, syserror.ENOSYS
}
@@ -654,62 +675,68 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
parent.inode.impl.(*directory).childList.Remove(child)
child.inode.decLinksLocked()
vfsObj.CommitDeleteDentry(childVFSD)
+ parent.inode.touchCMtime()
return nil
}
// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
-//
-// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
- return nil, syserror.ECONNREFUSED
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ switch impl := d.inode.impl.(type) {
+ case *socketFile:
+ return impl.ep, nil
+ default:
+ return nil, syserror.ECONNREFUSED
+ }
}
// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- _, err := resolveLocked(rp)
+ d, err := resolveLocked(rp)
if err != nil {
return nil, err
}
- // TODO(b/127675828): support extended attributes
- return nil, syserror.ENOTSUP
+ return d.inode.listxattr(size)
}
// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- _, err := resolveLocked(rp)
+ d, err := resolveLocked(rp)
if err != nil {
return "", err
}
- // TODO(b/127675828): support extended attributes
- return "", syserror.ENOTSUP
+ return d.inode.getxattr(rp.Credentials(), &opts)
}
// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
- _, err := resolveLocked(rp)
+ d, err := resolveLocked(rp)
if err != nil {
return err
}
- // TODO(b/127675828): support extended attributes
- return syserror.ENOTSUP
+ return d.inode.setxattr(rp.Credentials(), &opts)
}
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
- _, err := resolveLocked(rp)
+ d, err := resolveLocked(rp)
if err != nil {
return err
}
- // TODO(b/127675828): support extended attributes
- return syserror.ENOTSUP
+ return d.inode.removexattr(rp.Credentials(), name)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 26cd65605..57e5e28ec 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -286,7 +286,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
rw := getRegularFileReadWriter(f, offset)
n, err := dst.CopyOutFrom(ctx, rw)
putRegularFileReadWriter(rw)
- return int64(n), err
+ fd.inode().touchAtime(fd.vfsfd.Mount())
+ return n, err
}
// Read implements vfs.FileDescriptionImpl.Read.
@@ -323,6 +324,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
f.inode.mu.Lock()
rw := getRegularFileReadWriter(f, offset)
n, err := src.CopyInTo(ctx, rw)
+ fd.inode().touchCMtimeLocked()
f.inode.mu.Unlock()
putRegularFileReadWriter(rw)
return n, err
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
new file mode 100644
index 000000000..25c2321af
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -0,0 +1,34 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+// socketFile is a socket (=S_IFSOCK) tmpfs file.
+type socketFile struct {
+ inode inode
+ ep transport.BoundEndpoint
+}
+
+func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+ file := &socketFile{ep: ep}
+ file.inode.init(file, fs, creds, mode)
+ file.inode.nlink = 1 // from parent directory
+ return &file.inode
+}
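
Together with the MknodAt and BoundEndpointAt hunks in filesystem.go above, this gives bind(2) on a tmpfs path its usual shape: mknod stores the bound endpoint in a socket inode, and a later connect(2) retrieves it by switching on the inode implementation, refusing the connection for any other file type. A standalone sketch of that lookup (hypothetical endpoint and inode types):

package main

import (
	"fmt"
	"syscall"
)

// endpoint is a hypothetical stand-in for transport.BoundEndpoint.
type endpoint struct{ addr string }

// Inode implementations, mirroring tmpfs's impl field.
type regularFile struct{}
type socketFile struct{ ep *endpoint }

// boundEndpointAt mirrors BoundEndpointAt: only socket inodes expose an
// endpoint; anything else refuses the connection.
func boundEndpointAt(impl interface{}) (*endpoint, error) {
	switch f := impl.(type) {
	case *socketFile:
		return f.ep, nil
	default:
		return nil, syscall.ECONNREFUSED
	}
}

func main() {
	sock := &socketFile{ep: &endpoint{addr: "/tmp/app.sock"}}
	if ep, err := boundEndpointAt(sock); err == nil {
		fmt.Println("connect to", ep.addr)
	}
	if _, err := boundEndpointAt(&regularFile{}); err != nil {
		fmt.Println("regular file:", err) // connection refused
	}
}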
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index ebe035dee..d4f59ee5b 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -29,7 +29,7 @@ func TestStatAfterCreate(t *testing.T) {
mode := linux.FileMode(0644)
// Run with different file types.
- // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+ // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
for _, typ := range []string{"file", "dir", "pipe"} {
t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
var (
@@ -140,7 +140,7 @@ func TestSetStatAtime(t *testing.T) {
Mask: 0,
Atime: linux.NsecToStatxTimestamp(100),
}}); err != nil {
- t.Errorf("SetStat atime without mask failed: %v")
+ t.Errorf("SetStat atime without mask failed: %v", err)
}
// Atime should be unchanged.
if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
@@ -155,7 +155,7 @@ func TestSetStatAtime(t *testing.T) {
Atime: linux.NsecToStatxTimestamp(100),
}
if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
- t.Errorf("SetStat atime with mask failed: %v")
+ t.Errorf("SetStat atime with mask failed: %v", err)
}
if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
t.Errorf("Stat got error: %v", err)
@@ -169,7 +169,7 @@ func TestSetStat(t *testing.T) {
mode := linux.FileMode(0644)
// Run with different file types.
- // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+ // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
for _, typ := range []string{"file", "dir", "pipe"} {
t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
var (
@@ -205,7 +205,7 @@ func TestSetStat(t *testing.T) {
Mask: 0,
Atime: linux.NsecToStatxTimestamp(100),
}}); err != nil {
- t.Errorf("SetStat atime without mask failed: %v")
+ t.Errorf("SetStat atime without mask failed: %v", err)
}
// Atime should be unchanged.
if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
@@ -220,7 +220,7 @@ func TestSetStat(t *testing.T) {
Atime: linux.NsecToStatxTimestamp(100),
}
if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
- t.Errorf("SetStat atime with mask failed: %v")
+ t.Errorf("SetStat atime with mask failed: %v", err)
}
if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
t.Errorf("Stat got error: %v", err)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index b07b0dbae..9fa8637d5 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -27,6 +27,7 @@ package tmpfs
import (
"fmt"
"math"
+ "strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -37,6 +38,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -68,6 +70,22 @@ func (FilesystemType) Name() string {
return Name
}
+// FilesystemOpts is used to pass configuration data to tmpfs.
+type FilesystemOpts struct {
+ // RootFileType is the FileType of the filesystem root. Valid values
+ // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
+ RootFileType uint16
+
+ // RootSymlinkTarget is the target of the root symlink. Only valid if
+ // RootFileType == S_IFLNK.
+ RootSymlinkTarget string
+
+ // FilesystemType allows setting a different FilesystemType for this
+ // tmpfs filesystem. This allows tmpfs to "impersonate" other
+ // filesystems, like ramdiskfs and cgroupfs.
+ FilesystemType vfs.FilesystemType
+}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -79,9 +97,33 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
memFile: memFileProvider.MemoryFile(),
clock: clock,
}
- fs.vfsfs.Init(vfsObj, &fstype, &fs)
- root := fs.newDentry(fs.newDirectory(creds, 01777))
- return &fs.vfsfs, &root.vfsd, nil
+
+ rootFileType := uint16(linux.S_IFDIR)
+ newFSType := vfs.FilesystemType(&fstype)
+ tmpfsOpts, ok := opts.InternalData.(FilesystemOpts)
+ if ok {
+ if tmpfsOpts.RootFileType != 0 {
+ rootFileType = tmpfsOpts.RootFileType
+ }
+ if tmpfsOpts.FilesystemType != nil {
+ newFSType = tmpfsOpts.FilesystemType
+ }
+ }
+
+ fs.vfsfs.Init(vfsObj, newFSType, &fs)
+
+ var root *inode
+ switch rootFileType {
+ case linux.S_IFREG:
+ root = fs.newRegularFile(creds, 0777)
+ case linux.S_IFLNK:
+ root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)
+ case linux.S_IFDIR:
+ root = fs.newDirectory(creds, 01777)
+ default:
+ return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
+ }
+ return &fs.vfsfs, &fs.newDentry(root).vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release.
@@ -146,6 +188,11 @@ type inode struct {
// filesystem.RmdirAt() drops the reference.
refs int64
+ // xattrs implements extended attributes.
+ //
+ // TODO(b/148380782): Support xattrs other than user.*
+ xattrs memxattr.SimpleExtendedAttributes
+
// Inode metadata. Writing multiple fields atomically requires holding
// mu, otherwise atomic operations can be used.
mu sync.Mutex
@@ -275,7 +322,7 @@ func (i *inode) statTo(stat *linux.Statx) {
stat.Atime = linux.NsecToStatxTimestamp(i.atime)
stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
- // TODO(gvisor.dev/issues/1197): Device number.
+ // TODO(gvisor.dev/issue/1197): Device number.
switch impl := i.impl.(type) {
case *regularFile:
stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
@@ -291,7 +338,7 @@ func (i *inode) statTo(stat *linux.Statx) {
case *deviceFile:
stat.RdevMajor = impl.major
stat.RdevMinor = impl.minor
- case *directory, *namedPipe:
+ case *socketFile, *directory, *namedPipe:
// Nothing to do.
default:
panic(fmt.Sprintf("unknown inode type: %T", i.impl))
@@ -345,28 +392,41 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
return syserror.EINVAL
}
}
+ now := i.clock.Now().Nanoseconds()
if mask&linux.STATX_ATIME != 0 {
- atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+ if stat.Atime.Nsec == linux.UTIME_NOW {
+ atomic.StoreInt64(&i.atime, now)
+ } else {
+ atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+ }
needsCtimeBump = true
}
if mask&linux.STATX_MTIME != 0 {
- atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+ if stat.Mtime.Nsec == linux.UTIME_NOW {
+ atomic.StoreInt64(&i.mtime, now)
+ } else {
+ atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+ }
needsCtimeBump = true
// Ignore the mtime bump, since we just set it ourselves.
needsMtimeBump = false
}
if mask&linux.STATX_CTIME != 0 {
- atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+ if stat.Ctime.Nsec == linux.UTIME_NOW {
+ atomic.StoreInt64(&i.ctime, now)
+ } else {
+ atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+ }
// Ignore the ctime bump, since we just set it ourselves.
needsCtimeBump = false
}
- now := i.clock.Now().Nanoseconds()
if needsMtimeBump {
atomic.StoreInt64(&i.mtime, now)
}
if needsCtimeBump {
atomic.StoreInt64(&i.ctime, now)
}
+
i.mu.Unlock()
return nil
}
@@ -426,6 +486,8 @@ func (i *inode) direntType() uint8 {
return linux.DT_DIR
case *symlink:
return linux.DT_LNK
+ case *socketFile:
+ return linux.DT_SOCK
case *deviceFile:
switch impl.kind {
case vfs.BlockDevice:
@@ -444,6 +506,92 @@ func (i *inode) isDir() bool {
return linux.FileMode(i.mode).FileType() == linux.S_IFDIR
}
+func (i *inode) touchAtime(mnt *vfs.Mount) {
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return
+ }
+ now := i.clock.Now().Nanoseconds()
+ i.mu.Lock()
+ atomic.StoreInt64(&i.atime, now)
+ i.mu.Unlock()
+ mnt.EndWrite()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCtime() {
+ now := i.clock.Now().Nanoseconds()
+ i.mu.Lock()
+ atomic.StoreInt64(&i.ctime, now)
+ i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCMtime() {
+ now := i.clock.Now().Nanoseconds()
+ i.mu.Lock()
+ atomic.StoreInt64(&i.mtime, now)
+ atomic.StoreInt64(&i.ctime, now)
+ i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
+// inode.mu.
+func (i *inode) touchCMtimeLocked() {
+ now := i.clock.Now().Nanoseconds()
+ atomic.StoreInt64(&i.mtime, now)
+ atomic.StoreInt64(&i.ctime, now)
+}
+
+func (i *inode) listxattr(size uint64) ([]string, error) {
+ return i.xattrs.Listxattr(size)
+}
+
+func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+ if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+ return "", err
+ }
+ if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+ return "", syserror.EOPNOTSUPP
+ }
+ if !i.userXattrSupported() {
+ return "", syserror.ENODATA
+ }
+ return i.xattrs.Getxattr(opts)
+}
+
+func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+ if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ if !i.userXattrSupported() {
+ return syserror.EPERM
+ }
+ return i.xattrs.Setxattr(opts)
+}
+
+func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+ if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ if !i.userXattrSupported() {
+ return syserror.EPERM
+ }
+ return i.xattrs.Removexattr(name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (i *inode) userXattrSupported() bool {
+ filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
+ return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+}
+
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl.
type fileDescription struct {
@@ -471,3 +619,23 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
creds := auth.CredentialsFromContext(ctx)
return fd.inode().setStat(ctx, creds, &opts.Stat)
}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.inode().listxattr(size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+ return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+ return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+ return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+}
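The tmpfs xattr plumbing above gates everything on the user.* namespace and on file type: names outside user.* return EOPNOTSUPP, and user.* attributes are only honored on regular files and directories (getxattr reports ENODATA for other types, while setxattr/removexattr report EPERM). A small self-contained sketch of the set-side policy, with simplified error values and local mode constants mirroring S_IFMT/S_IFREG/S_IFDIR:

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

const (
	sIFMT  = 0xf000
	sIFREG = 0x8000
	sIFDIR = 0x4000
)

var (
	errOpNotSupp = errors.New("EOPNOTSUPP")
	errPerm      = errors.New("EPERM")
)

// checkUserXattr mirrors the policy in the hunks above: only user.* names are
// supported, and only on regular files and directories.
func checkUserXattr(name string, mode uint32) error {
	if !strings.HasPrefix(name, "user.") {
		return errOpNotSupp
	}
	if ft := mode & sIFMT; ft != sIFREG && ft != sIFDIR {
		return errPerm
	}
	return nil
}

func main() {
	fmt.Println(checkUserXattr("user.note", sIFREG)) // <nil>
	fmt.Println(checkUserXattr("trusted.x", sIFREG)) // EOPNOTSUPP
	fmt.Println(checkUserXattr("user.note", 0xa000)) // EPERM (symlink-like mode)
}
```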
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index beba29a09..e0ff58d8c 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -169,6 +169,8 @@ go_library(
"//pkg/sentry/fs/lock",
"//pkg/sentry/fs/timerfd",
"//pkg/sentry/fsbridge",
+ "//pkg/sentry/fsimpl/kernfs",
+ "//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/hostcpu",
"//pkg/sentry/inet",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index d09d97825..ed40b5303 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -307,6 +307,61 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
return fds, nil
}
+// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return nil, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if fd >= end {
+ return nil, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Start from f.next to find an available fd.
+ if fd < f.next {
+ fd = f.next
+ }
+
+ // Install all entries.
+ for i := fd; i < end && len(fds) < len(files); i++ {
+ if d, _, _ := f.getVFS2(i); d == nil {
+ f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
+ fds = append(fds, i) // Record the file descriptor.
+ }
+ }
+
+ // Failure? Unwind existing FDs.
+ if len(fds) < len(files) {
+ for _, i := range fds {
+ f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ }
+ return nil, syscall.EMFILE
+ }
+
+ if fd == f.next {
+ // Update next search start position.
+ f.next = fds[len(fds)-1] + 1
+ }
+
+ return fds, nil
+}
+
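NewFDsVFS2 scans upward from max(fd, f.next) and either installs every file or unwinds whatever it placed, so callers see all-or-none behavior. A stripped-down sketch of that scan over a plain slice (no locking, no rlimits), purely to illustrate the unwind; names are invented for the example:

```go
package main

import (
	"errors"
	"fmt"
)

var errMFILE = errors.New("EMFILE")

// allocFDs installs the given values into table at the lowest free slots at
// or above start. On failure it removes anything it installed, mirroring the
// all-or-none contract of NewFDsVFS2.
func allocFDs(table []string, start int, values []string) ([]int, error) {
	var fds []int
	for i := start; i < len(table) && len(fds) < len(values); i++ {
		if table[i] == "" {
			table[i] = values[len(fds)]
			fds = append(fds, i)
		}
	}
	if len(fds) < len(values) {
		for _, i := range fds {
			table[i] = "" // Unwind the partial allocation.
		}
		return nil, errMFILE
	}
	return fds, nil
}

func main() {
	table := make([]string, 8)
	table[0], table[2] = "stdin", "stderr"
	fds, err := allocFDs(table, 0, []string{"a", "b", "c"})
	fmt.Println(fds, err) // [1 3 4] <nil>
}
```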
// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
// the given file description. If it succeeds, it takes a reference on file.
func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 6feda8fa1..de8a95854 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -225,6 +226,11 @@ type Kernel struct {
// by extMu.
nextSocketEntry uint64
+ // socketMount is a disconnected vfs.Mount, not included in k.vfs,
+ // representing a sockfs.filesystem. socketMount is used to back
+ // VirtualDentries representing anonymous sockets.
+ socketMount *vfs.Mount
+
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -348,6 +354,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
+ if VFS2Enabled {
+ if err := k.vfs.Init(); err != nil {
+ return fmt.Errorf("failed to initialize VFS: %v", err)
+ }
+ fs := sockfs.NewFilesystem(&k.vfs)
+ // NewDisconnectedMount will take an additional reference on fs.
+ defer fs.DecRef()
+ sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to initialize socket mount: %v", err)
+ }
+ k.socketMount = sm
+ }
return nil
}
@@ -545,15 +564,25 @@ func (ts *TaskSet) unregisterEpollWaiters() {
ts.mu.RLock()
defer ts.mu.RUnlock()
+
+ // Tasks that belong to the same process could potentially point to the
+ // same FDTable. So we retain a map of processed ones to avoid
+ // processing the same FDTable multiple times.
+ processed := make(map[*FDTable]struct{})
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
- })
+ if t.fdTable == nil {
+ continue
+ }
+ if _, ok := processed[t.fdTable]; ok {
+ continue
}
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
+ }
+ })
+ processed[t.fdTable] = struct{}{}
}
}
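Since threads in the same process share one FDTable, iterating over every task would visit the same table repeatedly; the fix above remembers which tables have already been handled. The pattern is simply a set keyed by pointer, as in this tiny stand-alone sketch:

```go
package main

import "fmt"

type fdTable struct{ name string }

func main() {
	shared := &fdTable{"proc1"}
	tables := []*fdTable{shared, shared, {"proc2"}, nil}

	processed := make(map[*fdTable]struct{})
	for _, tbl := range tables {
		if tbl == nil {
			continue
		}
		if _, ok := processed[tbl]; ok {
			continue // Already handled via another task.
		}
		fmt.Println("processing", tbl.name)
		processed[tbl] = struct{}{}
	}
	// Prints "processing proc1" and "processing proc2" once each.
}
```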
@@ -1015,14 +1044,17 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if !VFS2Enabled {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ if VFS2Enabled {
+ if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+ tfd.PauseTimer()
+ }
+ } else {
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.PauseTimer()
}
- })
- }
+ }
+ })
}
}
k.timekeeper.PauseUpdates()
@@ -1047,15 +1079,18 @@ func (k *Kernel) resumeTimeLocked() {
it.ResumeTimer()
}
}
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if !VFS2Enabled {
- if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ if VFS2Enabled {
+ if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+ tfd.ResumeTimer()
+ }
+ } else {
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.ResumeTimer()
}
- })
- }
+ }
+ })
}
}
}
@@ -1416,9 +1451,10 @@ func (k *Kernel) SupervisorContext() context.Context {
// +stateify savable
type SocketEntry struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- ID uint64 // Socket table entry number.
+ k *Kernel
+ Sock *refs.WeakRef
+ SockVFS2 *vfs.FileDescription
+ ID uint64 // Socket table entry number.
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
@@ -1441,7 +1477,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Unlock()
}
+// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
+// tracking.
+//
+// Precondition: Caller must hold a reference to sock.
+//
+// Note that the socket table will not hold a reference on the
+// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ id := k.nextSocketEntry
+ k.nextSocketEntry++
+ s := &SocketEntry{
+ k: k,
+ ID: id,
+ SockVFS2: sock,
+ }
+ k.sockets.PushBack(s)
+ k.extMu.Unlock()
+}
+
// ListSockets returns a snapshot of all sockets.
+//
+// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketEntry {
k.extMu.Lock()
var socks []*SocketEntry
@@ -1452,6 +1511,11 @@ func (k *Kernel) ListSockets() []*SocketEntry {
return socks
}
+// SocketMount returns the global socket mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+ return k.socketMount
+}
+
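Because the VFS2 socket table holds no reference of its own, a reader must take one before touching SockVFS2 and must tolerate TryIncRef failing for a socket that is concurrently being destroyed. The sketch below illustrates that walk with a local reference-counted type; it is not the real SocketEntry or vfs.FileDescription API:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// entry is a stand-in for a table entry: the table stores a pointer but holds
// no reference, so readers must tryIncRef before using it.
type entry struct{ refs int64 }

// tryIncRef succeeds only while the object is still live (refs > 0).
func (e *entry) tryIncRef() bool {
	for {
		r := atomic.LoadInt64(&e.refs)
		if r <= 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&e.refs, r, r+1) {
			return true
		}
	}
}

func (e *entry) decRef() { atomic.AddInt64(&e.refs, -1) }

func main() {
	table := []*entry{{refs: 1}, {refs: 0}} // Second entry already destroyed.
	for i, e := range table {
		if !e.tryIncRef() {
			fmt.Println("skipping dead entry", i)
			continue
		}
		fmt.Println("inspecting entry", i)
		e.decRef()
	}
}
```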
// supervisorContext is a privileged context.
type supervisorContext struct {
context.NoopSleeper
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 725e9db7d..62c8691f1 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -255,7 +255,8 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
// atomic, but requires no atomicity for writes larger than this.
wanted := ops.left()
- if avail := p.max - p.view.Size(); wanted > avail {
+ avail := p.max - p.view.Size()
+ if wanted > avail {
if wanted <= p.atomicIOBytes {
return 0, syserror.ErrWouldBlock
}
@@ -268,8 +269,14 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
return done, err
}
- if wanted > done {
- // Partial write due to full pipe.
+ if done < avail {
+ // Non-failure, but short write.
+ return done, nil
+ }
+ if done < wanted {
+ // Partial write due to full pipe. Note that this could also be
+ // the short write case above; in that case we would expect a
+ // second call and the write to return zero bytes.
return done, syserror.ErrWouldBlock
}
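The rewritten write path distinguishes three outcomes: a write that cannot proceed atomically (wanted > available and wanted <= PIPE_BUF) blocks, a short write that used less than the available space succeeds, and a write cut off by a full pipe returns ErrWouldBlock so the caller retries. A small sketch of the pre-write decision; errWouldBlock is a local stand-in for syserror.ErrWouldBlock:

```go
package main

import (
	"errors"
	"fmt"
)

var errWouldBlock = errors.New("would block")

// planWrite decides how many bytes a pipe write may attempt right now.
// Writes of at most atomicIOBytes (PIPE_BUF) must be atomic, so they block
// rather than be split; larger writes are truncated to the available space.
func planWrite(wanted, avail, atomicIOBytes int64) (int64, error) {
	if wanted > avail {
		if wanted <= atomicIOBytes {
			return 0, errWouldBlock // Must not split a PIPE_BUF-sized write.
		}
		return avail, nil // Write what fits; the caller retries for the rest.
	}
	return wanted, nil
}

func main() {
	const pipeBuf = 4096
	fmt.Println(planWrite(100, 50, pipeBuf))  // 0 would block
	fmt.Println(planWrite(8192, 50, pipeBuf)) // 50 <nil>
	fmt.Println(planWrite(100, 200, pipeBuf)) // 100 <nil>
}
```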
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 35ad97d5d..e23e796ef 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
return false
}
- // TODO: Yama LSM
return true
}
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index ded95f532..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() {
}
var cs linux.RSeqCriticalSection
- if err := cs.CopyIn(t, critAddr); err != nil {
+ if _, err := cs.CopyIn(t, critAddr); err != nil {
t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 208569057..f66cfcc7f 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A
func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
s.mu.Lock()
defer s.mu.Unlock()
- // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
+ // RemoveMapping may be called during task exit, when ctx
// is context.Background. Gracefully handle missing clocks. Failing to
// update the detach time in these cases is ok, since no one can observe the
// omission.
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 93c4fe969..2e3565747 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -209,65 +209,61 @@ type Stracer interface {
// SyscallEnter is called on syscall entry.
//
// The returned private data is passed to SyscallExit.
- //
- // TODO(gvisor.dev/issue/155): remove kernel imports from the strace
- // package so that the type can be used directly.
SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
// SyscallExit is called on syscall exit.
SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
}
-// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
-// is *immutable*. In order to make supporting suspend and resume sane, they
-// must be uniquely registered and may not change during operation.
+// SyscallTable is a lookup table of system calls.
//
-// +stateify savable
+// Note that a SyscallTable is not savable directly. Instead, it is saved as
+// an OS/Arch pair, and the table is looked up again on restore.
type SyscallTable struct {
// OS is the operating system that this syscall table implements.
- OS abi.OS `state:"wait"`
+ OS abi.OS
// Arch is the architecture that this syscall table targets.
- Arch arch.Arch `state:"wait"`
+ Arch arch.Arch
// The OS version that this syscall table implements.
- Version Version `state:"manual"`
+ Version Version
// AuditNumber is a numeric constant that represents the syscall table. If
// non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by
// linux/audit.h.
- AuditNumber uint32 `state:"manual"`
+ AuditNumber uint32
// Table is the collection of functions.
- Table map[uintptr]Syscall `state:"manual"`
+ Table map[uintptr]Syscall
// lookup is a fixed-size array that holds the syscalls (indexed by
// their numbers). It is used for fast look ups.
- lookup []SyscallFn `state:"manual"`
+ lookup []SyscallFn
// Emulate is a collection of instruction addresses to emulate. The
// keys are addresses, and the values are system call numbers.
- Emulate map[usermem.Addr]uintptr `state:"manual"`
+ Emulate map[usermem.Addr]uintptr
// The function to call in case of a missing system call.
- Missing MissingFn `state:"manual"`
+ Missing MissingFn
// Stracer traces this syscall table.
- Stracer Stracer `state:"manual"`
+ Stracer Stracer
// External is used to handle an external callback.
- External func(*Kernel) `state:"manual"`
+ External func(*Kernel)
// ExternalFilterBefore is called before External is called before the syscall is executed.
// External is not called if it returns false.
- ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+ ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool
// ExternalFilterAfter is called before External is called after the syscall is executed.
// External is not called if it returns false.
- ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+ ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool
// FeatureEnable stores the strace and one-shot enable bits.
- FeatureEnable SyscallFlagsTable `state:"manual"`
+ FeatureEnable SyscallFlagsTable
}
// allSyscallTables contains all known tables.
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 00358326b..90f890495 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -14,16 +14,34 @@
package kernel
-import "fmt"
+import (
+ "fmt"
-// afterLoad is invoked by stateify.
-func (s *SyscallTable) afterLoad() {
- otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
- if !ok {
- // Couldn't find a reference?
- panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// syscallTableInfo is used to reload the SyscallTable.
+//
+// +stateify savable
+type syscallTableInfo struct {
+ OS abi.OS
+ Arch arch.Arch
+}
+
+// saveSt saves the SyscallTable.
+func (tc *TaskContext) saveSt() syscallTableInfo {
+ return syscallTableInfo{
+ OS: tc.st.OS,
+ Arch: tc.st.Arch,
}
+}
- // Copy the table.
- *s = *otherTable
+// loadSt loads the SyscallTable.
+func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+ st, ok := LookupSyscallTable(sti.OS, sti.Arch)
+ if !ok {
+ panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
+ }
+ tc.st = st // Save the table reference.
}
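Rather than serializing the function-pointer-laden SyscallTable, the state machinery now saves only an (OS, Arch) key and re-resolves the table on load, via the `state:".(syscallTableInfo)"` hook on TaskContext.st further down. The underlying pattern is a registry lookup keyed by an identifier; a minimal stand-alone version with invented names:

```go
package main

import "fmt"

type tableKey struct{ os, arch string }

// registry plays the role of allSyscallTables: tables register themselves at
// init time and are looked up again after restore.
var registry = map[tableKey]*syscallTable{}

type syscallTable struct {
	key     tableKey
	handler func(sysno int) string // Not savable; resolved via the key.
}

func register(t *syscallTable) { registry[t.key] = t }

// saveKey and loadTable mirror saveSt/loadSt: only the key crosses the
// save/restore boundary.
func saveKey(t *syscallTable) tableKey { return t.key }

func loadTable(k tableKey) *syscallTable {
	t, ok := registry[k]
	if !ok {
		panic(fmt.Sprintf("syscall table not found for %v", k))
	}
	return t
}

func main() {
	register(&syscallTable{
		key:     tableKey{"linux", "amd64"},
		handler: func(sysno int) string { return fmt.Sprintf("sys_%d", sysno) },
	})

	saved := saveKey(registry[tableKey{"linux", "amd64"}])
	restored := loadTable(saved)
	fmt.Println(restored.handler(1)) // sys_1
}
```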
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index d6546735e..e5d133d6c 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -777,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error
return t.fdTable.NewFDs(t, fd, files, flags)
}
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+ return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
//
// This automatically passes the task as the context.
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 0158b1788..9fa528384 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -49,7 +49,7 @@ type TaskContext struct {
fu *futex.Manager
// st is the task's syscall table.
- st *SyscallTable
+ st *SyscallTable `state:".(syscallTableInfo)"`
}
// release releases all resources held by the TaskContext. release is called by
@@ -58,7 +58,6 @@ func (tc *TaskContext) release() {
// Nil out pointers so that if the task is saved after release, it doesn't
// follow the pointers to possibly now-invalid objects.
if tc.MemoryManager != nil {
- // TODO(b/38173783)
tc.MemoryManager.DecUsers(context.Background())
tc.MemoryManager = nil
}
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index ce3e6ef28..0325967e4 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) {
t.creds.Store(creds)
}
-// updateCredsForExec updates t.creds to reflect an execve().
+// updateCredsForExecLocked updates t.creds to reflect an execve().
//
// NOTE(b/30815691): We currently do not implement privileged executables
// (set-user/group-ID bits and file capabilities). This allows us to make a lot
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8802db142..6aa798346 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -513,8 +513,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
if t.stop != nil {
return false
}
- // - TODO(b/38173783): No special case for when t is also the sending task,
- // because the identity of the sender is unknown.
// - Do not choose tasks that have already been interrupted, as they may be
// busy handling another signal.
if len(t.interruptChan) != 0 {
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 706de83ef..e959700f2 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -245,7 +245,7 @@ type Clock interface {
type WallRateClock struct{}
// WallTimeUntil implements Clock.WallTimeUntil.
-func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration {
return t.Sub(now)
}
@@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
type NoClockEvents struct{}
// Readiness implements waiter.Waitable.Readiness.
-func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
return 0
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+func (*NoClockEvents) EventUnregister(e *waiter.Entry) {
}
// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
@@ -273,7 +273,7 @@ type ClockEventsQueue struct {
}
// Readiness implements waiter.Waitable.Readiness.
-func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
return 0
}
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 0332fc71c..5c667117c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -201,8 +201,10 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
if pma.needCOW {
perms.Write = false
}
- if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
- return err
+ if perms.Any() { // MapFile precondition
+ if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
+ return err
+ }
}
pseg = pseg.NextSegment()
}
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index cb29d94b0..379148903 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -59,25 +59,27 @@ func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
}
a.contexts[id] = &AIOContext{
- done: make(chan struct{}, 1),
+ requestReady: make(chan struct{}, 1),
maxOutstanding: events,
}
return true
}
-// destroyAIOContext destroys an asynchronous I/O context.
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
//
-// False is returned if the context does not exist.
-func (a *aioManager) destroyAIOContext(id uint64) bool {
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
a.mu.Lock()
defer a.mu.Unlock()
ctx, ok := a.contexts[id]
if !ok {
- return false
+ return nil
}
delete(a.contexts, id)
ctx.destroy()
- return true
+ return ctx
}
// lookupAIOContext looks up the given context.
@@ -102,8 +104,8 @@ type ioResult struct {
//
// +stateify savable
type AIOContext struct {
- // done is the notification channel used for all requests.
- done chan struct{} `state:"nosave"`
+ // requestReady is the notification channel used for all requests.
+ requestReady chan struct{} `state:"nosave"`
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -129,8 +131,14 @@ func (ctx *AIOContext) destroy() {
ctx.mu.Lock()
defer ctx.mu.Unlock()
ctx.dead = true
- if ctx.outstanding == 0 {
- close(ctx.done)
+ ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+ if ctx.dead && ctx.outstanding == 0 {
+ close(ctx.requestReady)
+ ctx.requestReady = nil
}
}
@@ -154,11 +162,12 @@ func (ctx *AIOContext) PopRequest() (interface{}, bool) {
// Is there anything ready?
if e := ctx.results.Front(); e != nil {
- ctx.results.Remove(e)
- ctx.outstanding--
- if ctx.outstanding == 0 && ctx.dead {
- close(ctx.done)
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
}
+ ctx.outstanding--
+ ctx.results.Remove(e)
+ ctx.checkForDone()
return e.data, true
}
return nil, false
@@ -172,26 +181,58 @@ func (ctx *AIOContext) FinishRequest(data interface{}) {
// Push to the list and notify opportunistically. The channel notify
// here is guaranteed to be safe because outstanding must be non-zero.
- // The done channel is only closed when outstanding reaches zero.
+ // The requestReady channel is only closed when outstanding reaches zero.
ctx.results.PushBack(&ioResult{data: data})
select {
- case ctx.done <- struct{}{}:
+ case ctx.requestReady <- struct{}{}:
default:
}
}
// WaitChannel returns a channel that is notified when an AIO request is
-// completed.
-//
-// The boolean return value indicates whether or not the context is active.
-func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) {
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
ctx.mu.Lock()
defer ctx.mu.Unlock()
- if ctx.outstanding == 0 && ctx.dead {
- return nil, false
+ return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
}
- return ctx.done, true
+ ctx.outstanding--
+ ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ return
+ }
+ size := uint32(ctx.results.Len())
+ if ctx.outstanding < size {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding -= size
+ ctx.results.Reset()
+ ctx.checkForDone()
}
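With requestReady closed and set to nil once the context is dead and drained, WaitChannel doubles as a liveness check: a nil channel tells the caller there is nothing left to wait for. A self-contained miniature of that consumer loop; the queue and result types are stand-ins, not the real AIOContext:

```go
package main

import "fmt"

type queue struct {
	requestReady chan struct{}
	results      []string
}

// pop returns a completed result, if any.
func (q *queue) pop() (string, bool) {
	if len(q.results) == 0 {
		return "", false
	}
	r := q.results[0]
	q.results = q.results[1:]
	return r, true
}

func main() {
	q := &queue{requestReady: make(chan struct{}, 1), results: []string{"r1", "r2"}}

	// Simulate destruction once the pending results are delivered: the
	// channel is closed so waiters wake up, then forgotten (set to nil).
	close(q.requestReady)
	q.requestReady = nil

	for {
		if r, ok := q.pop(); ok {
			fmt.Println("completed:", r)
			continue
		}
		ch := q.requestReady
		if ch == nil {
			fmt.Println("context destroyed and drained; stopping")
			return
		}
		<-ch // Wait for the next completion (or destruction).
	}
}
```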
// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
@@ -332,9 +373,9 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
Length: aioRingBufferSize,
MappingIdentity: m,
Mappable: m,
- // TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ |
- // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this
- // mapping read-only?
+ // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+ // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+ // user mode should not write to this page.
Perms: usermem.Read,
MaxPerms: usermem.Read,
})
@@ -349,11 +390,11 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
return id, nil
}
-// DestroyAIOContext destroys an asynchronous I/O context. It returns false if
-// the context does not exist.
-func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool {
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context, or nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
if _, ok := mm.LookupAIOContext(ctx, id); !ok {
- return false
+ return nil
}
// Only unmaps after it assured that the address is a valid aio context to
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
index c37fc9f7b..3dabac1af 100644
--- a/pkg/sentry/mm/aio_context_state.go
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -16,5 +16,5 @@ package mm
// afterLoad is invoked by stateify.
func (a *AIOContext) afterLoad() {
- a.done = make(chan struct{}, 1)
+ a.requestReady = make(chan struct{}, 1)
}
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index e27f57536..159f7eafd 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -70,7 +70,6 @@ go_test(
"requires-kvm",
],
deps = [
- "//pkg/procid",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm/testutil",
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 555b5fa96..4b23f7803 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -46,14 +46,6 @@ var (
// bounceSignalMask has only bounceSignal set.
bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
- // otherSignalsMask includes all other signals that will be cause the
- // vCPU to exit during execution.
- //
- // Currently, this includes the preemption signal and the profiling
- // signal. In general, these should be signals whose delivery actually
- // influences the way the program executes as the switch can be costly.
- otherSignalsMask = uint64(1<<(uint64(syscall.SIGURG)-1)) | uint64(1<<(uint64(syscall.SIGPROF)-1))
-
// bounce is the interrupt vector used to return to the kernel.
bounce = uint32(ring0.VirtualizationException)
@@ -94,8 +86,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
}
func init() {
- // Install the handler, masking all signals.
- if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler, ^uint64(0)); err != nil {
+ // Install the handler.
+ if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 4e9d80765..9add7c944 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -24,7 +24,6 @@ import (
"syscall"
"unsafe"
- "gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/sentry/arch"
)
@@ -59,19 +58,6 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
return &((*arch.UContext64)(context).MContext)
}
-// injectInterrupt is a helper to inject an interrupt.
-//
-//go:nosplit
-func injectInterrupt(c *vCPU) {
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_INTERRUPT,
- uintptr(unsafe.Pointer(&bounce))); errno != 0 {
- throw("interrupt injection failed")
- }
-}
-
// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
@@ -83,9 +69,6 @@ func bluepillHandler(context unsafe.Pointer) {
// Sanitize the registers; interrupts must always be disabled.
c := bluepillArchEnter(bluepillArchContext(context))
- // Enable preemption.
- c.setSignalMask(true)
-
// Increment the number of switches.
atomic.AddUint32(&c.switches, 1)
@@ -106,9 +89,6 @@ func bluepillHandler(context unsafe.Pointer) {
// interrupted KVM. Since we're in a signal handler
// currently, all signals are masked and the signal
// must have been delivered directly to this thread.
- //
- // We will not be able to actually do subsequent
- // KVM_RUNs until this signal is processed.
timeout := syscall.Timespec{}
sig, _, errno := syscall.RawSyscall6(
syscall.SYS_RT_SIGTIMEDWAIT,
@@ -118,24 +98,12 @@ func bluepillHandler(context unsafe.Pointer) {
8, // sigset size.
0, 0)
if errno == syscall.EAGAIN {
- // If weren't able to process this signal, then
- // it must not have been in the bounceMask. By
- // elimination, it must have been the
- // preemption signal. We can't process this
- // signal right now, so we need to disable
- // preemption until the interrupt is actually
- // handled.
- c.setSignalMask(false)
- // Note that there is a waiter for this vCPU.
- // This will cause the vCPU to exit at some
- // point in the future (releasing the user lock
- // and guest mode).
- atomicbitops.OrUint32(&c.state, vCPUWaiter)
- } else if errno != 0 {
- // We only expect success or a timeout.
+ continue
+ }
+ if errno != 0 {
throw("error waiting for pending signal")
- } else if sig != uintptr(bounceSignal) {
- // Only the bounce should be processed.
+ }
+ if sig != uintptr(bounceSignal) {
throw("unexpected signal")
}
@@ -146,10 +114,11 @@ func bluepillHandler(context unsafe.Pointer) {
// ready.
if c.runData.readyForInterruptInjection == 0 {
c.runData.requestInterruptWindow = 1
+ continue // Rerun vCPU.
} else {
- injectInterrupt(c)
+ // Force injection below; the vCPU is ready.
+ c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
}
- continue // Rerun vCPU.
case syscall.EFAULT:
// If a fault is not serviceable due to the host
// backing pages having page permissions, instead of an
@@ -168,30 +137,6 @@ func bluepillHandler(context unsafe.Pointer) {
}
switch c.runData.exitReason {
- case _KVM_EXIT_HLT:
- // Copy out registers.
- bluepillArchExit(c, bluepillArchContext(context))
-
- // Return to the vCPUReady state; notify any waiters.
- user := atomic.LoadUint32(&c.state) & vCPUUser
- switch atomic.SwapUint32(&c.state, user) {
- case user | vCPUGuest: // Expected case.
- case user | vCPUGuest | vCPUWaiter:
- c.notify()
- default:
- throw("invalid state")
- }
- return
- case _KVM_EXIT_IRQ_WINDOW_OPEN:
- // Inject an interrupt now.
- injectInterrupt(c)
- // Clear previous injection request.
- c.runData.requestInterruptWindow = 0
- case _KVM_EXIT_INTR:
- // This is fine, it is the normal exit reason during
- // signal delivery. However, we still need to handle
- // other potential exit reasons *combined* with EINTR,
- // so this switch must be hit even after the above.
case _KVM_EXIT_EXCEPTION:
c.die(bluepillArchContext(context), "exception")
return
@@ -210,6 +155,20 @@ func bluepillHandler(context unsafe.Pointer) {
case _KVM_EXIT_DEBUG:
c.die(bluepillArchContext(context), "debug")
return
+ case _KVM_EXIT_HLT:
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+ return
case _KVM_EXIT_MMIO:
// Increment the fault count.
atomic.AddUint32(&c.faults, 1)
@@ -241,6 +200,18 @@ func bluepillHandler(context unsafe.Pointer) {
data[i] = *b
}
}
+ case _KVM_EXIT_IRQ_WINDOW_OPEN:
+ // Interrupt: we must have requested an interrupt
+ // window; set the interrupt line.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_INTERRUPT,
+ uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+ throw("interrupt injection failed")
+ }
+ // Clear previous injection request.
+ c.runData.requestInterruptWindow = 0
case _KVM_EXIT_SHUTDOWN:
c.die(bluepillArchContext(context), "shutdown")
return
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 79045651e..716198712 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -18,6 +18,8 @@ package kvm
import (
"syscall"
+
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
)
type kvmOneReg struct {
@@ -46,6 +48,6 @@ type userRegs struct {
func updateGlobalOnce(fd int) error {
physicalInit()
err := updateSystemValues(int(fd))
- updateVectorTable()
+ ring0.Init()
return err
}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 07d9c9a98..1d5c77ff4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -48,7 +48,6 @@ const (
_KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
_KVM_EXIT_SHUTDOWN = 0x8
_KVM_EXIT_FAIL_ENTRY = 0x9
- _KVM_EXIT_INTR = 0xa
_KVM_EXIT_INTERNAL_ERROR = 0x11
_KVM_EXIT_SYSTEM_EVENT = 0x18
)
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index d42ba3f24..c42752d50 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -16,15 +16,12 @@ package kvm
import (
"math/rand"
- "os"
"reflect"
- "runtime"
"sync/atomic"
"syscall"
"testing"
"time"
- "gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
@@ -323,18 +320,15 @@ func TestBounce(t *testing.T) {
})
}
-// randomSleep is used by some race tests below.
-//
-// O(hundreds of microseconds) is appropriate to ensure different overlaps and
-// different schedules.
-func randomSleep() {
- if n := rand.Intn(1000); n > 100 {
- time.Sleep(time.Duration(n) * time.Microsecond)
- }
-}
-
func TestBounceStress(t *testing.T) {
applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ randomSleep := func() {
+ // O(hundreds of microseconds) is appropriate to ensure
+ // different overlaps and different schedules.
+ if n := rand.Intn(1000); n > 100 {
+ time.Sleep(time.Duration(n) * time.Microsecond)
+ }
+ }
for i := 0; i < 1000; i++ {
// Start an asynchronously executing goroutine that
// calls Bounce at pseudo-random point in time.
@@ -361,50 +355,6 @@ func TestBounceStress(t *testing.T) {
})
}
-func TestPreemption(t *testing.T) {
- applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
- // Lock the main vCPU thread.
- runtime.LockOSThread()
- pid := os.Getpid()
- tid := procid.Current()
- running := uint32(1)
- defer atomic.StoreUint32(&running, 0)
-
- // Start generating "preemptions".
- go func() {
- for atomic.LoadUint32(&running) != 0 {
- // Kick via a preemption: best effort.
- syscall.Tgkill(pid, int(tid), syscall.SIGURG)
- randomSleep()
- }
- }()
-
- for i := 0; i < 1000; i++ {
- randomSleep()
- var si arch.SignalInfo
- if _, err := c.SwitchToUser(ring0.SwitchOpts{
- Registers: regs,
- FloatingPointState: dummyFPState,
- PageTables: pt,
- }, &si); err != platform.ErrContextInterrupt {
- t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
- }
- // Was this caused by a preemption signal?
- if got := atomic.LoadUint32(&c.state); got&vCPUGuest != 0 && got&vCPUWaiter == 0 {
- continue
- }
- c.unlock()
- // Should have dropped from guest mode, processed preemption.
- if got := atomic.LoadUint32(&c.state); got != vCPUReady {
- t.Errorf("vCPU not in ready state: got %v", got)
- }
- randomSleep()
- c.lock()
- }
- return false
- })
-}
-
func TestInvalidate(t *testing.T) {
var data uintptr // Used below.
applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 345b71e8f..f1afc74dc 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -108,9 +108,6 @@ type vCPU struct {
// This is a bitmask of the three fields (vCPU*) described above.
state uint32
- // signalMask is the vCPU signal mask.
- signalMask uint64
-
// runData for this vCPU.
runData *runData
@@ -124,7 +121,6 @@ type vCPU struct {
// vCPUArchState is the architecture-specific state.
vCPUArchState
- // dieState is the temporary state associated with throwing exceptions.
dieState dieState
}
@@ -157,6 +153,11 @@ func (m *machine) newVCPU() *vCPU {
c.CPU.Init(&m.kernel, c)
m.vCPUsByID[c.id] = c
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ panic(fmt.Sprintf("error setting signal mask: %v", err))
+ }
+
// Map the run data.
runData, err := mapRunData(int(fd))
if err != nil {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 52286e56d..7156c245f 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -111,6 +111,31 @@ func (c *vCPU) setSystemTime() error {
return nil
}
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+ return nil
+}
+
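Both the amd64 and arm64 variants hand KVM an 8-byte sigset whose two 32-bit words are the complement of bounceSignalMask, so every signal except the bounce signal stays blocked while the vCPU runs. The word packing is plain bit arithmetic; a hedged sketch of just that packing (the example signal number is arbitrary, not gVisor's actual bounce signal):

```go
package main

import "fmt"

// packSigset splits a 64-bit "allowed" mask into the two complemented 32-bit
// words used in the KVM_SET_SIGNAL_MASK payload built above: bits that are
// set in mask end up unblocked, everything else stays blocked.
func packSigset(mask uint64) (mask1, mask2 uint32) {
	return ^uint32(mask & 0xffffffff), ^uint32(mask >> 32)
}

func main() {
	const sig = 34                 // Arbitrary example signal number.
	mask := uint64(1) << (sig - 1) // Same form as bounceSignalMask.
	m1, m2 := packSigset(mask)
	fmt.Printf("mask1=%#08x mask2=%#08x\n", m1, m2)
	// Only bit (sig-1-32) of the complemented high word is clear.
	fmt.Println(m2&(1<<(sig-1-32)) == 0) // true
}
```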
// setUserRegisters sets user registers in the vCPU.
func (c *vCPU) setUserRegisters(uregs *userRegs) error {
if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 185eeb4f0..3b35858ae 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -48,69 +48,6 @@ func (m *machine) initArchState() error {
return nil
}
-func getPageWithReflect(p uintptr) []byte {
- return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()]
-}
-
-// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
-//
-// According to the design documentation of Arm64,
-// the start address of exception vector table should be 11-bits aligned.
-// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
-// But, we can't align a function's start address to a specific address by using golang.
-// We have raised this question in golang community:
-// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
-// This function will be removed when golang supports this feature.
-//
-// There are 2 jobs were implemented in this function:
-// 1, move the start address of exception vector table into the specific address.
-// 2, modify the offset of each instruction.
-func updateVectorTable() {
- fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
- offset := fromLocation & (1<<11 - 1)
- if offset != 0 {
- offset = 1<<11 - offset
- }
-
- toLocation := fromLocation + offset
- page := getPageWithReflect(toLocation)
- if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
- panic(err)
- }
-
- page = getPageWithReflect(toLocation + 4096)
- if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
- panic(err)
- }
-
- // Move exception-vector-table into the specific address.
- var entry *uint32
- var entryFrom *uint32
- for i := 1; i <= 0x800; i++ {
- entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i)))
- entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i)))
- *entry = *entryFrom
- }
-
- // The offset from the address of each unconditionally branch is changed.
- // We should modify the offset of each instruction.
- nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780}
- for _, num := range nums {
- entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num)))
- *entry = *entry - (uint32)(offset/4)
- }
-
- page = getPageWithReflect(toLocation)
- if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
- panic(err)
- }
-
- page = getPageWithReflect(toLocation + 4096)
- if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
- panic(err)
- }
-}
-
// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
var (
@@ -268,6 +205,32 @@ func (c *vCPU) setSystemTime() error {
return nil
}
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+
+ return nil
+}
+
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
// Check for canonical addresses.
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index e4de0a889..f04be2ab5 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,47 +87,6 @@ func unmapRunData(r *runData) error {
return nil
}
-// setSignalMask sets the vCPU signal mask.
-//
-// This will be called from the bluepill handler, and therefore must not
-// perform any allocation.
-//
-//go:nosplit
-func (c *vCPU) setSignalMask(enableOthers bool) {
- // The signal mask is either:
- // *) Only the bounce signal, which we need to use to execute the
- // machine state up until the bounce interrupt can be processed.
- // or
- // *) All signals, which is the default state unless we need to
- // continue execution to exit guest mode (the case above).
- mask := bounceSignalMask
- if enableOthers {
- mask |= otherSignalsMask
- }
- if c.signalMask == mask {
- return // Already set.
- }
-
- // The layout of this structure implies that it will not necessarily be
- // the same layout chosen by the Go compiler. It gets fudged here.
- var data struct {
- length uint32
- mask1 uint32
- mask2 uint32
- _ uint32
- }
- data.length = 8 // Fixed sigset size.
- data.mask1 = ^uint32(mask & 0xffffffff)
- data.mask2 = ^uint32(mask >> 32)
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_SIGNAL_MASK,
- uintptr(unsafe.Pointer(&data))); errno != 0 {
- throw("setSignal mask failed")
- }
-}
-
// atomicAddressSpace is an atomic address space pointer.
type atomicAddressSpace struct {
pointer unsafe.Pointer
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 934b6fbcd..b69520030 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -72,11 +72,13 @@ go_library(
"lib_amd64.s",
"lib_arm64.go",
"lib_arm64.s",
+ "lib_arm64_unsafe.go",
"ring0.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/cpuid",
+ "//pkg/safecopy",
"//pkg/sentry/platform/ring0/pagetables",
"//pkg/usermem",
],
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index af075aae4..242b9305c 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -37,3 +37,10 @@ func SaveVRegs(*byte)
// LoadVRegs loads V0-V31 registers.
func LoadVRegs(*byte)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init() {
+ rewriteVectors()
+}
diff --git a/pkg/sentry/platform/ring0/lib_arm64_unsafe.go b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go
new file mode 100644
index 000000000..c05166fea
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go
@@ -0,0 +1,108 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+import (
+ "reflect"
+ "syscall"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/safecopy"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ nopInstruction = 0xd503201f
+ instSize = unsafe.Sizeof(uint32(0))
+ vectorsRawLen = 0x800
+)
+
+func unsafeSlice(addr uintptr, length int) (slice []uint32) {
+ hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+ hdr.Data = addr
+ hdr.Len = length / int(instSize)
+ hdr.Cap = length / int(instSize)
+ return slice
+}
+
+// Work around: move ring0.Vectors() to an address with 11-bit alignment.
+//
+// According to the Arm64 design documentation, the start address of the
+// exception vector table should be 11-bit aligned; see the Linux kernel code
+// in arch/arm64/kernel/entry.S as a reference.
+// However, golang cannot align a function's start address to a specific
+// boundary. We have raised this question in the golang community:
+// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
+// This function will be removed when golang supports this feature.
+//
+// This function does two jobs:
+// 1. Move the start address of the exception vector table to the aligned address.
+// 2. Modify the offset of each instruction accordingly.
+func rewriteVectors() {
+ vectorsBegin := reflect.ValueOf(Vectors).Pointer()
+
+ // The exception vector table is required to be 11-bit aligned, and its
+ // size is 0x800. Please see the documentation as reference:
+ // https://developer.arm.com/docs/100933/0100/aarch64-exception-vector-table
+ //
+ // Since golang does not allow placing a function at a specific address,
+ // gVisor defines the exception vector table as 4K and fills the 2nd 2K
+ // part with NOPs, so that the 1st 2K part can be safely moved to an
+ // address with 11-bit alignment.
+ //
+ // So, the prerequisite for this function to work correctly is:
+ // vectorsSafeLen >= 0x1000
+ // vectorsRawLen = 0x800
+ vectorsSafeLen := int(safecopy.FindEndAddress(vectorsBegin) - vectorsBegin)
+ if vectorsSafeLen < 2*vectorsRawLen {
+ panic("Can't update vectors")
+ }
+
+ vectorsSafeTable := unsafeSlice(vectorsBegin, vectorsSafeLen) // Now a []uint32
+ vectorsRawLen32 := vectorsRawLen / int(instSize)
+
+ offset := vectorsBegin & (1<<11 - 1)
+ if offset != 0 {
+ offset = 1<<11 - offset
+ }
+
+ pageBegin := (vectorsBegin + offset) & ^uintptr(usermem.PageSize-1)
+
+ _, _, errno := syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC))
+ if errno != 0 {
+ panic(errno.Error())
+ }
+
+ offset = offset / instSize // By index, not bytes.
+ // Move the exception vector table to the target address; ideally this would use memmove.
+ for i := 1; i <= vectorsRawLen32; i++ {
+ vectorsSafeTable[int(offset)+vectorsRawLen32-i] = vectorsSafeTable[vectorsRawLen32-i]
+ }
+
+ // Adjust branch offsets, since the instructions were moved forward.
+ for i := 0; i < vectorsRawLen32; i++ {
+ if vectorsSafeTable[int(offset)+i] != nopInstruction {
+ vectorsSafeTable[int(offset)+i] -= uint32(offset)
+ }
+ }
+
+ _, _, errno = syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_EXEC))
+ if errno != 0 {
+ panic(errno.Error())
+ }
+}
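The heart of rewriteVectors is the padding computation: round the table's start address up to the next 2KB (2^11) boundary and express the shift in 4-byte instruction slots. That arithmetic on its own, as a runnable sketch with example addresses:

```go
package main

import "fmt"

const (
	alignBits = 11 // The exception vector table must be 2^11-byte aligned.
	instSize  = 4  // AArch64 instructions are 4 bytes.
)

// alignmentOffset returns the number of bytes needed to shift a table
// starting at addr onto the next 2KB boundary (zero if already aligned).
func alignmentOffset(addr uintptr) uintptr {
	offset := addr & (1<<alignBits - 1)
	if offset != 0 {
		offset = 1<<alignBits - offset
	}
	return offset
}

func main() {
	for _, addr := range []uintptr{0x40000, 0x40010, 0x407fc} {
		off := alignmentOffset(addr)
		fmt.Printf("addr=%#x shift=%#x bytes (%d instructions)\n",
			addr, off, off/instSize)
	}
	// 0x40000 is already aligned; 0x40010 needs 0x7f0 bytes; 0x407fc needs 4.
}
```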
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
index dcf061df9..157438d9b 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build i386 amd64
+// +build 386 amd64
package pagetables
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 5f80d64e8..9da0ea685 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build i386 amd64
+// +build 386 amd64
package ring0
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 611fa22c3..c40c6d673 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -16,6 +16,7 @@ go_library(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/time",
"//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/vfs",
"//pkg/syserr",
"//pkg/tcpip",
"//pkg/usermem",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index f14c336b9..7ac38764d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -300,7 +300,7 @@ type SocketOperations struct {
// New creates a new endpoint socket.
func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
if skType == linux.SOCK_STREAM {
- if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil {
+ if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
}
@@ -535,7 +535,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
}
if resCh != nil {
- t := ctx.(*kernel.Task)
+ t := kernel.TaskFromContext(ctx)
if err := t.Block(resCh); err != nil {
return 0, syserr.FromError(err).ToError()
}
@@ -608,7 +608,7 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader
}
if resCh != nil {
- t := ctx.(*kernel.Task)
+ t := kernel.TaskFromContext(ctx)
if err := t.Block(resCh); err != nil {
return 0, syserr.FromError(err).ToError()
}
@@ -663,7 +663,7 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error
// This is a hack to work around the fact that both IPv4 and IPv6 ANY are
// represented by the empty string.
//
-// TODO(gvisor.dev/issues/1556): remove this function.
+// TODO(gvisor.dev/issue/1556): remove this function.
func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
@@ -940,7 +940,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
// GetSockOpt can be used to implement the linux syscall getsockopt(2) for
// sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -965,8 +965,15 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int,
return nil, syserr.ErrProtocolNotAvailable
}
+func boolToInt32(v bool) int32 {
+ if v {
+ return 1
+ }
+ return 0
+}
+
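boolToInt32 collapses the repeated bool-to-wire conversions in this file, and for TCP_NODELAY the kernel option is inverted: DelayOption enabled means Nagle-style delay is on, so NODELAY is reported as 0. A minimal illustration of both conversions; tcpNoDelayFromDelay is a hypothetical helper name for this example, not part of netstack:

```go
package main

import "fmt"

func boolToInt32(v bool) int32 {
	if v {
		return 1
	}
	return 0
}

// tcpNoDelayFromDelay converts netstack's DelayOption (delay enabled) into the
// TCP_NODELAY value reported to getsockopt(2), which is its inverse.
func tcpNoDelayFromDelay(delay bool) int32 {
	return boolToInt32(!delay)
}

func main() {
	fmt.Println(boolToInt32(true), boolToInt32(false)) // 1 0
	fmt.Println(tcpNoDelayFromDelay(true))             // 0: delay on => NODELAY off
	fmt.Println(tcpNoDelayFromDelay(false))            // 1
}
```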
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
switch name {
case linux.SO_ERROR:
@@ -998,12 +1005,11 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.PasscredOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.PasscredOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.SO_SNDBUF:
if outLen < sizeOfInt32 {
@@ -1042,24 +1048,22 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.ReuseAddressOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.SO_REUSEPORT:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.ReusePortOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.ReusePortOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.SO_BINDTODEVICE:
var v tcpip.BindToDeviceOption
@@ -1089,24 +1093,22 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.BroadcastOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.BroadcastOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.SO_KEEPALIVE:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.KeepaliveEnabledOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.SO_LINGER:
if outLen < linux.SizeOfLinger {
@@ -1156,47 +1158,41 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return nil, syserr.ErrInvalidArgument
}
- v, err := ep.GetSockOptInt(tcpip.DelayOption)
+ v, err := ep.GetSockOptBool(tcpip.DelayOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- if v == 0 {
- return int32(1), nil
- }
- return int32(0), nil
+ return boolToInt32(!v), nil
case linux.TCP_CORK:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.CorkOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.CorkOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.TCP_QUICKACK:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.QuickAckOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.QuickAckOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ return boolToInt32(v), nil
case linux.TCP_MAXSEG:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.MaxSegOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
@@ -1328,11 +1324,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- var o int32
- if v {
- o = 1
- }
- return o, nil
+ return boolToInt32(v), nil
case linux.IPV6_PATHMTU:
t.Kernel().EmitUnimplementedEvent(t)
@@ -1342,8 +1334,8 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if outLen == 0 {
return make([]byte, 0), nil
}
- var v tcpip.IPv6TrafficClassOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
@@ -1365,12 +1357,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- var o int32
- if v {
- o = 1
- }
- return o, nil
+ return boolToInt32(v), nil
default:
emitUnimplementedEventIPv6(t, name)
@@ -1386,8 +1373,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.TTLOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptInt(tcpip.TTLOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
@@ -1403,8 +1390,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.MulticastTTLOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
@@ -1429,23 +1416,19 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.MulticastLoopOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- if v {
- return int32(1), nil
- }
- return int32(0), nil
+ return boolToInt32(v), nil
case linux.IP_TOS:
// Length handling for parity with Linux.
if outLen == 0 {
return []byte(nil), nil
}
- var v tcpip.IPv4TOSOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if outLen < sizeOfInt32 {
@@ -1462,11 +1445,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- var o int32
- if v {
- o = 1
- }
- return o, nil
+ return boolToInt32(v), nil
case linux.IP_PKTINFO:
if outLen < sizeOfInt32 {
@@ -1477,11 +1456,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- var o int32
- if v {
- o = 1
- }
- return o, nil
+ return boolToInt32(v), nil
default:
emitUnimplementedEventIP(t, name)
@@ -1541,7 +1516,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
// SetSockOpt can be used to implement the linux syscall setsockopt(2) for
// sockets backed by a commonEndpoint.
-func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
+func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
switch level {
case linux.SOL_SOCKET:
return setSockOptSocket(t, s, ep, name, optVal)
@@ -1568,7 +1543,7 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n
}
// setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
-func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.SO_SNDBUF:
if len(optVal) < sizeOfInt32 {
@@ -1592,7 +1567,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0))
case linux.SO_REUSEPORT:
if len(optVal) < sizeOfInt32 {
@@ -1600,7 +1575,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0))
case linux.SO_BINDTODEVICE:
n := bytes.IndexByte(optVal, 0)
@@ -1628,7 +1603,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BroadcastOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.BroadcastOption, v != 0))
case linux.SO_PASSCRED:
if len(optVal) < sizeOfInt32 {
@@ -1636,7 +1611,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0))
case linux.SO_KEEPALIVE:
if len(optVal) < sizeOfInt32 {
@@ -1644,7 +1619,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveEnabledOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0))
case linux.SO_SNDTIMEO:
if len(optVal) < linux.SizeOfTimeval {
@@ -1716,11 +1691,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
v := usermem.ByteOrder.Uint32(optVal)
- var o int
- if v == 0 {
- o = 1
- }
- return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0))
case linux.TCP_CORK:
if len(optVal) < sizeOfInt32 {
@@ -1728,7 +1699,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0))
case linux.TCP_QUICKACK:
if len(optVal) < sizeOfInt32 {
@@ -1736,7 +1707,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0))
case linux.TCP_MAXSEG:
if len(optVal) < sizeOfInt32 {
@@ -1744,7 +1715,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
case linux.TCP_KEEPIDLE:
if len(optVal) < sizeOfInt32 {
@@ -1855,7 +1826,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
if v == -1 {
v = 0
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
case linux.IPV6_RECVTCLASS:
v, err := parseIntOrChar(optVal)
@@ -1940,7 +1911,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
if v < 0 || v > 255 {
return syserr.ErrInvalidArgument
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))
case linux.IP_ADD_MEMBERSHIP:
req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
@@ -1987,9 +1958,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
return err
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(
- tcpip.MulticastLoopOption(v != 0),
- ))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0))
case linux.MCAST_JOIN_GROUP:
// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
@@ -2008,7 +1977,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
} else if v < 1 || v > 255 {
return syserr.ErrInvalidArgument
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TTLOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v)))
case linux.IP_TOS:
if len(optVal) == 0 {
@@ -2018,7 +1987,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
if err != nil {
return err
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))
case linux.IP_RECVTOS:
v, err := parseIntOrChar(optVal)
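The getsockopt hunks above replace the per-option struct types (tcpip.PasscredOption, tcpip.ReuseAddressOption, and so on) with the GetSockOptBool/GetSockOptInt accessors and route the bool-to-int32 conversion through the new boolToInt32 helper; TCP_NODELAY remains inverted (boolToInt32(!v)) because the endpoint tracks DelayOption rather than "no delay". A minimal, self-contained sketch of that conversion pattern follows; fakeEndpoint and its GetSockOptBool signature are stand-ins for illustration only, not the real tcpip endpoint API.

package main

import "fmt"

// boolToInt32 mirrors the helper added above: Linux getsockopt reports
// boolean options as a 4-byte integer, 0 or 1.
func boolToInt32(v bool) int32 {
	if v {
		return 1
	}
	return 0
}

// fakeEndpoint stands in for a tcpip endpoint; the string-keyed
// GetSockOptBool here is an assumption for illustration only.
type fakeEndpoint struct{ passcred bool }

func (e *fakeEndpoint) GetSockOptBool(opt string) (bool, error) { return e.passcred, nil }

func main() {
	ep := &fakeEndpoint{passcred: true}
	v, err := ep.GetSockOptBool("PasscredOption")
	if err != nil {
		panic(err)
	}
	fmt.Println(boolToInt32(v)) // prints 1
}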
diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index eb090e79b..c3f04b613 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -62,10 +62,6 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
}
case linux.SOCK_RAW:
- // TODO(b/142504697): "In order to create a raw socket, a
- // process must have the CAP_NET_RAW capability in the user
- // namespace that governs its network namespace." - raw(7)
-
// Raw sockets require CAP_NET_RAW.
creds := auth.CredentialsFromContext(ctx)
if !creds.HasCapability(linux.CAP_NET_RAW) {
@@ -141,10 +137,6 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
}
func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
- // TODO(b/142504697): "In order to create a packet socket, a process
- // must have the CAP_NET_RAW capability in the user namespace that
- // governs its network namespace." - packet(7)
-
// Packet sockets require CAP_NET_RAW.
creds := auth.CredentialsFromContext(t)
if !creds.HasCapability(linux.CAP_NET_RAW) {
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 50d9744e6..6580bd6e9 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
@@ -48,11 +49,25 @@ func (c *ControlMessages) Release() {
c.Unix.Release()
}
-// Socket is the interface containing socket syscalls used by the syscall layer
-// to redirect them to the appropriate implementation.
+// Socket is an interface combining fs.FileOperations and SocketOps,
+// representing a VFS1 socket file.
type Socket interface {
fs.FileOperations
+ SocketOps
+}
+
+// SocketVFS2 is an interface combining vfs.FileDescription and SocketOps,
+// representing a VFS2 socket file.
+type SocketVFS2 interface {
+ vfs.FileDescriptionImpl
+ SocketOps
+}
+
+// SocketOps is the interface containing socket syscalls used by the syscall
+// layer to redirect them to the appropriate implementation.
+//
+// It is implemented by both Socket and SocketVFS2.
+type SocketOps interface {
// Connect implements the connect(2) linux syscall.
Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error
@@ -153,6 +168,8 @@ var families = make(map[int][]Provider)
// RegisterProvider registers the provider of a given address family so that
// sockets of that type can be created via socket() and/or socketpair()
// syscalls.
+//
+// This should only be called during the initialization of the address family.
func RegisterProvider(family int, provider Provider) {
families[family] = append(families[family], provider)
}
@@ -216,6 +233,74 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent {
return fs.NewDirent(ctx, inode, fmt.Sprintf("socket:[%d]", ino))
}
+// ProviderVFS2 is the vfs2 interface implemented by providers of sockets for
+// specific address families (e.g., AF_INET).
+type ProviderVFS2 interface {
+ // Socket creates a new socket.
+ //
+ // If a nil Socket _and_ a nil error is returned, it means that the
+ // protocol is not supported. A non-nil error should only be returned
+ // if the protocol is supported, but an error occurs during creation.
+ Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error)
+
+ // Pair creates a pair of connected sockets.
+ //
+ // See Socket for error information.
+ Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error)
+}
+
+// familiesVFS2 holds a map of all known address families and their providers.
+var familiesVFS2 = make(map[int][]ProviderVFS2)
+
+// RegisterProviderVFS2 registers the provider of a given address family so that
+// sockets of that type can be created via socket() and/or socketpair()
+// syscalls.
+//
+// This should only be called during the initialization of the address family.
+func RegisterProviderVFS2(family int, provider ProviderVFS2) {
+ familiesVFS2[family] = append(familiesVFS2[family], provider)
+}
+
+// NewVFS2 creates a new socket with the given family, type and protocol.
+func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+ for _, p := range familiesVFS2[family] {
+ s, err := p.Socket(t, stype, protocol)
+ if err != nil {
+ return nil, err
+ }
+ if s != nil {
+ t.Kernel().RecordSocketVFS2(s)
+ return s, nil
+ }
+ }
+
+ return nil, syserr.ErrAddressFamilyNotSupported
+}
+
+// PairVFS2 creates a new connected socket pair with the given family, type and
+// protocol.
+func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+ providers, ok := familiesVFS2[family]
+ if !ok {
+ return nil, nil, syserr.ErrAddressFamilyNotSupported
+ }
+
+ for _, p := range providers {
+ s1, s2, err := p.Pair(t, stype, protocol)
+ if err != nil {
+ return nil, nil, err
+ }
+ if s1 != nil && s2 != nil {
+ k := t.Kernel()
+ k.RecordSocketVFS2(s1)
+ k.RecordSocketVFS2(s2)
+ return s1, s2, nil
+ }
+ }
+
+ return nil, nil, syserr.ErrSocketNotSupported
+}
+
// SendReceiveTimeout stores timeouts for send and receive calls.
//
// It is meant to be embedded into Socket implementations to help satisfy the
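The socket.go change above splits the old Socket interface into a shared SocketOps surface plus thin VFS1 (Socket) and VFS2 (SocketVFS2) wrappers, and adds a parallel ProviderVFS2 registry. The payoff is that both file layers reuse one syscall implementation while satisfying their own file interfaces. A rough sketch of that composition pattern, using placeholder interfaces and a shared socketOpsCommon-style struct (all names here are illustrative stand-ins, not the real gVisor types):

package main

import "fmt"

// SocketOps, Socket and SocketVFS2 below are toy versions of the interfaces
// added above; the real ones carry the full socket syscall surface.
type SocketOps interface {
	Listen(backlog int) error
}

type fileOperations interface{ Read() }       // stands in for fs.FileOperations
type fileDescriptionImpl interface{ PRead() } // stands in for vfs.FileDescriptionImpl

type Socket interface {
	fileOperations
	SocketOps
}

type SocketVFS2 interface {
	fileDescriptionImpl
	SocketOps
}

// socketOpsCommon holds the shared implementation, as in the unix package.
type socketOpsCommon struct{}

func (socketOpsCommon) Listen(backlog int) error {
	fmt.Println("listen with backlog", backlog)
	return nil
}

type vfs1Socket struct{ socketOpsCommon }

func (vfs1Socket) Read() {}

type vfs2Socket struct{ socketOpsCommon }

func (vfs2Socket) PRead() {}

func main() {
	// Both wrappers satisfy their file interface and the shared ops.
	var _ Socket = vfs1Socket{}
	var _ SocketVFS2 = vfs2Socket{}
	_ = vfs2Socket{}.Listen(8)
}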
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 08743deba..de2cc4bdf 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -8,23 +8,27 @@ go_library(
"device.go",
"io.go",
"unix.go",
+ "unix_vfs2.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/fspath",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/time",
"//pkg/sentry/socket",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/netstack",
"//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/vfs",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 74bcd6300..c708b6030 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -30,6 +30,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/ilist",
+ "//pkg/log",
"//pkg/refs",
"//pkg/sync",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 2ef654235..1f3880cc5 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -838,24 +839,45 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
// SetSockOpt sets a socket option. Currently not supported.
func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
- switch v := opt.(type) {
- case tcpip.PasscredOption:
- e.setPasscred(v != 0)
- return nil
- }
return nil
}
func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+ switch opt {
+ case tcpip.BroadcastOption:
+ case tcpip.PasscredOption:
+ e.setPasscred(v)
+ case tcpip.ReuseAddressOption:
+ default:
+ log.Warningf("Unsupported socket option: %d", opt)
+ return tcpip.ErrUnknownProtocolOption
+ }
return nil
}
func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+ switch opt {
+ case tcpip.SendBufferSizeOption:
+ case tcpip.ReceiveBufferSizeOption:
+ default:
+ log.Warningf("Unsupported socket option: %d", opt)
+ return tcpip.ErrUnknownProtocolOption
+ }
return nil
}
func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
- return false, tcpip.ErrUnknownProtocolOption
+ switch opt {
+ case tcpip.KeepaliveEnabledOption:
+ return false, nil
+
+ case tcpip.PasscredOption:
+ return e.Passcred(), nil
+
+ default:
+ log.Warningf("Unsupported socket option: %d", opt)
+ return false, tcpip.ErrUnknownProtocolOption
+ }
}
func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
@@ -914,29 +936,19 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
return int(v), nil
default:
+ log.Warningf("Unsupported socket option: %d", opt)
return -1, tcpip.ErrUnknownProtocolOption
}
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
- switch o := opt.(type) {
+ switch opt.(type) {
case tcpip.ErrorOption:
return nil
- case *tcpip.PasscredOption:
- if e.Passcred() {
- *o = tcpip.PasscredOption(1)
- } else {
- *o = tcpip.PasscredOption(0)
- }
- return nil
-
- case *tcpip.KeepaliveEnabledOption:
- *o = 0
- return nil
-
default:
+ log.Warningf("Unsupported socket option: %T", opt)
return tcpip.ErrUnknownProtocolOption
}
}
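In the transport endpoint above, boolean options now go through SetSockOptBool/GetSockOptBool: recognized options with no endpoint state are accepted as no-ops (empty case bodies), Passcred updates state, and unknown options are logged and rejected. A small sketch of that switch idiom under assumed option constants (the real code dispatches on tcpip.SockOptBool values):

package main

import (
	"errors"
	"log"
)

var errUnknownOption = errors.New("unknown protocol option")

type sockOptBool int

const (
	broadcastOption sockOptBool = iota
	passcredOption
	reuseAddressOption
)

// setSockOptBool mirrors the structure of baseEndpoint.SetSockOptBool above:
// recognized but stateless options fall through as no-ops, Passcred updates
// state, and anything else is logged and rejected.
func setSockOptBool(setPasscred func(bool), opt sockOptBool, v bool) error {
	switch opt {
	case broadcastOption:
	case passcredOption:
		setPasscred(v)
	case reuseAddressOption:
	default:
		log.Printf("Unsupported socket option: %d", opt)
		return errUnknownOption
	}
	return nil
}

func main() {
	passcred := false
	_ = setSockOptBool(func(v bool) { passcred = v }, passcredOption, true)
	log.Println("passcred:", passcred)
}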
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 4d30aa714..7c64f30fa 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -33,6 +34,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/control"
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -52,11 +54,8 @@ type SocketOperations struct {
fsutil.FileNoSplice `state:"nosave"`
fsutil.FileNoopFlush `state:"nosave"`
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
- refs.AtomicRefCount
- socket.SendReceiveTimeout
- ep transport.Endpoint
- stype linux.SockType
+ socketOpsCommon
}
// New creates a new unix socket.
@@ -75,16 +74,29 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
}
s := SocketOperations{
- ep: ep,
- stype: stype,
+ socketOpsCommon: socketOpsCommon{
+ ep: ep,
+ stype: stype,
+ },
}
s.EnableLeakCheck("unix.SocketOperations")
return fs.NewFile(ctx, d, flags, &s)
}
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
+ refs.AtomicRefCount
+ socket.SendReceiveTimeout
+
+ ep transport.Endpoint
+ stype linux.SockType
+}
+
// DecRef implements RefCounter.DecRef.
-func (s *SocketOperations) DecRef() {
+func (s *socketOpsCommon) DecRef() {
s.DecRefWithDestructor(func() {
s.ep.Close()
})
@@ -97,7 +109,7 @@ func (s *SocketOperations) Release() {
s.DecRef()
}
-func (s *SocketOperations) isPacket() bool {
+func (s *socketOpsCommon) isPacket() bool {
switch s.stype {
case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
return true
@@ -110,7 +122,7 @@ func (s *SocketOperations) isPacket() bool {
}
// Endpoint extracts the transport.Endpoint.
-func (s *SocketOperations) Endpoint() transport.Endpoint {
+func (s *socketOpsCommon) Endpoint() transport.Endpoint {
return s.ep
}
@@ -143,7 +155,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) {
// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.ep.GetRemoteAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
@@ -155,7 +167,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32,
// GetSockName implements the linux syscall getsockname(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.ep.GetLocalAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
@@ -178,7 +190,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
// Listen implements the linux syscall listen(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
return s.ep.Listen(backlog)
}
@@ -310,6 +322,8 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
}
// Create the socket.
+ //
+ // TODO(gvisor.dev/issue/2324): Correctly set file permissions.
childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}})
if err != nil {
return syserr.ErrPortInUse
@@ -345,6 +359,31 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
return ep, nil
}
+ if kernel.VFS2Enabled {
+ p := fspath.Parse(path)
+ root := t.FSContext().RootDirectoryVFS2()
+ start := root
+ relPath := !p.Absolute
+ if relPath {
+ start = t.FSContext().WorkingDirectoryVFS2()
+ }
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: p,
+ FollowFinalSymlink: true,
+ }
+ ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop)
+ root.DecRef()
+ if relPath {
+ start.DecRef()
+ }
+ if e != nil {
+ return nil, syserr.FromError(e)
+ }
+ return ep, nil
+ }
+
// Find the node in the filesystem.
root := t.FSContext().RootDirectory()
cwd := t.FSContext().WorkingDirectory()
@@ -363,12 +402,11 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
// No socket!
return nil, syserr.ErrConnectionRefused
}
-
return ep, nil
}
// Connect implements the linux syscall connect(2) for unix sockets.
-func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
ep, err := extractEndpoint(t, sockaddr)
if err != nil {
return err
@@ -379,7 +417,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
return s.ep.Connect(t, ep)
}
-// Writev implements fs.FileOperations.Write.
+// Write implements fs.FileOperations.Write.
func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
t := kernel.TaskFromContext(ctx)
ctrl := control.New(t, s.ep, nil)
@@ -399,7 +437,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
// SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
w := EndpointWriter{
Ctx: t,
Endpoint: s.ep,
@@ -453,27 +491,27 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
}
// Passcred implements transport.Credentialer.Passcred.
-func (s *SocketOperations) Passcred() bool {
+func (s *socketOpsCommon) Passcred() bool {
return s.ep.Passcred()
}
// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
-func (s *SocketOperations) ConnectedPasscred() bool {
+func (s *socketOpsCommon) ConnectedPasscred() bool {
return s.ep.ConnectedPasscred()
}
// Readiness implements waiter.Waitable.Readiness.
-func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
return s.ep.Readiness(mask)
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
s.ep.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (s *SocketOperations) EventUnregister(e *waiter.Entry) {
+func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
s.ep.EventUnregister(e)
}
@@ -485,7 +523,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
// Shutdown implements the linux syscall shutdown(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
f, err := netstack.ConvertShutdown(how)
if err != nil {
return err
@@ -511,7 +549,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
trunc := flags&linux.MSG_TRUNC != 0
peek := flags&linux.MSG_PEEK != 0
dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -648,12 +686,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
}
// State implements socket.Socket.State.
-func (s *SocketOperations) State() uint32 {
+func (s *socketOpsCommon) State() uint32 {
return s.ep.State()
}
// Type implements socket.Socket.Type.
-func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
// Unix domain sockets always have a protocol of 0.
return linux.AF_UNIX, s.stype, 0
}
@@ -706,4 +744,5 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F
func init() {
socket.RegisterProvider(linux.AF_UNIX, &provider{})
+ socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{})
}
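The unix.go refactor moves the reference count and endpoint into socketOpsCommon so both SocketOperations and the new SocketVFS2 can embed it; DecRef closes the endpoint only when the last reference is dropped. A toy sketch of that DecRefWithDestructor pattern, with a hand-rolled counter standing in for refs.AtomicRefCount (the semantics shown are those of this sketch, not a copy of the real counter):

package main

import (
	"fmt"
	"sync/atomic"
)

// refCount is a minimal stand-in for refs.AtomicRefCount: the destructor
// runs exactly once, when the last reference is dropped.
type refCount struct{ n int64 }

func (r *refCount) IncRef() { atomic.AddInt64(&r.n, 1) }

func (r *refCount) DecRefWithDestructor(destroy func()) {
	if atomic.AddInt64(&r.n, -1) < 0 {
		destroy()
	}
}

type endpoint struct{}

func (endpoint) Close() { fmt.Println("endpoint closed") }

// socketOpsCommon embeds the counter and endpoint, as in the diff above.
type socketOpsCommon struct {
	refCount
	ep endpoint
}

func (s *socketOpsCommon) DecRef() {
	s.DecRefWithDestructor(func() { s.ep.Close() })
}

func main() {
	s := &socketOpsCommon{}
	s.IncRef()
	s.DecRef() // still referenced elsewhere
	s.DecRef() // last reference: closes the endpoint
}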
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
new file mode 100644
index 000000000..3e54d49c4
--- /dev/null
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -0,0 +1,348 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package unix
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/socket/control"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SocketVFS2 implements socket.SocketVFS2 (and by extension,
+// vfs.FileDescriptionImpl) for Unix sockets.
+type SocketVFS2 struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+
+ socketOpsCommon
+}
+
+// NewVFS2File creates and returns a new vfs.FileDescription for a unix socket.
+func NewVFS2File(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
+ sock := NewFDImpl(ep, stype)
+ vfsfd := &sock.vfsfd
+ if err := sockfs.InitSocket(sock, vfsfd, t.Kernel().SocketMount(), t.Credentials()); err != nil {
+ return nil, syserr.FromError(err)
+ }
+ return vfsfd, nil
+}
+
+// NewFDImpl creates and returns a new SocketVFS2.
+func NewFDImpl(ep transport.Endpoint, stype linux.SockType) *SocketVFS2 {
+ // You can create AF_UNIX, SOCK_RAW sockets. They're the same as
+ // SOCK_DGRAM and don't require CAP_NET_RAW.
+ if stype == linux.SOCK_RAW {
+ stype = linux.SOCK_DGRAM
+ }
+
+ return &SocketVFS2{
+ socketOpsCommon: socketOpsCommon{
+ ep: ep,
+ stype: stype,
+ },
+ }
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+ return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+}
+
+// blockingAccept implements a blocking version of accept(2), that is, if no
+// connections are ready to be accepted, it will block until one becomes ready.
+func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+ // Register for notifications.
+ e, ch := waiter.NewChannelEntry(nil)
+ s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
+ defer s.socketOpsCommon.EventUnregister(&e)
+
+ // Try to accept the connection; if it fails, then wait until we get a
+ // notification.
+ for {
+ if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+ return ep, err
+ }
+
+ if err := t.Block(ch); err != nil {
+ return nil, syserr.FromError(err)
+ }
+ }
+}
+
+// Accept implements the linux syscall accept(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+ // Issue the accept request to get the new endpoint.
+ ep, err := s.ep.Accept()
+ if err != nil {
+ if err != syserr.ErrWouldBlock || !blocking {
+ return 0, nil, 0, err
+ }
+
+ var err *syserr.Error
+ ep, err = s.blockingAccept(t)
+ if err != nil {
+ return 0, nil, 0, err
+ }
+ }
+
+ // We expect this to be a FileDescription here.
+ ns, err := NewVFS2File(t, ep, s.stype)
+ if err != nil {
+ return 0, nil, 0, err
+ }
+ defer ns.DecRef()
+
+ if flags&linux.SOCK_NONBLOCK != 0 {
+ ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK)
+ }
+
+ var addr linux.SockAddr
+ var addrLen uint32
+ if peerRequested {
+ // Get address of the peer.
+ var err *syserr.Error
+ addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
+ if err != nil {
+ return 0, nil, 0, err
+ }
+ }
+
+ fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
+ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
+ })
+ if e != nil {
+ return 0, nil, 0, syserr.FromError(e)
+ }
+
+ t.Kernel().RecordSocketVFS2(ns)
+ return fd, addr, addrLen, nil
+}
+
+// Bind implements the linux syscall bind(2) for unix sockets.
+func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+ p, e := extractPath(sockaddr)
+ if e != nil {
+ return e
+ }
+
+ bep, ok := s.ep.(transport.BoundEndpoint)
+ if !ok {
+ // This socket can't be bound.
+ return syserr.ErrInvalidArgument
+ }
+
+ return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error {
+ // Is it abstract?
+ if p[0] == 0 {
+ if t.IsNetworkNamespaced() {
+ return syserr.ErrInvalidEndpointState
+ }
+ if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+ // syserr.ErrPortInUse corresponds to EADDRINUSE.
+ return syserr.ErrPortInUse
+ }
+ } else {
+ path := fspath.Parse(p)
+ root := t.FSContext().RootDirectoryVFS2()
+ defer root.DecRef()
+ start := root
+ relPath := !path.Absolute
+ if relPath {
+ start = t.FSContext().WorkingDirectoryVFS2()
+ defer start.DecRef()
+ }
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: start,
+ Path: path,
+ }
+ err := t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{
+ // TODO(gvisor.dev/issue/2324): The file permissions should be taken
+ // from s and t.FSContext().Umask() (see net/unix/af_unix.c:unix_bind),
+ // but VFS1 just always uses 0400. Resolve this inconsistency.
+ Mode: linux.S_IFSOCK | 0400,
+ Endpoint: bep,
+ })
+ if err == syserror.EEXIST {
+ return syserr.ErrAddressInUse
+ }
+ return syserr.FromError(err)
+ }
+
+ return nil
+ })
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ return netstack.Ioctl(ctx, s.ep, uio, args)
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Read implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // All flags other than RWF_NOWAIT should be ignored.
+ // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT.
+ if opts.Flags != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+ return dst.CopyOutFrom(ctx, &EndpointReader{
+ Ctx: ctx,
+ Endpoint: s.ep,
+ NumRights: 0,
+ Peek: false,
+ From: nil,
+ })
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ // All flags other than RWF_NOWAIT should be ignored.
+ // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT.
+ if opts.Flags != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ t := kernel.TaskFromContext(ctx)
+ ctrl := control.New(t, s.ep, nil)
+
+ if src.NumBytes() == 0 {
+ nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil)
+ return int64(nInt), err.ToError()
+ }
+
+ return src.CopyInTo(ctx, &EndpointWriter{
+ Ctx: ctx,
+ Endpoint: s.ep,
+ Control: ctrl,
+ To: nil,
+ })
+}
+
+// Release implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Release() {
+ // Release only decrements a reference on s because s may be referenced in
+ // the abstract socket namespace.
+ s.DecRef()
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return s.socketOpsCommon.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ s.socketOpsCommon.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
+ s.socketOpsCommon.EventUnregister(e)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
+ return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
+}
+
+// providerVFS2 is a unix domain socket provider for VFS2.
+type providerVFS2 struct{}
+
+func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+ // Check arguments.
+ if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
+ return nil, syserr.ErrProtocolNotSupported
+ }
+
+ // Create the endpoint and socket.
+ var ep transport.Endpoint
+ switch stype {
+ case linux.SOCK_DGRAM, linux.SOCK_RAW:
+ ep = transport.NewConnectionless(t)
+ case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
+ ep = transport.NewConnectioned(t, stype, t.Kernel())
+ default:
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ f, err := NewVFS2File(t, ep, stype)
+ if err != nil {
+ ep.Close()
+ return nil, err
+ }
+ return f, nil
+}
+
+// Pair creates a new pair of AF_UNIX connected sockets.
+func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+ // Check arguments.
+ if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
+ return nil, nil, syserr.ErrProtocolNotSupported
+ }
+
+ switch stype {
+ case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW:
+ // Ok
+ default:
+ return nil, nil, syserr.ErrInvalidArgument
+ }
+
+ // Create the endpoints and sockets.
+ ep1, ep2 := transport.NewPair(t, stype, t.Kernel())
+ s1, err := NewVFS2File(t, ep1, stype)
+ if err != nil {
+ ep1.Close()
+ ep2.Close()
+ return nil, nil, err
+ }
+ s2, err := NewVFS2File(t, ep2, stype)
+ if err != nil {
+ s1.DecRef()
+ ep2.Close()
+ return nil, nil, err
+ }
+
+ return s1, s2, nil
+}
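unix_vfs2.go wires unix sockets into VFS2: each socket gets a vfs.FileDescription via sockfs, and providerVFS2.Pair has to unwind partially constructed state when the second file cannot be created. A self-contained sketch of that cleanup ordering, with toy endpoint/file types standing in for transport.Endpoint and vfs.FileDescription:

package main

import (
	"errors"
	"fmt"
)

type endpoint struct{ name string }

func (e *endpoint) Close() { fmt.Println("closed endpoint", e.name) }

type file struct{ name string }

func (f *file) DecRef() { fmt.Println("released file", f.name) }

func newFile(ep *endpoint, fail bool) (*file, error) {
	if fail {
		return nil, errors.New("file creation failed")
	}
	return &file{name: ep.name}, nil
}

// pair mirrors the shape of providerVFS2.Pair: if wrapping the second
// endpoint fails, the first file is released and the second endpoint is
// closed directly, so nothing is leaked.
func pair(failSecond bool) (*file, *file, error) {
	ep1, ep2 := &endpoint{"ep1"}, &endpoint{"ep2"}
	s1, err := newFile(ep1, false)
	if err != nil {
		ep1.Close()
		ep2.Close()
		return nil, nil, err
	}
	s2, err := newFile(ep2, failSecond)
	if err != nil {
		s1.DecRef()
		ep2.Close()
		return nil, nil, err
	}
	return s1, s2, nil
}

func main() {
	if _, _, err := pair(true); err != nil {
		fmt.Println("pair failed:", err)
	}
}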
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 77655558e..b94c4fbf5 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -778,9 +778,6 @@ func (s SyscallMap) Name(sysno uintptr) string {
//
// N.B. This is not in an init function because we can't be sure all syscall
// tables are registered with the kernel when init runs.
-//
-// TODO(gvisor.dev/issue/155): remove kernel package dependencies from this
-// package and have the kernel package self-initialize all syscall tables.
func Initialize() {
for _, table := range kernel.SyscallTables() {
// Is this known?
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index b401978db..38cbeba5a 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -114,14 +114,28 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
id := args[0].Uint64()
- // Destroy the given context.
- if !t.MemoryManager().DestroyAIOContext(t, id) {
+ ctx := t.MemoryManager().DestroyAIOContext(t, id)
+ if ctx == nil {
// Does not exist.
return 0, nil, syserror.EINVAL
}
- // FIXME(fvoznika): Linux blocks until all AIO to the destroyed context is
- // done.
- return 0, nil, nil
+
+ // Drain completed requests and wait for pending requests until there are no
+ // more.
+ for {
+ ctx.Drain()
+
+ ch := ctx.WaitChannel()
+ if ch == nil {
+ // No more requests, we're done.
+ return 0, nil, nil
+ }
+ // The task cannot be interrupted during the wait. Equivalent to
+ // TASK_UNINTERRUPTIBLE in Linux.
+ t.UninterruptibleSleepStart(true /* deactivate */)
+ <-ch
+ t.UninterruptibleSleepFinish(true /* activate */)
+ }
}
// IoGetevents implements linux syscall io_getevents(2).
@@ -200,13 +214,13 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
for {
if v, ok := ctx.PopRequest(); ok {
- // Request was readly available. Just return it.
+ // Request was readily available. Just return it.
return v, nil
}
// Need to wait for request completion.
- done, active := ctx.WaitChannel()
- if !active {
+ done := ctx.WaitChannel()
+ if done == nil {
// Context has been destroyed.
return nil, syserror.EINVAL
}
@@ -248,6 +262,10 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
}
func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
+ if ctx.Dead() {
+ ctx.CancelPendingRequest()
+ return
+ }
ev := &ioEvent{
Data: cb.Data,
Obj: uint64(cbAddr),
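IoDestroy now loops: drain completed requests, then block uninterruptibly on the context's wait channel until WaitChannel reports nothing outstanding. A runnable sketch of that drain-and-wait loop, where aioContext is a toy stand-in for mm.AIOContext and the channel behavior is simplified for illustration:

package main

import "fmt"

// aioContext stands in for mm.AIOContext: WaitChannel returns nil once no
// requests remain, otherwise a channel that is closed when the pending
// count changes.
type aioContext struct{ pending int }

// Drain is a no-op in this sketch; the real Drain discards completed events.
func (ctx *aioContext) Drain() {}

func (ctx *aioContext) WaitChannel() chan struct{} {
	if ctx.pending == 0 {
		return nil
	}
	ch := make(chan struct{})
	go func() {
		ctx.pending-- // a request completes asynchronously
		close(ch)
	}()
	return ch
}

func main() {
	ctx := &aioContext{pending: 2}
	for {
		ctx.Drain()
		ch := ctx.WaitChannel()
		if ch == nil {
			fmt.Println("all requests done")
			return
		}
		<-ch // stands in for the uninterruptible sleep in IoDestroy
	}
}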
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 9c6728530..f92bf8096 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -161,8 +161,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
return 0, nil, syserror.EINVAL
}
- // no_new_privs is assumed to always be set. See
- // kernel.Task.updateCredsForExec.
+ // PR_SET_NO_NEW_PRIVS is assumed to always be set.
+ // See kernel.Task.updateCredsForExecLocked.
return 0, nil, nil
case linux.PR_GET_NO_NEW_PRIVS:
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 2919228d0..61b2576ac 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -31,6 +31,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// LINT.IfChange
+
// minListenBacklog is the minimum reasonable backlog for listening sockets.
const minListenBacklog = 8
@@ -244,7 +246,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// Copy the file descriptors out.
if _, err := t.CopyOut(socks, fds); err != nil {
- // Note that we don't close files here; see pipe(2) also.
+ for _, fd := range fds {
+ _, file := t.FDTable().Remove(fd)
+ file.DecRef()
+ }
return 0, nil, err
}
@@ -1128,3 +1133,5 @@ func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
return n, nil, err
}
+
+// LINT.ThenChange(./vfs2/socket.go)
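The SocketPair fix above closes both socket files if copying the descriptor pair back to userspace fails, instead of leaking them into the FD table. A toy sketch of that error path; fdTable and file are illustrative stand-ins for the kernel's FD table and file types (the real Remove returns both VFS1 and VFS2 files):

package main

import "fmt"

type file struct{ name string }

func (f *file) DecRef() { fmt.Println("released", f.name) }

type fdTable struct{ files map[int32]*file }

func (t *fdTable) Remove(fd int32) *file {
	f := t.files[fd]
	delete(t.files, fd)
	return f
}

// copyOutFDs mirrors the new SocketPair behavior: if copying the descriptor
// pair to user memory fails, both descriptors are removed from the table and
// their files released rather than being leaked into the process.
func copyOutFDs(t *fdTable, fds []int32, copyOut func() error) error {
	if err := copyOut(); err != nil {
		for _, fd := range fds {
			if f := t.Remove(fd); f != nil {
				f.DecRef()
			}
		}
		return err
	}
	return nil
}

func main() {
	t := &fdTable{files: map[int32]*file{3: {"sock1"}, 4: {"sock2"}}}
	_ = copyOutFDs(t, []int32{3, 4}, func() error { return fmt.Errorf("efault") })
}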
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index fd642834b..fbc6cf15f 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -29,6 +29,10 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
return 0, syserror.EINVAL
}
+ if opts.Length > int64(kernel.MAX_RW_COUNT) {
+ opts.Length = int64(kernel.MAX_RW_COUNT)
+ }
+
var (
total int64
n int64
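doSplice now caps a single transfer at kernel.MAX_RW_COUNT, matching Linux's per-call read/write limit. A minimal sketch of the clamp; the constant below (INT_MAX rounded down to a page boundary) is the conventional Linux value and is used here only for illustration:

package main

import "fmt"

// maxRWCount stands in for kernel.MAX_RW_COUNT.
const maxRWCount = int64(0x7ffff000)

// clampSpliceLength mirrors the new check in doSplice: a single splice call
// never transfers more than MAX_RW_COUNT bytes.
func clampSpliceLength(length int64) int64 {
	if length > maxRWCount {
		return maxRWCount
	}
	return length
}

func main() {
	fmt.Println(clampSpliceLength(1 << 40)) // clamped to maxRWCount
	fmt.Println(clampSpliceLength(4096))    // unchanged
}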
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index a11a87cd1..46ebf27a2 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -115,7 +115,8 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
return err
}
s := statFromAttrs(t, d.Inode.StableAttr, uattr)
- return s.CopyOut(t, statAddr)
+ _, err = s.CopyOut(t, statAddr)
+ return err
}
// fstat implements fstat for the given *fs.File.
@@ -125,7 +126,8 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
return err
}
s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
- return s.CopyOut(t, statAddr)
+ _, err = s.CopyOut(t, statAddr)
+ return err
}
// Statx implements linux syscall statx(2).
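These stat hunks (and the epoll/poll/setstat hunks below) track a signature change in the marshalling helpers: CopyOut and CopyIn now return the number of bytes copied along with the error, so call sites that only care about failure discard the count. A trivial sketch of the updated call-site shape; copyOut here is a placeholder, not the real marshal API:

package main

import "fmt"

// copyOut stands in for a marshalling helper whose signature changed from
// returning error to returning (bytes copied, error).
func copyOut(buf []byte) (int, error) { return len(buf), nil }

// writeStat discards the byte count, matching the updated hunks above.
func writeStat(buf []byte) error {
	_, err := copyOut(buf)
	return err
}

func main() { fmt.Println(writeStat(make([]byte, 144))) }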
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 2eb210014..b32abfe59 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -21,16 +21,19 @@ go_library(
"poll.go",
"read_write.go",
"setstat.go",
+ "socket.go",
"stat.go",
"stat_amd64.go",
"stat_arm64.go",
"sync.go",
+ "sys_timerfd.go",
"xattr.go",
],
marshal = True,
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/binary",
"//pkg/bits",
"//pkg/fspath",
"//pkg/gohacks",
@@ -42,10 +45,14 @@ go_library(
"//pkg/sentry/limits",
"//pkg/sentry/loader",
"//pkg/sentry/memmap",
+ "//pkg/sentry/socket",
+ "//pkg/sentry/socket/control",
+ "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/syscalls",
"//pkg/sentry/syscalls/linux",
"//pkg/sentry/vfs",
"//pkg/sync",
+ "//pkg/syserr",
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index d6cb0e79a..5a938cee2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -101,14 +101,14 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
var event linux.EpollEvent
switch op {
case linux.EPOLL_CTL_ADD:
- if err := event.CopyIn(t, eventAddr); err != nil {
+ if _, err := event.CopyIn(t, eventAddr); err != nil {
return 0, nil, err
}
return 0, nil, ep.AddInterest(file, fd, event)
case linux.EPOLL_CTL_DEL:
return 0, nil, ep.DeleteInterest(file, fd)
case linux.EPOLL_CTL_MOD:
- if err := event.CopyIn(t, eventAddr); err != nil {
+ if _, err := event.CopyIn(t, eventAddr); err != nil {
return 0, nil, err
}
return 0, nil, ep.ModifyInterest(file, fd, event)
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index a61cc5059..62e98817d 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -97,6 +97,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
// char d_name[]; /* Filename (null-terminated) */
// };
size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+ size = (size + 7) &^ 7 // round up to multiple of 8
if size > cb.remaining {
return syserror.EINVAL
}
@@ -106,7 +107,12 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
buf[18] = dirent.Type
copy(buf[19:], dirent.Name)
- buf[size-1] = 0 // NUL terminator
+ // Zero out all remaining bytes in buf, including the NUL terminator
+ // after dirent.Name.
+ bufTail := buf[19+len(dirent.Name):]
+ for i := range bufTail {
+ bufTail[i] = 0
+ }
} else {
// struct linux_dirent {
// unsigned long d_ino; /* Inode number */
@@ -125,6 +131,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
}
size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
+ size = (size + 7) &^ 7 // round up to multiple of sizeof(long)
if size > cb.remaining {
return syserror.EINVAL
}
@@ -133,9 +140,14 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
copy(buf[18:], dirent.Name)
- buf[size-3] = 0 // NUL terminator
- buf[size-2] = 0 // zero padding byte
- buf[size-1] = dirent.Type
+ // Zero out all remaining bytes in buf, including the NUL terminator
+ // after dirent.Name and the zero padding byte between the name and
+ // dirent type.
+ bufTail := buf[18+len(dirent.Name):]
+ for i := range bufTail {
+ bufTail[i] = 0
+ }
+ bufTail[2] = dirent.Type
}
n, err := cb.t.CopyOutBytes(cb.addr, buf)
if err != nil {
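The getdents64 path above rounds each record length up to a multiple of 8 and zero-fills the tail after the name, which covers both the NUL terminator and the alignment padding. A self-contained sketch of that record layout; little-endian byte order is assumed here, while the real code uses usermem.ByteOrder:

package main

import (
	"encoding/binary"
	"fmt"
)

// buildDirent64 mirrors the struct linux_dirent64 layout written above:
// ino(8) + off(8) + reclen(2) + type(1) + name + NUL, with reclen rounded up
// to a multiple of 8 and the tail zero-filled.
func buildDirent64(ino, off uint64, typ byte, name string) []byte {
	size := 8 + 8 + 2 + 1 + 1 + len(name)
	size = (size + 7) &^ 7 // round up to a multiple of 8
	buf := make([]byte, size)
	binary.LittleEndian.PutUint64(buf[0:8], ino)
	binary.LittleEndian.PutUint64(buf[8:16], off)
	binary.LittleEndian.PutUint16(buf[16:18], uint16(size))
	buf[18] = typ
	copy(buf[19:], name)
	// make already zeroed the remaining bytes, which covers the NUL
	// terminator and any padding after the name.
	return buf
}

func main() {
	rec := buildDirent64(42, 1, 4 /* DT_DIR */, "subdir")
	fmt.Println(len(rec), rec[16:18]) // record length is 8-byte aligned
}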
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index 7d220bc20..645e0bcb8 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -44,21 +44,22 @@ func Override(table map[uintptr]kernel.Syscall) {
table[32] = syscalls.Supported("dup", Dup)
table[33] = syscalls.Supported("dup2", Dup2)
delete(table, 40) // sendfile
- delete(table, 41) // socket
- delete(table, 42) // connect
- delete(table, 43) // accept
- delete(table, 44) // sendto
- delete(table, 45) // recvfrom
- delete(table, 46) // sendmsg
- delete(table, 47) // recvmsg
- delete(table, 48) // shutdown
- delete(table, 49) // bind
- delete(table, 50) // listen
- delete(table, 51) // getsockname
- delete(table, 52) // getpeername
- delete(table, 53) // socketpair
- delete(table, 54) // setsockopt
- delete(table, 55) // getsockopt
+ // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+ table[41] = syscalls.PartiallySupported("socket", Socket, "In process of porting socket syscalls to VFS2.", nil)
+ table[42] = syscalls.PartiallySupported("connect", Connect, "In process of porting socket syscalls to VFS2.", nil)
+ table[43] = syscalls.PartiallySupported("accept", Accept, "In process of porting socket syscalls to VFS2.", nil)
+ table[44] = syscalls.PartiallySupported("sendto", SendTo, "In process of porting socket syscalls to VFS2.", nil)
+ table[45] = syscalls.PartiallySupported("recvfrom", RecvFrom, "In process of porting socket syscalls to VFS2.", nil)
+ table[46] = syscalls.PartiallySupported("sendmsg", SendMsg, "In process of porting socket syscalls to VFS2.", nil)
+ table[47] = syscalls.PartiallySupported("recvmsg", RecvMsg, "In process of porting socket syscalls to VFS2.", nil)
+ table[48] = syscalls.PartiallySupported("shutdown", Shutdown, "In process of porting socket syscalls to VFS2.", nil)
+ table[49] = syscalls.PartiallySupported("bind", Bind, "In process of porting socket syscalls to VFS2.", nil)
+ table[50] = syscalls.PartiallySupported("listen", Listen, "In process of porting socket syscalls to VFS2.", nil)
+ table[51] = syscalls.PartiallySupported("getsockname", GetSockName, "In process of porting socket syscalls to VFS2.", nil)
+ table[52] = syscalls.PartiallySupported("getpeername", GetPeerName, "In process of porting socket syscalls to VFS2.", nil)
+ table[53] = syscalls.PartiallySupported("socketpair", SocketPair, "In process of porting socket syscalls to VFS2.", nil)
+ table[54] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
+ table[55] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
table[59] = syscalls.Supported("execve", Execve)
table[72] = syscalls.Supported("fcntl", Fcntl)
delete(table, 73) // flock
@@ -139,12 +140,13 @@ func Override(table map[uintptr]kernel.Syscall) {
table[280] = syscalls.Supported("utimensat", Utimensat)
table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
delete(table, 282) // signalfd
- delete(table, 283) // timerfd_create
+ table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
delete(table, 284) // eventfd
delete(table, 285) // fallocate
- delete(table, 286) // timerfd_settime
- delete(table, 287) // timerfd_gettime
- delete(table, 288) // accept4
+ table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+ table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+ // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+ table[288] = syscalls.PartiallySupported("accept4", Accept4, "In process of porting socket syscalls to VFS2.", nil)
delete(table, 289) // signalfd4
delete(table, 290) // eventfd2
table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
@@ -153,9 +155,11 @@ func Override(table map[uintptr]kernel.Syscall) {
delete(table, 294) // inotify_init1
table[295] = syscalls.Supported("preadv", Preadv)
table[296] = syscalls.Supported("pwritev", Pwritev)
- delete(table, 299) // recvmmsg
+ // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+ table[299] = syscalls.PartiallySupported("recvmmsg", RecvMMsg, "In process of porting socket syscalls to VFS2.", nil)
table[306] = syscalls.Supported("syncfs", Syncfs)
- delete(table, 307) // sendmmsg
+ // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+ table[307] = syscalls.PartiallySupported("sendmmsg", SendMMsg, "In process of porting socket syscalls to VFS2.", nil)
table[316] = syscalls.Supported("renameat2", Renameat2)
delete(table, 319) // memfd_create
table[322] = syscalls.Supported("execveat", Execveat)
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index dbf4882da..ff1b25d7b 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -374,7 +374,8 @@ func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.D
}
remaining := timeoutRemaining(t, startNs, timeout)
tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
- return tsRemaining.CopyOut(t, timespecAddr)
+ _, err := tsRemaining.CopyOut(t, timespecAddr)
+ return err
}
// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
@@ -386,7 +387,8 @@ func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Du
}
remaining := timeoutRemaining(t, startNs, timeout)
tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
- return tvRemaining.CopyOut(t, timevalAddr)
+ _, err := tvRemaining.CopyOut(t, timevalAddr)
+ return err
}
// pollRestartBlock encapsulates the state required to restart poll(2) via
@@ -477,7 +479,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
timeout := time.Duration(-1)
if timevalAddr != 0 {
var timeval linux.Timeval
- if err := timeval.CopyIn(t, timevalAddr); err != nil {
+ if _, err := timeval.CopyIn(t, timevalAddr); err != nil {
return 0, nil, err
}
if timeval.Sec < 0 || timeval.Usec < 0 {
@@ -519,7 +521,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
}
var maskStruct sigSetWithSize
- if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+ if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
return 0, nil, err
}
if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
@@ -554,7 +556,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D
timeout := time.Duration(-1)
if timespecAddr != 0 {
var timespec linux.Timespec
- if err := timespec.CopyIn(t, timespecAddr); err != nil {
+ if _, err := timespec.CopyIn(t, timespecAddr); err != nil {
return 0, err
}
if !timespec.Valid() {
@@ -573,7 +575,7 @@ func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) erro
return syserror.EINVAL
}
var mask linux.SignalSet
- if err := mask.CopyIn(t, maskAddr); err != nil {
+ if _, err := mask.CopyIn(t, maskAddr); err != nil {
return err
}
mask &^= kernel.UnblockableSignals
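The poll.go hunks above all follow one mechanical pattern: the marshalling methods generated for linux.* types now return the number of bytes copied along with the error, instead of a bare error. A self-contained sketch of the new call shape, with a hypothetical helper name:

package sketch // hypothetical package, illustration only

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/usermem"
)

// copyInTimespec shows the two-result CopyIn signature these hunks switch to;
// callers that only care about failure discard the byte count with _.
func copyInTimespec(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) {
	var ts linux.Timespec
	// Previously: if err := ts.CopyIn(t, addr); err != nil { ... }
	if _, err := ts.CopyIn(t, addr); err != nil {
		return linux.Timespec{}, err
	}
	return ts, nil
}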
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 136453ccc..4e61f1452 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -226,7 +226,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
opts.Stat.Mtime.Nsec = linux.UTIME_NOW
} else {
var times linux.Utime
- if err := times.CopyIn(t, timesAddr); err != nil {
+ if _, err := times.CopyIn(t, timesAddr); err != nil {
return 0, nil, err
}
opts.Stat.Atime.Sec = times.Actime
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
new file mode 100644
index 000000000..79a4a7ada
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -0,0 +1,1138 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/control"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers up to INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset from the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset from the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+ // Name is the optional pointer to a network address buffer.
+ Name uint64
+
+ // NameLen is the length of the buffer pointed to by Name.
+ NameLen uint32
+ _ uint32
+
+ // Iov is a pointer to an array of io vectors that describe the memory
+ // locations involved in the io operation.
+ Iov uint64
+
+ // IovLen is the length of the array pointed to by Iov.
+ IovLen uint64
+
+ // Control is the optional pointer to ancillary control data.
+ Control uint64
+
+ // ControlLen is the length of the data pointed to by Control.
+ ControlLen uint64
+
+ // Flags on the sent/received message.
+ Flags int32
+ _ int32
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+ msgHdr MessageHeader64
+ msgLen uint32
+ _ int32
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+ b := t.CopyScratchBuffer(52)
+ if _, err := t.CopyInBytes(addr, b); err != nil {
+ return err
+ }
+
+ msg.Name = usermem.ByteOrder.Uint64(b[0:])
+ msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
+ msg.Iov = usermem.ByteOrder.Uint64(b[16:])
+ msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
+ msg.Control = usermem.ByteOrder.Uint64(b[32:])
+ msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
+ msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
+
+ return nil
+}
+
+// CaptureAddress allocates memory for and copies a socket address structure
+// from the untrusted address space range.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+ if addrlen > maxAddrLen {
+ return nil, syserror.EINVAL
+ }
+
+ addrBuf := make([]byte, addrlen)
+ if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
+ return nil, err
+ }
+
+ return addrBuf, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the untrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+ // Get the buffer length.
+ var bufLen uint32
+ if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ return err
+ }
+
+ if int32(bufLen) < 0 {
+ return syserror.EINVAL
+ }
+
+ // Write the length unconditionally.
+ if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ return err
+ }
+
+ if addr == nil {
+ return nil
+ }
+
+ if bufLen > addrLen {
+ bufLen = addrLen
+ }
+
+ // Copy as much of the address as will fit in the buffer.
+ encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ if bufLen > uint32(len(encodedAddr)) {
+ bufLen = uint32(len(encodedAddr))
+ }
+ _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
+ return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ domain := int(args[0].Int())
+ stype := args[1].Int()
+ protocol := int(args[2].Int())
+
+ // Check and initialize the flags.
+ if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Create the new socket.
+ s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+ defer s.DecRef()
+
+ if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
+ return 0, nil, err
+ }
+
+ fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ domain := int(args[0].Int())
+ stype := args[1].Int()
+ protocol := int(args[2].Int())
+ addr := args[3].Pointer()
+
+ // Check and initialize the flags.
+ if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Create the socket pair.
+ s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+ // Adding to the FD table will cause an extra reference to be acquired.
+ defer s1.DecRef()
+ defer s2.DecRef()
+
+ nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
+ if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+ return 0, nil, err
+ }
+ if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+ return 0, nil, err
+ }
+
+ // Create the FDs for the sockets.
+ flags := kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ }
+ fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if _, err := t.CopyOut(addr, fds); err != nil {
+ for _, fd := range fds {
+ _, file := t.FDTable().Remove(fd)
+ file.DecRef()
+ }
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Uint()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Capture address and call syscall implementation.
+ a, err := CaptureAddress(t, addr, addrlen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+ return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+ // Check that no unsupported flags are passed in.
+ if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ // Call the syscall implementation for this socket, then copy the
+ // output address if one is specified.
+ blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+
+ peerRequested := addrLen != 0
+ nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+ if peerRequested {
+ // NOTE(magi): Linux does not give you an error if it can't
+ // write the data back out so neither do we.
+ if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+ return 0, err
+ }
+ }
+ return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+ flags := int(args[3].Int())
+
+ n, err := accept(t, fd, addr, addrlen, flags)
+ return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ n, err := accept(t, fd, addr, addrlen, 0)
+ return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Uint()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Capture address and call syscall implementation.
+ a, err := CaptureAddress(t, addr, addrlen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ backlog := args[1].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Per Linux, the backlog is silently capped to reasonable values.
+ if backlog <= 0 {
+ backlog = minListenBacklog
+ }
+ if backlog > maxListenBacklog {
+ backlog = maxListenBacklog
+ }
+
+ return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ how := args[1].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Validate how, then call syscall implementation.
+ switch how {
+ case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ level := args[1].Int()
+ name := args[2].Int()
+ optValAddr := args[3].Pointer()
+ optLenAddr := args[4].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Read the length. Reject negative values.
+ optLen := int32(0)
+ if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ return 0, nil, err
+ }
+ if optLen < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Call syscall implementation then copy both value and value len out.
+ v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+ if e != nil {
+ return 0, nil, e.ToError()
+ }
+
+ vLen := int32(binary.Size(v))
+ if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ return 0, nil, err
+ }
+
+ if v != nil {
+ if _, err := t.CopyOut(optValAddr, v); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ return 0, nil, nil
+}
+
+// getSockOpt tries to handle common socket options, or dispatches to a specific
+// socket implementation.
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+ if level == linux.SOL_SOCKET {
+ switch name {
+ case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+ if len < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+ }
+
+ switch name {
+ case linux.SO_TYPE:
+ _, skType, _ := s.Type()
+ return int32(skType), nil
+ case linux.SO_DOMAIN:
+ family, _, _ := s.Type()
+ return int32(family), nil
+ case linux.SO_PROTOCOL:
+ _, _, protocol := s.Type()
+ return int32(protocol), nil
+ }
+ }
+
+ return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ level := args[1].Int()
+ name := args[2].Int()
+ optValAddr := args[3].Pointer()
+ optLen := args[4].Int()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ if optLen < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if optLen > maxOptLen {
+ return 0, nil, syserror.EINVAL
+ }
+ buf := t.CopyScratchBuffer(int(optLen))
+ if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ return 0, nil, err
+ }
+
+ // Call syscall implementation.
+ if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Get the socket name and copy it to the caller.
+ v, vl, err := s.GetSockName(t)
+ if err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ addrlen := args[2].Pointer()
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Get the socket peer name and copy it to the caller.
+ v, vl, err := s.GetPeerName(t)
+ if err != nil {
+ return 0, nil, err.ToError()
+ }
+
+ return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline)
+ return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ vlen := args[2].Uint()
+ flags := args[3].Int()
+ toPtr := args[4].Pointer()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if toPtr != 0 {
+ var ts linux.Timespec
+ if _, err := ts.CopyIn(t, toPtr); err != nil {
+ return 0, nil, err
+ }
+ if !ts.Valid() {
+ return 0, nil, syserror.EINVAL
+ }
+ deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+ haveDeadline = true
+ }
+
+ if !haveDeadline {
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+ }
+
+ var count uint32
+ var err error
+ for i := uint64(0); i < uint64(vlen); i++ {
+ mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ var n uintptr
+ if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
+ break
+ }
+
+ // Copy the received length to the caller.
+ lp, ok := mp.AddLength(messageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ break
+ }
+ count++
+ }
+
+ if count == 0 {
+ return 0, nil, err
+ }
+ return uintptr(count), nil, nil
+}
+
+func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+ // Capture the message header and io vectors.
+ var msg MessageHeader64
+ if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ return 0, err
+ }
+
+ if msg.IovLen > linux.UIO_MAXIOV {
+ return 0, syserror.EMSGSIZE
+ }
+ dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ // FIXME(b/63594852): Pretend we have an empty error queue.
+ if flags&linux.MSG_ERRQUEUE != 0 {
+ return 0, syserror.EAGAIN
+ }
+
+ // Fast path when no control message nor name buffers are provided.
+ if msg.ControlLen == 0 && msg.NameLen == 0 {
+ n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+ if err != nil {
+ return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+ }
+ if !cms.Unix.Empty() {
+ mflags |= linux.MSG_CTRUNC
+ cms.Release()
+ }
+
+ if int(msg.Flags) != mflags {
+ // Copy out the flags to the caller.
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ return 0, err
+ }
+ }
+
+ return uintptr(n), nil
+ }
+
+ if msg.ControlLen > maxControlLen {
+ return 0, syserror.ENOBUFS
+ }
+ n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+ defer cms.Release()
+
+ controlData := make([]byte, 0, msg.ControlLen)
+ controlData = control.PackControlMessages(t, cms, controlData)
+
+ if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+ creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+ controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+ }
+
+ if cms.Unix.Rights != nil {
+ controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+ }
+
+ // Copy the address to the caller.
+ if msg.NameLen != 0 {
+ if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+ return 0, err
+ }
+ }
+
+ // Copy the control data to the caller.
+ if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ return 0, err
+ }
+ if len(controlData) > 0 {
+ if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ return 0, err
+ }
+ }
+
+ // Copy out the flags to the caller.
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ return 0, err
+ }
+
+ return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+ if int(bufLen) < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.RecvTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+ cm.Release()
+ if e != nil {
+ return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ }
+
+ // Copy the address to the caller.
+ if nameLenPtr != 0 {
+ if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+ return 0, err
+ }
+ }
+
+ return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufPtr := args[1].Pointer()
+ bufLen := args[2].Uint64()
+ flags := args[3].Int()
+ namePtr := args[4].Pointer()
+ nameLenPtr := args[5].Pointer()
+
+ n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+ return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ flags := args[2].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+ return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ msgPtr := args[1].Pointer()
+ vlen := args[2].Uint()
+ flags := args[3].Int()
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, nil, syserror.ENOTSOCK
+ }
+
+ // Reject flags that we don't handle yet.
+ if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ var count uint32
+ var err error
+ for i := uint64(0); i < uint64(vlen); i++ {
+ mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ var n uintptr
+ if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+ break
+ }
+
+		// Copy the number of bytes sent to the caller.
+ lp, ok := mp.AddLength(messageHeader64Len)
+ if !ok {
+ return 0, nil, syserror.EFAULT
+ }
+ if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ break
+ }
+ count++
+ }
+
+ if count == 0 {
+ return 0, nil, err
+ }
+ return uintptr(count), nil, nil
+}
+
+func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+ // Capture the message header.
+ var msg MessageHeader64
+ if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ return 0, err
+ }
+
+ var controlData []byte
+ if msg.ControlLen > 0 {
+ // Put an upper bound to prevent large allocations.
+ if msg.ControlLen > maxControlLen {
+ return 0, syserror.ENOBUFS
+ }
+ controlData = make([]byte, msg.ControlLen)
+ if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ return 0, err
+ }
+ }
+
+ // Read the destination address if one is specified.
+ var to []byte
+ if msg.NameLen != 0 {
+ var err error
+ to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+ if err != nil {
+ return 0, err
+ }
+ }
+
+ // Read data then call the sendmsg implementation.
+ if msg.IovLen > linux.UIO_MAXIOV {
+ return 0, syserror.EMSGSIZE
+ }
+ src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ controlMessages, err := control.Parse(t, s, controlData)
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.SendTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Call the syscall implementation.
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+ err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+ if err != nil {
+ controlMessages.Release()
+ }
+ return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+ bl := int(bufLen)
+ if bl < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Get socket from the file descriptor.
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Extract the socket.
+ s, ok := file.Impl().(socket.SocketVFS2)
+ if !ok {
+ return 0, syserror.ENOTSOCK
+ }
+
+ if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Read the destination address if one is specified.
+ var to []byte
+ var err error
+ if namePtr != 0 {
+ to, err = CaptureAddress(t, namePtr, nameLen)
+ if err != nil {
+ return 0, err
+ }
+ }
+
+ src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ var haveDeadline bool
+ var deadline ktime.Time
+ if dl := s.SendTimeout(); dl > 0 {
+ deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+ haveDeadline = true
+ } else if dl < 0 {
+ flags |= linux.MSG_DONTWAIT
+ }
+
+ // Call the syscall implementation.
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
+ return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ bufPtr := args[1].Pointer()
+ bufLen := args[2].Uint64()
+ flags := args[3].Int()
+ namePtr := args[4].Pointer()
+ nameLen := args[5].Uint()
+
+ n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+ return n, nil, err
+}
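The offset constants near the top of socket.go (nameLenOffset, controlLenOffset, flagsOffset) and the 52-byte scratch buffer in CopyInMessageHeader64 depend on the field layout of MessageHeader64. The standalone check below mirrors that struct and prints the offsets; it assumes Go's default struct layout on linux/amd64, and the mirrored type plus main function are illustrative only, not part of this change.

package main

import (
	"fmt"
	"unsafe"
)

// messageHeader64 mirrors MessageHeader64 from socket.go above.
type messageHeader64 struct {
	Name       uint64
	NameLen    uint32
	_          uint32
	Iov        uint64
	IovLen     uint64
	Control    uint64
	ControlLen uint64
	Flags      int32
	_          int32
}

func main() {
	var m messageHeader64
	fmt.Println(unsafe.Offsetof(m.NameLen))    // 8, matches nameLenOffset
	fmt.Println(unsafe.Offsetof(m.ControlLen)) // 40, matches controlLenOffset
	fmt.Println(unsafe.Offsetof(m.Flags))      // 48, matches flagsOffset
	fmt.Println(unsafe.Sizeof(m))              // 56; CopyInMessageHeader64 only reads the first 52 bytes, through Flags
}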
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index fdfe49243..bb1d5cac4 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -91,7 +91,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
var stat linux.Stat
convertStatxToUserStat(t, &statx, &stat)
- return stat.CopyOut(t, statAddr)
+ _, err = stat.CopyOut(t, statAddr)
+ return err
}
start = dirfile.VirtualDentry()
start.IncRef()
@@ -111,7 +112,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
var stat linux.Stat
convertStatxToUserStat(t, &statx, &stat)
- return stat.CopyOut(t, statAddr)
+ _, err = stat.CopyOut(t, statAddr)
+ return err
}
func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
@@ -140,7 +142,8 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
var stat linux.Stat
convertStatxToUserStat(t, &statx, &stat)
- return 0, nil, stat.CopyOut(t, statAddr)
+ _, err = stat.CopyOut(t, statAddr)
+ return 0, nil, err
}
// Statx implements Linux syscall statx(2).
@@ -199,7 +202,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
userifyStatx(t, &statx)
- return 0, nil, statx.CopyOut(t, statxAddr)
+ _, err = statx.CopyOut(t, statxAddr)
+ return 0, nil, err
}
start = dirfile.VirtualDentry()
start.IncRef()
@@ -218,7 +222,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
userifyStatx(t, &statx)
- return 0, nil, statx.CopyOut(t, statxAddr)
+ _, err = statx.CopyOut(t, statxAddr)
+ return 0, nil, err
}
func userifyStatx(t *kernel.Task, statx *linux.Statx) {
@@ -359,8 +364,8 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
-
- return 0, nil, statfs.CopyOut(t, bufAddr)
+ _, err = statfs.CopyOut(t, bufAddr)
+ return 0, nil, err
}
// Fstatfs implements Linux syscall fstatfs(2).
@@ -378,6 +383,6 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if err != nil {
return 0, nil, err
}
-
- return 0, nil, statfs.CopyOut(t, bufAddr)
+ _, err = statfs.CopyOut(t, bufAddr)
+ return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go
new file mode 100644
index 000000000..7938a5249
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go
@@ -0,0 +1,123 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ clockID := args[0].Int()
+ flags := args[1].Int()
+
+ if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var fileFlags uint32
+ if flags&linux.TFD_NONBLOCK != 0 {
+ fileFlags = linux.O_NONBLOCK
+ }
+
+ var clock ktime.Clock
+ switch clockID {
+ case linux.CLOCK_REALTIME:
+ clock = t.Kernel().RealtimeClock()
+ case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+ clock = t.Kernel().MonotonicClock()
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+ file, err := t.Kernel().VFS().NewTimerFD(clock, fileFlags)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef()
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ flags := args[1].Int()
+ newValAddr := args[2].Pointer()
+ oldValAddr := args[3].Pointer()
+
+ if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ tfd, ok := file.Impl().(*vfs.TimerFileDescription)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var newVal linux.Itimerspec
+ if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ return 0, nil, err
+ }
+ newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
+ if err != nil {
+ return 0, nil, err
+ }
+ tm, oldS := tfd.SetTime(newS)
+ if oldValAddr != 0 {
+ oldVal := ktime.ItimerspecFromSetting(tm, oldS)
+ if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ curValAddr := args[1].Pointer()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ tfd, ok := file.Impl().(*vfs.TimerFileDescription)
+ if !ok {
+ return 0, nil, syserror.EINVAL
+ }
+
+ tm, s := tfd.GetTime()
+ curVal := ktime.ItimerspecFromSetting(tm, s)
+ _, err := t.CopyOut(curValAddr, &curVal)
+ return 0, nil, err
+}
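As a behavioral reference for the three handlers above, here is a plain user-space exercise of the same syscalls. It assumes the timerfd wrappers available in golang.org/x/sys/unix (TimerfdCreate, TimerfdSettime, TimerfdGettime); it is not part of this change and does not use gVisor packages.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC|TFD_NONBLOCK).
	fd, err := unix.TimerfdCreate(unix.CLOCK_MONOTONIC, unix.TFD_CLOEXEC|unix.TFD_NONBLOCK)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// Arm a one-shot relative timer for 500ms (flags == 0, i.e. no TFD_TIMER_ABSTIME).
	newVal := unix.ItimerSpec{Value: unix.Timespec{Nsec: 500_000_000}}
	if err := unix.TimerfdSettime(fd, 0, &newVal, nil); err != nil {
		panic(err)
	}

	// timerfd_gettime reports the time remaining until the next expiration.
	var cur unix.ItimerSpec
	if err := unix.TimerfdGettime(fd, &cur); err != nil {
		panic(err)
	}
	fmt.Printf("remaining: %d.%09ds\n", cur.Value.Sec, cur.Value.Nsec)
}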
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index 89e9ff4d7..af455d5c1 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
}
defer tpop.Release()
- names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
+ names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
if err != nil {
return 0, nil, err
}
@@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
defer file.DecRef()
- names, err := file.Listxattr(t)
+ names, err := file.Listxattr(t, uint64(size))
if err != nil {
return 0, nil, err
}
@@ -116,7 +116,10 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
return 0, nil, err
}
- value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name)
+ value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+ Name: name,
+ Size: uint64(size),
+ })
if err != nil {
return 0, nil, err
}
@@ -145,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
- value, err := file.Getxattr(t, name)
+ value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
if err != nil {
return 0, nil, err
}
@@ -230,7 +233,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
- return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{
+ return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
Name: name,
Value: value,
Flags: uint32(flags),
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index bf4d27c7d..9aeb83fb0 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -36,6 +36,7 @@ go_library(
"pathname.go",
"permissions.go",
"resolving_path.go",
+ "timerfd.go",
"vfs.go",
],
visibility = ["//pkg/sentry:internal"],
@@ -51,6 +52,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/memmap",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index d1f6dfb45..a64d86122 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -245,7 +245,7 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
}
// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
if !rp.Done() {
return nil, syserror.ENOTDIR
}
@@ -253,7 +253,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([
}
// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
if !rp.Done() {
return "", syserror.ENOTDIR
}
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 3da45d744..8e0b40841 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -99,6 +99,8 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
interest: make(map[epollInterestKey]*epollInterest),
}
if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
UseDentryMetadata: true,
}); err != nil {
return nil, err
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 8ee549dc2..4fb9aea87 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -84,6 +84,17 @@ type FileDescriptionOptions struct {
// usually only the case if O_DIRECT would actually have an effect.
AllowDirectIO bool
+ // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+ DenyPRead bool
+
+ // If DenyPWrite is true, calls to FileDescription.PWrite() return
+ // ESPIPE.
+ DenyPWrite bool
+
+	// If InvalidWrite is true, calls to FileDescription.Write() return
+ // EINVAL.
+ InvalidWrite bool
+
// If UseDentryMetadata is true, calls to FileDescription methods that
// interact with file and filesystem metadata (Stat, SetStat, StatFS,
// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
@@ -175,6 +186,12 @@ func (fd *FileDescription) DecRef() {
}
}
+// Refs returns the current number of references. The returned count
+// is inherently racy and is unsafe to use without external synchronization.
+func (fd *FileDescription) Refs() int64 {
+ return atomic.LoadInt64(&fd.refs)
+}
+
// Mount returns the mount on which fd was opened. It does not take a reference
// on the returned Mount.
func (fd *FileDescription) Mount() *Mount {
@@ -306,6 +323,7 @@ type FileDescriptionImpl interface {
// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for reading.
+ // FileDescriptionOptions.DenyPRead == false.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -337,6 +355,7 @@ type FileDescriptionImpl interface {
// EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for writing.
+ // FileDescriptionOptions.DenyPWrite == false.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -382,11 +401,11 @@ type FileDescriptionImpl interface {
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
// Listxattr returns all extended attribute names for the file.
- Listxattr(ctx context.Context) ([]string, error)
+ Listxattr(ctx context.Context, size uint64) ([]string, error)
// Getxattr returns the value associated with the given extended attribute
// for the file.
- Getxattr(ctx context.Context, name string) (string, error)
+ Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
// Setxattr changes the value associated with the given extended attribute
// for the file.
@@ -515,6 +534,9 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
// offset, and returns the number of bytes read. PRead is permitted to return
// partial reads with a nil error.
func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ if fd.opts.DenyPRead {
+ return 0, syserror.ESPIPE
+ }
if !fd.readable {
return 0, syserror.EBADF
}
@@ -533,6 +555,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
// offset, and returns the number of bytes written. PWrite is permitted to
// return partial writes with a nil error.
func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if fd.opts.DenyPWrite {
+ return 0, syserror.ESPIPE
+ }
if !fd.writable {
return 0, syserror.EBADF
}
@@ -541,6 +566,9 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o
// Write is similar to PWrite, but does not specify an offset.
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ if fd.opts.InvalidWrite {
+ return 0, syserror.EINVAL
+ }
if !fd.writable {
return 0, syserror.EBADF
}
@@ -577,18 +605,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
// Listxattr returns all extended attribute names for the file represented by
// fd.
-func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned (note that implementations are
+// free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+ names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
vfsObj.putResolvingPath(rp)
return names, err
}
- names, err := fd.impl.Listxattr(ctx)
+ names, err := fd.impl.Listxattr(ctx, size)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -601,34 +634,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
// Getxattr returns the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return val, err
}
- return fd.impl.Getxattr(ctx, name)
+ return fd.impl.Getxattr(ctx, *opts)
}
// Setxattr changes the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+ err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return err
}
- return fd.impl.Setxattr(ctx, opts)
+ return fd.impl.Setxattr(ctx, *opts)
}
// Removexattr removes the given extended attribute from the file represented
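The DenyPRead/DenyPWrite options added above are checked before the readable/writable checks, so pread(2)/pwrite(2) on such a file description always fail with ESPIPE (the epoll change earlier in this diff is the first user). A small sketch of that contract, using only calls visible in this file; the helper name is hypothetical:

package sketch // hypothetical package, illustration only

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// expectESPIPE demonstrates the new options: for a FileDescription initialized
// with DenyPRead/DenyPWrite, offset-based I/O fails with ESPIPE regardless of
// what the underlying implementation would do.
func expectESPIPE(ctx context.Context, fd *vfs.FileDescription) {
	var buf [8]byte
	ioseq := usermem.BytesIOSequence(buf[:])
	if _, err := fd.PRead(ctx, ioseq, 0, vfs.ReadOptions{}); err != syserror.ESPIPE {
		panic("expected ESPIPE from PRead")
	}
	if _, err := fd.PWrite(ctx, ioseq, 0, vfs.WriteOptions{}); err != syserror.ESPIPE {
		panic("expected ESPIPE from PWrite")
	}
}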
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index d45e602ce..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
// Listxattr implements FileDescriptionImpl.Listxattr analogously to
// inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
// This isn't exactly accurate; see FileDescription.Listxattr.
return nil, syserror.ENOTSUP
}
// Getxattr implements FileDescriptionImpl.Getxattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
return "", syserror.ENOTSUP
}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index cd34782ff..a537a29d1 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -442,7 +442,13 @@ type FilesystemImpl interface {
// - If extended attributes are not supported by the filesystem,
// ListxattrAt returns nil. (See FileDescription.Listxattr for an
// explanation.)
- ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+ //
+ // - If the size of the list (including a NUL terminating byte after every
+	// entry) would exceed size, ERANGE may be returned (note that
+	// implementations are free to ignore size entirely and return without
+	// error). In all cases, if size is 0, the list should be returned without
+ // error, regardless of size.
+ ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
// GetxattrAt returns the value associated with the given extended
// attribute for the file at rp.
@@ -451,7 +457,15 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem, GetxattrAt
// returns ENOTSUP.
- GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+ //
+ // - If an extended attribute named opts.Name does not exist, ENODATA is
+ // returned.
+ //
+ // - If the size of the return value exceeds opts.Size, ERANGE may be
+ // returned (note that implementations are free to ignore opts.Size entirely
+ // and return without error). In all cases, if opts.Size is 0, the value
+ // should be returned without error, regardless of size.
+ GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
// SetxattrAt changes the value associated with the given extended
// attribute for the file at rp.
@@ -460,6 +474,10 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem, SetxattrAt
// returns ENOTSUP.
+ //
+ // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+ // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+ // ENODATA is returned.
SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
// RemovexattrAt removes the given extended attribute from the file at rp.
@@ -468,6 +486,8 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem,
// RemovexattrAt returns ENOTSUP.
+ //
+ // - If name does not exist, ENODATA is returned.
RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
@@ -497,7 +517,7 @@ type FilesystemImpl interface {
// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
- // TODO: inotify_add_watch()
+ // TODO(gvisor.dev/issue/1479): inotify_add_watch()
}
// PrependPathAtVFSRootError is returned by implementations of
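GetxattrOptions itself is not shown in this diff. From its call sites (vfs.GetxattrOptions{Name: ..., Size: ...} in the xattr syscall hunks) and the contract documented above, it presumably looks roughly like the following; this declaration is a reconstruction for illustration, not the actual source.

package vfs // reconstruction of the presumed declaration, illustration only

// GetxattrOptions, as inferred from its call sites in this diff.
type GetxattrOptions struct {
	// Name is the name of the extended attribute to read, e.g. "user.foo".
	Name string

	// Size is the buffer size passed to getxattr(2)/fgetxattr(2). Zero means
	// no limit; otherwise implementations may return ERANGE if the value
	// would not fit, though they are also free to ignore Size entirely.
	Size uint64
}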
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "memxattr",
+ srcs = ["xattr.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+ // mu protects the below fields.
+ mu sync.RWMutex `state:"nosave"`
+ xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+ x.mu.RLock()
+ value, ok := x.xattrs[opts.Name]
+ x.mu.RUnlock()
+ if !ok {
+ return "", syserror.ENODATA
+ }
+ // Check that the size of the buffer provided in getxattr(2) is large enough
+ // to contain the value.
+ if opts.Size != 0 && uint64(len(value)) > opts.Size {
+ return "", syserror.ERANGE
+ }
+ return value, nil
+}
+
+// Setxattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if x.xattrs == nil {
+ if opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+ x.xattrs = make(map[string]string)
+ }
+
+ _, ok := x.xattrs[opts.Name]
+ if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+ return syserror.EEXIST
+ }
+ if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+
+ x.xattrs[opts.Name] = opts.Value
+ return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+ // Keep track of the size of the buffer needed in listxattr(2) for the list.
+ listSize := 0
+ x.mu.RLock()
+ names := make([]string, 0, len(x.xattrs))
+ for n := range x.xattrs {
+ names = append(names, n)
+ // Add one byte per null terminator.
+ listSize += len(n) + 1
+ }
+ x.mu.RUnlock()
+ if size != 0 && uint64(listSize) > size {
+ return nil, syserror.ERANGE
+ }
+ return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if _, ok := x.xattrs[name]; !ok {
+ return syserror.ENODATA
+ }
+ delete(x.xattrs, name)
+ return nil
+}
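
For orientation, a filesystem implementation can embed SimpleExtendedAttributes in its inode type and forward the vfs xattr hooks to it. The following is a minimal sketch only; the package name, inode type, and method names are hypothetical and not part of this change.

package myfs // hypothetical package, for illustration only

import (
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
)

// inode is a hypothetical inode type that keeps its extended attributes
// in memory by embedding SimpleExtendedAttributes.
type inode struct {
	xattrs memxattr.SimpleExtendedAttributes
}

// getxattr forwards the lookup to the in-memory map; ENODATA and ERANGE
// handling comes from SimpleExtendedAttributes.Getxattr.
func (i *inode) getxattr(opts *vfs.GetxattrOptions) (string, error) {
	return i.xattrs.Getxattr(opts)
}

// setxattr forwards the update; XATTR_CREATE/XATTR_REPLACE flag checks
// come from SimpleExtendedAttributes.Setxattr.
func (i *inode) setxattr(opts *vfs.SetxattrOptions) error {
	return i.xattrs.Setxattr(opts)
}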
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 4b68cabda..f06946103 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -15,7 +15,11 @@
package vfs
import (
+ "bytes"
+ "fmt"
"math"
+ "sort"
+ "strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -44,7 +48,7 @@ var lastMountID uint64
//
// +stateify savable
type Mount struct {
- // vfs, fs, and root are immutable. References are held on fs and root.
+ // vfs, fs, root are immutable. References are held on fs and root.
//
// Invariant: root belongs to fs.
vfs *VirtualFilesystem
@@ -229,9 +233,9 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
}
vd.dentry.mu.Lock()
}
- // TODO: Linux requires that either both the mount point and the mount root
- // are directories, or neither are, and returns ENOTDIR if this is not the
- // case.
+ // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+ // point and the mount root are directories, or neither are, and returns
+ // ENOTDIR if this is not the case.
mntns := vd.mount.ns
mnt := newMount(vfs, fs, root, mntns, opts)
vfs.mounts.seq.BeginWrite()
@@ -270,9 +274,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
}
}
- // TODO(jamieliu): Linux special-cases umount of the caller's root, which
- // we don't implement yet (we'll just fail it since the caller holds a
- // reference on it).
+ // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+ // root, which we don't implement yet (we'll just fail it since the caller
+ // holds a reference on it).
vfs.mounts.seq.BeginWrite()
if opts.Flags&linux.MNT_DETACH == 0 {
@@ -639,12 +643,28 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
return nil
}
+func (mnt *Mount) readOnly() bool {
+ return atomic.LoadInt64(&mnt.writers) < 0
+}
+
// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
return mnt.fs
}
+// submountsLocked returns this Mount and all Mounts that are descendants of
+// it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+ mounts := []*Mount{mnt}
+ for m := range mnt.children {
+ mounts = append(mounts, m.submountsLocked()...)
+ }
+ return mounts
+}
+
// Root returns mntns' root. A reference is taken on the returned
// VirtualDentry.
func (mntns *MountNamespace) Root() VirtualDentry {
@@ -655,3 +675,174 @@ func (mntns *MountNamespace) Root() VirtualDentry {
vd.IncRef()
return vd
}
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.flags.NoExec {
+ opts += ",noexec"
+ }
+
+ // Format:
+ // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+ //
+ // The "needs dump" and "fsck order" flags are always 0, which
+ // is allowed.
+ fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+ }
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+ // Stat the mount root to get the major/minor device numbers.
+ pop := &PathOperation{
+ Root: mntRootVD,
+ Start: mntRootVD,
+ }
+ statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+ if err != nil {
+ // Well that's not good. Ignore this mount.
+ break
+ }
+
+ // Format:
+ // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+ // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
+
+ // (1) Mount ID.
+ fmt.Fprintf(buf, "%d ", mnt.ID)
+
+ // (2) Parent ID (or this ID if there is no parent).
+ pID := mnt.ID
+ if p := mnt.parent(); p != nil {
+ pID = p.ID
+ }
+ fmt.Fprintf(buf, "%d ", pID)
+
+ // (3) Major:Minor device ID. We don't have a superblock, so we
+ // just use the root inode device number.
+ fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+ // (4) Root: the pathname of the directory in the filesystem
+ // which forms the root of this mount.
+ //
+ // NOTE(b/78135857): This will always be "/" until we implement
+ // bind mounts.
+ fmt.Fprintf(buf, "/ ")
+
+ // (5) Mount point (relative to process root).
+ fmt.Fprintf(buf, "%s ", manglePath(path))
+
+ // (6) Mount options.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.flags.NoExec {
+ opts += ",noexec"
+ }
+ // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
+ // set.
+ fmt.Fprintf(buf, "%s ", opts)
+
+ // (7) Optional fields: zero or more fields of the form "tag[:value]".
+ // (8) Separator: the end of the optional fields is marked by a single hyphen.
+ fmt.Fprintf(buf, "- ")
+
+ // (9) Filesystem type.
+ fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+ // (10) Mount source: filesystem-specific information or "none".
+ fmt.Fprintf(buf, "none ")
+
+ // (11) Superblock options, and final newline.
+ fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+ }
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+ r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+ return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+ // gVisor doesn't (yet) have a concept of super block options, so we
+ // use the ro/rw bit from the mount flag.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+
+ // NOTE(b/147673608): If the mount is a cgroup, we also need to include
+ // the cgroup name in the options. For now we just read that from the
+ // path.
+ //
+ // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
+ // should get this value from the cgroup itself, and not rely on the
+ // path.
+ if mnt.fs.FilesystemType().Name() == "cgroup" {
+ splitPath := strings.Split(mountPath, "/")
+ cgroupType := splitPath[len(splitPath)-1]
+ opts += "," + cgroupType
+ }
+ return opts
+}
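
To make the formatting above concrete, here is a standalone sketch that mirrors the mountinfo line produced by GenerateProcMountInfo using made-up values (mount ID 2, parent ID 1, device 0:26, a read-only tmpfs mounted at "/tmp/my dir"); the only real behavior it borrows is the manglePath-style escaping of the space in the mount point.

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Same replacement table as manglePath above: spaces, tabs, newlines,
	// and backslashes become octal escapes.
	mangle := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
	mountPoint := mangle.Replace("/tmp/my dir")

	// Hypothetical mountinfo entry following fields (1)-(11) above.
	fmt.Printf("%d %d %d:%d / %s %s - %s none %s\n", 2, 1, 0, 26, mountPoint, "ro", "tmpfs", "ro")
	// Prints: 2 1 0:26 / /tmp/my\040dir ro - tmpfs none ro
}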
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3b933468d..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
}
}
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
// must be powers of 2
var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3e90dc4ed..534528ce6 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -16,6 +16,7 @@ package vfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
@@ -44,6 +45,10 @@ type MknodOptions struct {
// DevMinor are the major and minor device numbers for the created device.
DevMajor uint32
DevMinor uint32
+
+ // Endpoint is the endpoint to bind to the created file, if a socket file is
+ // being created for bind(2) on a Unix domain socket.
+ Endpoint transport.BoundEndpoint
}
// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
@@ -127,6 +132,20 @@ type SetStatOptions struct {
Stat linux.Statx
}
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+ // Name is the name of the extended attribute to retrieve.
+ Name string
+
+ // Size is the maximum value size that the caller will tolerate. If the value
+ // is larger than size, getxattr methods may return ERANGE, but they are also
+ // free to ignore the hint entirely (i.e. the value returned may be larger
+ // than size). All size checking is done independently at the syscall layer.
+ Size uint64
+}
+
// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
// FileDescriptionImpl.Setxattr().
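
For context on how these options are consumed, a syscall-layer caller might translate the getxattr(2) arguments into a GetxattrOptions roughly as follows. This is a sketch only; the wrapper function and its parameters are hypothetical, and (as noted above) the definitive size check against the returned value still happens at the syscall layer.

package example // hypothetical package, for illustration only

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// getxattr passes the user-supplied attribute name and buffer size down
// to the VFS layer as a GetxattrOptions.
func getxattr(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, pop *vfs.PathOperation, name string, size uint64) (string, error) {
	return vfsObj.GetxattrAt(ctx, creds, pop, &vfs.GetxattrOptions{
		Name: name,
		Size: size,
	})
}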
diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go
new file mode 100644
index 000000000..42b880656
--- /dev/null
+++ b/pkg/sentry/vfs/timerfd.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// implements ktime.TimerListener.
+type TimerFileDescription struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ events waiter.Queue
+ timer *ktime.Timer
+
+ // val is the number of timer expirations since the last successful
+ // call to Read or SetTime. val must be accessed using atomic memory
+ // operations.
+ val uint64
+}
+
+var _ FileDescriptionImpl = (*TimerFileDescription)(nil)
+var _ ktime.TimerListener = (*TimerFileDescription)(nil)
+
+// NewTimerFD returns a new timer fd.
+func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) {
+ vd := vfs.NewAnonVirtualDentry("[timerfd]")
+ defer vd.DecRef()
+ tfd := &TimerFileDescription{}
+ tfd.timer = ktime.NewTimer(clock, tfd)
+ if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &tfd.vfsfd, nil
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ const sizeofUint64 = 8
+ if dst.NumBytes() < sizeofUint64 {
+ return 0, syserror.EINVAL
+ }
+ if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
+ var buf [sizeofUint64]byte
+ usermem.ByteOrder.PutUint64(buf[:], val)
+ if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
+ // Linux does not undo consuming the number of
+ // expirations even if writing to userspace fails.
+ return 0, err
+ }
+ return sizeofUint64, nil
+ }
+ return 0, syserror.ErrWouldBlock
+}
+
+// Clock returns the timer fd's Clock.
+func (tfd *TimerFileDescription) Clock() ktime.Clock {
+ return tfd.timer.Clock()
+}
+
+// GetTime returns the associated Timer's setting and the time at which it was
+// observed.
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) {
+ return tfd.timer.Get()
+}
+
+// SetTime atomically changes the associated Timer's setting, resets the number
+// of expirations to 0, and returns the previous setting and the time at which
+// it was observed.
+func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) {
+ return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) })
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ var ready waiter.EventMask
+ if atomic.LoadUint64(&tfd.val) != 0 {
+ ready |= waiter.EventIn
+ }
+ return ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ tfd.events.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) {
+ tfd.events.EventUnregister(e)
+}
+
+// PauseTimer pauses the associated Timer.
+func (tfd *TimerFileDescription) PauseTimer() {
+ tfd.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (tfd *TimerFileDescription) ResumeTimer() {
+ tfd.timer.Resume()
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (tfd *TimerFileDescription) Release() {
+ tfd.timer.Destroy()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
+ atomic.AddUint64(&tfd.val, exp)
+ tfd.events.Notify(waiter.EventIn)
+ return ktime.Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (tfd *TimerFileDescription) Destroy() {}
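
Tying the timer fd pieces together: a caller creates the descriptor with NewTimerFD, arms it with SetTime, and reads out the 8-byte expiration count once the timer fires. The sketch below is illustrative only; the helper, its parameters, and the zero flags value are assumptions, and in practice the clock, context, and destination I/O sequence come from the sandboxed task.

package example // hypothetical package, for illustration only

import (
	"time"

	"gvisor.dev/gvisor/pkg/context"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// useTimerFD arms a timer fd to expire once, d from now, and then reads
// the expiration count into dst. Read returns ErrWouldBlock until the
// timer has fired at least once.
func useTimerFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, d time.Duration, dst usermem.IOSequence) error {
	fd, err := vfsObj.NewTimerFD(clock, 0 /* flags */)
	if err != nil {
		return err
	}
	defer fd.DecRef()

	// Arm the timer: a single expiration at now+d, no periodic interval.
	tfd := fd.Impl().(*vfs.TimerFileDescription)
	tfd.SetTime(ktime.Setting{
		Enabled: true,
		Next:    clock.Now().Add(d),
	})

	// Consume the expiration count (8 bytes in host byte order).
	_, err = fd.Read(ctx, dst, vfs.ReadOptions{})
	return err
}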
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 720b90d8f..f592913d5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -680,10 +680,10 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
// ListxattrAt returns all extended attribute names for the file at the given
// path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+ names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
if err == nil {
vfs.putResolvingPath(rp)
return names, nil
@@ -705,10 +705,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
// GetxattrAt returns the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
return val, nil