87 files changed, 4608 insertions, 703 deletions
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 3b6987665..5053393c1 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -32,29 +32,38 @@ import (
 const (
 	// SyscallWidth is the width of insturctions.
 	SyscallWidth = 4
+
+	// fpsimdMagic is the magic number which is used in fpsimd_context.
+	fpsimdMagic = 0x46508001
+
+	// fpsimdContextSize is the size of fpsimd_context.
+	fpsimdContextSize = 0x210
 )
 
+// ARMTrapFlag is the mask for the trap flag.
+const ARMTrapFlag = uint64(1) << 21
+
 // aarch64FPState is aarch64 floating point state.
 type aarch64FPState []byte
 
-// initAarch64FPState (defined in asm files) sets up initial state.
-func initAarch64FPState(data *FloatingPointData) {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+// initAarch64FPState sets up initial state.
+func initAarch64FPState(data aarch64FPState) {
+	binary.LittleEndian.PutUint32(data, fpsimdMagic)
+	binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
 }
 
 func newAarch64FPStateSlice() []byte {
-	return alignedBytes(4096, 32)[:4096]
+	return alignedBytes(4096, 16)[:fpsimdContextSize]
 }
 
 // newAarch64FPState returns an initialized floating point state.
 //
 // The returned state is large enough to store all floating point state
 // supported by host, even if the app won't use much of it due to a restricted
-// FeatureSet. Since they may still be able to see state not advertised by
-// CPUID we must ensure it does not contain any sentry state.
+// FeatureSet.
 func newAarch64FPState() aarch64FPState {
 	f := aarch64FPState(newAarch64FPStateSlice())
-	initAarch64FPState(f.FloatingPointData())
+	initAarch64FPState(f)
 	return f
 }
 
@@ -133,10 +142,10 @@ func (s State) Proto() *rpb.Registers {
 
 // Fork creates and returns an identical copy of the state.
 func (s *State) Fork() State {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
 	return State{
-		Regs:       s.Regs,
-		FeatureSet: s.FeatureSet,
+		Regs:           s.Regs,
+		aarch64FPState: s.aarch64FPState.fork(),
+		FeatureSet:     s.FeatureSet,
 	}
 }
 
@@ -285,8 +294,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context {
 	case ARM64:
 		return &context64{
 			State{
-				FeatureSet: fs,
+				aarch64FPState: newAarch64FPState(),
+				FeatureSet:     fs,
 			},
+			[]aarch64FPState(nil),
 		}
 	}
 	panic(fmt.Sprintf("unknown architecture %v", arch))
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index ac98897b5..885115ae2 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -53,6 +53,11 @@ const (
 	preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5
 )
 
+var (
+	// CPUIDInstruction doesn't exist on ARM64.
+	CPUIDInstruction = []byte{}
+)
+
 // These constants are selected as heuristics to help make the Platform's
 // potentially limited address space conform as closely to Linux as possible.
 const (
@@ -68,6 +73,7 @@ const (
 // context64 represents an ARM64 context.
 type context64 struct {
 	State
+	sigFPState []aarch64FPState // fpstate to be restored on sigreturn.
 }
 
 // Arch implements Context.Arch.
@@ -75,10 +81,19 @@ func (c *context64) Arch() Arch {
 	return ARM64
 }
 
+func (c *context64) copySigFPState() []aarch64FPState {
+	var sigfps []aarch64FPState
+	for _, s := range c.sigFPState {
+		sigfps = append(sigfps, s.fork())
+	}
+	return sigfps
+}
+
 // Fork returns an exact copy of this context.
 func (c *context64) Fork() Context {
 	return &context64{
-		State: c.State.Fork(),
+		State:      c.State.Fork(),
+		sigFPState: c.copySigFPState(),
 	}
 }
 
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 4f4cc46a8..0c1db4b13 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -30,14 +30,29 @@ type SignalContext64 struct {
 	Sp        uint64
 	Pc        uint64
 	Pstate    uint64
-	_pad      [8]byte // __attribute__((__aligned__(16)))
-	Reserved  [4096]uint8
+	_pad      [8]byte       // __attribute__((__aligned__(16)))
+	Fpsimd64  FpsimdContext // size = 528
+	Reserved  [3568]uint8
+}
+
+type aarch64Ctx struct {
+	Magic uint32
+	Size  uint32
+}
+
+// FpsimdContext is equivalent to struct fpsimd_context on arm64
+// (arch/arm64/include/uapi/asm/sigcontext.h).
+type FpsimdContext struct {
+	Head  aarch64Ctx
+	Fpsr  uint32
+	Fpcr  uint32
+	Vregs [64]uint64 // actually [32]uint128
 }
 
 // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
 type UContext64 struct {
 	Flags  uint64
-	Link   *UContext64
+	Link   uint64
 	Stack  SignalStack
 	Sigset linux.SignalSet
 	// glibc uses a 1024-bit sigset_t
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9b6bb26d0..9379a4d7b 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 7e66c29b0..acbd401a0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -124,10 +125,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+	}
 
-		"net": newDirectory(ctx, map[string]*fs.Inode{
+	if isNetTunSupported(inet.StackFromContext(ctx)) {
+		contents["net"] = newDirectory(ctx, map[string]*fs.Inode{
 			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
-		}, msrc),
+		}, msrc)
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index 755644488..dc7ad075a 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -168,3 +169,9 @@ func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.Eve
 func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
 	fops.device.EventUnregister(e)
 }
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index acab0411a..e0b32e1c1 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -1438,8 +1438,8 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName
 	}, nil
 }
 
-func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
-	uattr, err := dir.Inode.UnstableAttr(ctx)
+func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error {
+	uattr, err := d.Inode.UnstableAttr(ctx)
 	if err != nil {
 		return syserror.EPERM
 	}
@@ -1465,30 +1465,33 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
 	return syserror.EPERM
 }
 
-// MayDelete determines whether `name`, a child of `dir`, can be deleted or
+// MayDelete determines whether `name`, a child of `d`, can be deleted or
 // renamed by `ctx`.
 //
 // Compare Linux kernel fs/namei.c:may_delete.
-func MayDelete(ctx context.Context, root, dir *Dirent, name string) error {
-	if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
+func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error {
+	if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
 		return err
 	}
 
-	victim, err := dir.Walk(ctx, root, name)
+	unlock := d.lockDirectory()
+	defer unlock()
+
+	victim, err := d.walk(ctx, root, name, true /* may unlock */)
 	if err != nil {
 		return err
 	}
 	defer victim.DecRef()
 
-	return mayDelete(ctx, dir, victim)
+	return d.mayDelete(ctx, victim)
 }
 
 // mayDelete determines whether `victim`, a child of `dir`, can be deleted or
 // renamed by `ctx`.
 //
 // Preconditions: `dir` is writable and executable by `ctx`.
-func mayDelete(ctx context.Context, dir, victim *Dirent) error {
-	if err := checkSticky(ctx, dir, victim); err != nil {
+func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error {
+	if err := d.checkSticky(ctx, victim); err != nil {
 		return err
 	}
 
@@ -1542,7 +1545,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 	defer renamed.DecRef()
 
 	// Check that the renamed dirent is deletable.
-	if err := mayDelete(ctx, oldParent, renamed); err != nil {
+	if err := oldParent.mayDelete(ctx, renamed); err != nil {
 		return err
 	}
 
@@ -1580,7 +1583,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		// across the Rename, so must call DecRef manually (no defer).
 
 		// Check that we can delete replaced.
-		if err := mayDelete(ctx, newParent, replaced); err != nil {
+		if err := newParent.mayDelete(ctx, replaced); err != nil {
 			replaced.DecRef()
 			return err
 		}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index daecc4ffe..1922ff08c 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -259,8 +259,8 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, ui
 
 // RemoveXattr implements fs.InodeOperations.RemoveXattr.
 func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
-	i.mu.RLock()
-	defer i.mu.RUnlock()
+	i.mu.Lock()
+	defer i.mu.Unlock()
 	if _, ok := i.xattrs[name]; ok {
 		delete(i.xattrs, name)
 		return nil
diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index e672a438c..a3d10770b 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -36,11 +36,12 @@ func mountPathsAre(root *Dirent, got []*Mount, want ...string) error {
 	gotPaths := make(map[string]struct{}, len(got))
 	gotStr := make([]string, len(got))
 	for i, g := range got {
-		groot := g.Root()
-		name, _ := groot.FullName(root)
-		groot.DecRef()
-		gotStr[i] = name
-		gotPaths[name] = struct{}{}
+		if groot := g.Root(); groot != nil {
+			name, _ := groot.FullName(root)
+			groot.DecRef()
+			gotStr[i] = name
+			gotPaths[name] = struct{}{}
+		}
 	}
 	if len(got) != len(want) {
 		return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want)
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 574a2cc91..c7981f66e 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -100,10 +100,14 @@ func newUndoMount(d *Dirent) *Mount {
 	}
 }
 
-// Root returns the root dirent of this mount. Callers must call DecRef on the
-// returned dirent.
+// Root returns the root dirent of this mount.
+//
+// This may return nil if the mount has already been free. Callers must handle this
+// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent.
 func (m *Mount) Root() *Dirent {
-	m.root.IncRef()
+	if !m.root.TryIncRef() {
+		return nil
+	}
 	return m.root
 }
 
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index c10888100..94deb553b 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -60,13 +60,15 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
 	})
 	for _, m := range ms {
 		mroot := m.Root()
+		if mroot == nil {
+			continue // No longer valid.
+		}
 		mountPath, desc := mroot.FullName(rootDir)
 		mroot.DecRef()
 		if !desc {
 			// MountSources that are not descendants of the chroot jail are ignored.
 			continue
 		}
-
 		fn(mountPath, m)
 	}
 }
@@ -91,6 +93,12 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 	var buf bytes.Buffer
 	forEachMount(mif.t, func(mountPath string, m *fs.Mount) {
+		mroot := m.Root()
+		if mroot == nil {
+			return // No longer valid.
+		}
+		defer mroot.DecRef()
+
 		// Format:
 		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
 		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
@@ -107,9 +115,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 		// (3) Major:Minor device ID. We don't have a superblock, so we
 		// just use the root inode device number.
-		mroot := m.Root()
-		defer mroot.DecRef()
-
 		sa := mroot.Inode.StableAttr
 		fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
 
@@ -207,6 +212,9 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
 		//
 		// The "needs dump"and fsck flags are always 0, which is allowed.
 		root := m.Root()
+		if root == nil {
+			return // No longer valid.
+		}
 		defer root.DecRef()
 
 		flags := root.Inode.MountSource.Flags
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index e657c39bc..6aa17bfc1 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -117,15 +117,19 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
 // default anyways.
 //
 // TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
-func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
 	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
 	creds := auth.CredentialsFromContext(ctx)
+	path := fspath.Parse(pathname)
 	pop := &vfs.PathOperation{
 		Root:               l.root,
-		Start:              l.root,
-		Path:               fspath.Parse(path),
+		Start:              l.workingDir,
+		Path:               path,
 		FollowFinalSymlink: resolveFinal,
 	}
+	if path.Absolute {
+		pop.Start = l.root
+	}
 	fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d00850e25..c4a8f0b38 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1045,13 +1045,13 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 				// using the old file descriptor, preventing us from safely
 				// closing it. We could handle this by invalidating existing
 				// memmap.Translations, but this is expensive. Instead, use
-				// dup2() to make the old file descriptor refer to the new file
+				// dup3 to make the old file descriptor refer to the new file
 				// description, then close the new file descriptor (which is no
 				// longer needed). Racing callers may use the old or new file
 				// description, but this doesn't matter since they refer to the
 				// same file (unless d.fs.opts.overlayfsStaleRead is true,
 				// which we handle separately).
-				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil {
 					d.handleMu.Unlock()
 					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
 					h.close(ctx)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 54c1031a7..e95209661 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -361,8 +361,15 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
 	rw.d.handleMu.RLock()
 	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
 		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
-		rw.d.handleMu.RUnlock()
 		rw.off += n
+		rw.d.dataMu.Lock()
+		if rw.off > rw.d.size {
+			atomic.StoreUint64(&rw.d.size, rw.off)
+			// The remote file's size will implicitly be extended to the correct
+			// value when we write back to it.
+		}
+		rw.d.dataMu.Unlock()
+		rw.d.handleMu.RUnlock()
 		return n, err
 	}
 
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ce08a7d53..10c08fa90 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
 		"net":     newNetDir(root, inoGen, k),
-		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
+		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
-		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
+		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
 	}
 
 	inode := &tasksInode{
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 23b88f7a6..58001d56c 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	return fds, nil
 }
 
+// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
+// the given file description. If it succeeds, it takes a reference on file.
+func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	if minfd < 0 {
+		// Don't accept negative FDs.
+		return -1, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if minfd >= end {
+			return -1, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	fd := minfd
+	if fd < f.next {
+		fd = f.next
+	}
+	for fd < end {
+		if d, _, _ := f.get(fd); d == nil {
+			f.setVFS2(fd, file, flags)
+			if fd == f.next {
+				// Update next search start position.
+				f.next = fd + 1
+			}
+			return fd, nil
+		}
+		fd++
+	}
+	return -1, syscall.EMFILE
+}
+
 // NewFDAt sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
@@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		return syscall.EBADF
 	}
 
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 	}
 
 	// Install the entry.
+	f.mu.Lock()
+	defer f.mu.Unlock()
 	f.setAll(fd, file, fileVFS2, flags)
 	return nil
 }
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 7218aa24e..47f78df9a 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -244,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
+//
+// This is not a valid call after free.
+func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+	if !vd.Ok() {
+		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
+	}
+
+	f.mu.Lock()
+
+	if !f.rootVFS2.Ok() {
+		f.mu.Unlock()
+		panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd))
+	}
+
+	old := f.rootVFS2
+	vd.IncRef()
+	f.rootVFS2 = vd
+	f.mu.Unlock()
+	old.DecRef()
+}
+
 // Umask returns the current umask.
 func (f *FSContext) Umask() uint {
 	f.mu.Lock()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index c62fd6eb1..1d627564f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -247,6 +247,10 @@ type Kernel struct {
 
 	// VFS keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
+
+	// If set to true, report address space activation waits as if the task is in
+	// external wait so that the watchdog doesn't report the task stuck.
+	SleepForAddressSpaceActivation bool
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -751,6 +755,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1477,6 +1483,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 4c049d5b4..f29dc0472 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,25 +1,10 @@
 load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-go_template_instance(
-    name = "buffer_list",
-    out = "buffer_list.go",
-    package = "pipe",
-    prefix = "buffer",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*buffer",
-        "Linker": "*buffer",
-    },
-)
-
 go_library(
     name = "pipe",
     srcs = [
-        "buffer.go",
-        "buffer_list.go",
         "device.go",
         "node.go",
         "pipe.go",
@@ -33,8 +18,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/buffer",
         "//pkg/context",
-        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
@@ -51,7 +36,6 @@ go_test(
     name = "pipe_test",
     size = "small",
     srcs = [
-        "buffer_test.go",
         "node_test.go",
         "pipe_test.go",
     ],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
deleted file mode 100644
index fe3be5dbd..000000000
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"io"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// buffer encapsulates a queueable byte buffer.
-//
-// Note that the total size is slightly less than two pages. This
-// is done intentionally to ensure that the buffer object aligns
-// with runtime internals. We have no hard size or alignment
-// requirements. This two page size will effectively minimize
-// internal fragmentation, but still have a large enough chunk
-// to limit excessive segmentation.
-//
-// +stateify savable
-type buffer struct {
-	data  [8144]byte
-	read  int
-	write int
-	bufferEntry
-}
-
-// Reset resets internal data.
-//
-// This must be called before use.
-func (b *buffer) Reset() {
-	b.read = 0
-	b.write = 0
-}
-
-// Empty indicates the buffer is empty.
-//
-// This indicates there is no data left to read.
-func (b *buffer) Empty() bool {
-	return b.read == b.write
-}
-
-// Full indicates the buffer is full.
-//
-// This indicates there is no capacity left to write.
-func (b *buffer) Full() bool {
-	return b.write == len(b.data)
-}
-
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:]))
-	n, err := safemem.CopySeq(dst, srcs)
-	b.write += int(n)
-	return n, err
-}
-
-// WriteFromReader writes to the buffer from an io.Reader.
-func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) {
-	dst := b.data[b.write:]
-	if count < int64(len(dst)) {
-		dst = b.data[b.write:][:count]
-	}
-	n, err := r.Read(dst)
-	b.write += n
-	return int64(n), err
-}
-
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
-	n, err := safemem.CopySeq(dsts, src)
-	b.read += int(n)
-	return n, err
-}
-
-// ReadToWriter reads from the buffer into an io.Writer.
-func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) {
-	src := b.data[b.read:b.write]
-	if count < int64(len(src)) {
-		src = b.data[b.read:][:count]
-	}
-	n, err := w.Write(src)
-	if !dup {
-		b.read += n
-	}
-	return int64(n), err
-}
-
-// bufferPool is a pool for buffers.
-var bufferPool = sync.Pool{
-	New: func() interface{} {
-		return new(buffer)
-	},
-}
-
-// newBuffer grabs a new buffer from the pool.
-func newBuffer() *buffer {
-	b := bufferPool.Get().(*buffer)
-	b.Reset()
-	return b
-}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 08410283f..725e9db7d 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -20,6 +20,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -70,10 +71,10 @@ type Pipe struct {
 	// mu protects all pipe internal state below.
 	mu sync.Mutex `state:"nosave"`
 
-	// data is the buffer queue of pipe contents.
+	// view is the underlying set of buffers.
 	//
 	// This is protected by mu.
-	data bufferList
+	view buffer.View
 
 	// max is the maximum size of the pipe in bytes. When this max has been
 	// reached, writers will get EWOULDBLOCK.
@@ -81,11 +82,6 @@ type Pipe struct {
 	// This is protected by mu.
 	max int64
 
-	// size is the current size of the pipe in bytes.
-	//
-	// This is protected by mu.
-	size int64
-
 	// hadWriter indicates if this pipe ever had a writer. Note that this
 	// does not necessarily indicate there is *currently* a writer, just
 	// that there has been a writer at some point since the pipe was
@@ -196,7 +192,7 @@ type readOps struct {
 	limit func(int64)
 
 	// read performs the actual read operation.
-	read func(*buffer) (int64, error)
+	read func(*buffer.View) (int64, error)
 }
 
 // read reads data from the pipe into dst and returns the number of bytes
@@ -213,7 +209,7 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	defer p.mu.Unlock()
 
 	// Is the pipe empty?
-	if p.size == 0 {
+	if p.view.Size() == 0 {
 		if !p.HasWriters() {
 			// There are no writers, return EOF.
 			return 0, nil
@@ -222,71 +218,13 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	}
 
 	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
+	if ops.left() > p.view.Size() {
+		ops.limit(p.view.Size())
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Pop the first buffer.
-		first := p.data.Front()
-		if first == nil {
-			break
-		}
-
-		// Copy user data.
-		n, err := ops.read(first)
-		done += int64(n)
-		p.size -= n
-
-		// Empty buffer?
-		if first.Empty() {
-			// Push to the free list.
-			p.data.Remove(first)
-			bufferPool.Put(first)
-		}
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
-}
-
-// dup duplicates all data from this pipe into the given writer.
-//
-// There is no blocking behavior implemented here. The writer may propagate
-// some blocking error. All the writes must be complete writes.
-func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Is the pipe empty?
-	if p.size == 0 {
-		if !p.HasWriters() {
-			// See above.
-			return 0, nil
-		}
-		return 0, syserror.ErrWouldBlock
-	}
-
-	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
-	}
-
-	done := int64(0)
-	for buf := p.data.Front(); buf != nil; buf = buf.Next() {
-		n, err := ops.read(buf)
-		done += n
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
+	// Copy user data; the read op is responsible for trimming.
+	done, err := ops.read(&p.view)
+	return done, err
 }
 
 type writeOps struct {
@@ -297,7 +235,7 @@ type writeOps struct {
 	limit func(int64)
 
 	// write should write to the provided buffer.
-	write func(*buffer) (int64, error)
+	write func(*buffer.View) (int64, error)
 }
 
 // write writes data from sv into the pipe and returns the number of bytes
@@ -317,33 +255,19 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.size; wanted > avail {
+	if avail := p.max - p.view.Size(); wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
 		ops.limit(avail)
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Need a new buffer?
-		last := p.data.Back()
-		if last == nil || last.Full() {
-			// Add a new buffer to the data list.
-			last = newBuffer()
-			p.data.PushBack(last)
-		}
-
-		// Copy user data.
-		n, err := ops.write(last)
-		done += int64(n)
-		p.size += n
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
+	// Copy user data.
+	done, err := ops.write(&p.view)
+	if err != nil {
+		return done, err
 	}
+
 	if wanted > done {
 		// Partial write due to full pipe.
 		return done, syserror.ErrWouldBlock
@@ -396,7 +320,7 @@ func (p *Pipe) HasWriters() bool {
 // Precondition: mu must be held.
 func (p *Pipe) rReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasReaders() && p.data.Front() != nil {
+	if p.HasReaders() && p.view.Size() != 0 {
 		ready |= waiter.EventIn
 	}
 	if !p.HasWriters() && p.hadWriter {
@@ -422,7 +346,7 @@ func (p *Pipe) rReadiness() waiter.EventMask {
 // Precondition: mu must be held.
 func (p *Pipe) wReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasWriters() && p.size < p.max {
+	if p.HasWriters() && p.view.Size() < p.max {
 		ready |= waiter.EventOut
 	}
 	if !p.HasReaders() {
@@ -451,7 +375,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	return p.size
+	return p.view.Size()
 }
 
 // FifoSize implements fs.FifoSizer.FifoSize.
@@ -474,7 +398,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	if size < p.size {
+	if size < p.view.Size() {
 		return 0, syserror.EBUSY
 	}
 	p.max = size
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 80158239e..5a1d4fd57 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			dst = dst.TakeFirst64(l)
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, buf)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, view)
 			dst = dst.DropFirst64(n)
+			view.TrimFront(n)
 			return n, err
 		},
 	})
@@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool)
 		limit: func(l int64) {
 			count = l
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := buf.ReadToWriter(w, count, dup)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToWriter(w, count)
+			if !dup {
+				view.TrimFront(n)
+			}
 			count -= n
 			return n, err
 		},
 	}
-	if dup {
-		// There is no notification for dup operations.
-		return p.dup(ctx, ops)
-	}
 	n, err := p.read(ctx, ops)
 	if n > 0 {
 		p.Notify(waiter.EventOut)
@@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			src = src.TakeFirst64(l)
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := src.CopyInTo(ctx, buf)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := src.CopyInTo(ctx, view)
 			src = src.DropFirst64(n)
 			return n, err
 		},
@@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e
 		limit: func(l int64) {
 			count = l
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := buf.WriteFromReader(r, count)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromReader(r, count)
 			count -= n
 			return n, err
 		},
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e37e23231..2cee2e6ed 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -789,6 +789,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error)
 	return fds[0], nil
 }
 
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
 //
 // This automatically passes the task as the context.
@@ -798,6 +807,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
 	return t.fdTable.NewFDAt(t, fd, file, flags)
 }
 
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
 // WithMuLocked executes f with t.mu locked.
 func (t *Task) WithMuLocked(f func(*Task)) {
 	t.mu.Lock()
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2be982684..0158b1788 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -140,7 +140,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*Task
 	}
 
 	// Prepare a new user address space to load into.
-	m := mm.NewMemoryManager(k, k)
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
 	defer m.DecUsers(ctx)
 	args.MemoryManager = m
 
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 8f57a34a6..00c425cca 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -220,7 +220,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.mu.Unlock()
 	t.unstopVforkParent()
 	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
-	t.MemoryManager().Activate()
+	t.MemoryManager().Activate(t)
 
 	t.ptraceExec(oldTID)
 	return (*runSyscallExit)(nil)
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 6d737d3e5..eeccaa197 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -32,21 +32,21 @@ const (
 // Infof logs an formatted info message by calling log.Infof.
 func (t *Task) Infof(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Info) {
-		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+		log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Warningf logs a warning string by calling log.Warningf.
 func (t *Task) Warningf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Warning) {
-		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+		log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Debugf creates a debug string that includes the task ID.
 func (t *Task) Debugf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Debug) {
-		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+		log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 5568c91bc..799cbcd93 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -126,13 +126,39 @@ func (t *Task) doStop() {
 	}
 }
 
+func (*runApp) handleCPUIDInstruction(t *Task) error {
+	if len(arch.CPUIDInstruction) == 0 {
+		// CPUID emulation isn't supported, but this code can be
+		// executed, because the ptrace platform returns
+		// ErrContextSignalCPUID on page faults too. Look at
+		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more
+		// details.
+		return platform.ErrContextSignal
+	}
+	// Is this a CPUID instruction?
+	region := trace.StartRegion(t.traceContext, cpuidRegion)
+	expected := arch.CPUIDInstruction[:]
+	found := make([]byte, len(expected))
+	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	if err == nil && bytes.Equal(expected, found) {
+		// Skip the cpuid instruction.
+		t.Arch().CPUIDEmulate(t)
+		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+		region.End()
+
+		return nil
+	}
+	region.End() // Not an actual CPUID, but required copy-in.
+	return platform.ErrContextSignal
+}
+
 // The runApp state checks for interrupts before executing untrusted
 // application code.
 //
 // +stateify savable
 type runApp struct{}
 
-func (*runApp) execute(t *Task) taskRunState {
+func (app *runApp) execute(t *Task) taskRunState {
 	if t.interrupted() {
 		// Checkpointing instructs tasks to stop by sending an interrupt, so we
 		// must check for stops before entering runInterrupt (instead of
@@ -237,21 +263,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextSignalCPUID:
-		// Is this a CPUID instruction?
-		region := trace.StartRegion(t.traceContext, cpuidRegion)
-		expected := arch.CPUIDInstruction[:]
-		found := make([]byte, len(expected))
-		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
-		if err == nil && bytes.Equal(expected, found) {
-			// Skip the cpuid instruction.
-			t.Arch().CPUIDEmulate(t)
-			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
-			region.End()
-
+		if err := app.handleCPUIDInstruction(t); err == nil {
 			// Resume execution.
 			return (*runApp)(nil)
 		}
-		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 2bf3ce8a8..b02044ad2 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
 // Activate ensures that the task has an active address space.
 func (t *Task) Activate() {
 	if mm := t.MemoryManager(); mm != nil {
-		if err := mm.Activate(); err != nil {
+		if err := mm.Activate(t); err != nil {
 			panic("unable to activate mm: " + err.Error())
 		}
 	}
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 94d39af60..0332fc71c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -38,7 +39,7 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
 //
 // When this MemoryManager is no longer needed by a task, it should call
 // Deactivate to release the reference.
-func (mm *MemoryManager) Activate() error {
+func (mm *MemoryManager) Activate(ctx context.Context) error {
 	// Fast path: the MemoryManager already has an active
 	// platform.AddressSpace, and we just need to indicate that we need it too.
 	for {
@@ -91,16 +92,20 @@ func (mm *MemoryManager) Activate() error {
 		if as == nil {
 			// AddressSpace is unavailable, we must wait.
 			//
-			// activeMu must not be held while waiting, as the user
-			// of the address space we are waiting on may attempt
-			// to take activeMu.
-			//
-			// Don't call UninterruptibleSleepStart to register the
-			// wait to allow the watchdog stuck task to trigger in
-			// case a process is starved waiting for the address
-			// space.
+			// activeMu must not be held while waiting, as the user of the address
+			// space we are waiting on may attempt to take activeMu.
 			mm.activeMu.Unlock()
+
+			sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+			if sleep {
+				// Mark this task sleeping while waiting for the address space to
+				// prevent the watchdog from reporting it as a stuck task.
+				ctx.UninterruptibleSleepStart(false)
+			}
 			<-c
+			if sleep {
+				ctx.UninterruptibleSleepFinish(false)
+			}
 			continue
 		}
 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 3c263ebaa..d8a5b9d29 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -28,16 +28,17 @@ import (
 )
 
 // NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
-func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
 	return &MemoryManager{
-		p:           p,
-		mfp:         mfp,
-		haveASIO:    p.SupportsAddressSpaceIO(),
-		privateRefs: &privateRefs{},
-		users:       1,
-		auxv:        arch.Auxv{},
-		dumpability: UserDumpable,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		p:                  p,
+		mfp:                mfp,
+		haveASIO:           p.SupportsAddressSpaceIO(),
+		privateRefs:        &privateRefs{},
+		users:              1,
+		auxv:               arch.Auxv{},
+		dumpability:        UserDumpable,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: sleepForActivation,
 	}
 }
 
@@ -79,9 +80,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 		envv:                 mm.envv,
 		auxv:                 append(arch.Auxv(nil), mm.auxv...),
 		// IncRef'd below, once we know that there isn't an error.
-		executable:  mm.executable,
-		dumpability: mm.dumpability,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		executable:         mm.executable,
+		dumpability:        mm.dumpability,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: mm.sleepForActivation,
 	}
 
 	// Copy vmas.
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 637383c7a..c2195ae11 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -226,6 +226,11 @@ type MemoryManager struct {
 	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
 	// must be cloned when CLONE_VM is used.
 	aioManager aioManager
+
+	// sleepForActivation indicates whether the task should report to be sleeping
+	// before trying to activate the address space. When set to true, delays in
+	// activation are not reported as stuck tasks by the watchdog.
+	sleepForActivation bool
 }
 
 // vma represents a virtual memory area.
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index edacca741..fdc308542 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -31,7 +31,7 @@ import (
 func testMemoryManager(ctx context.Context) *MemoryManager {
 	p := platform.FromContext(ctx)
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
-	mm := NewMemoryManager(p, mfp)
+	mm := NewMemoryManager(p, mfp, false)
 	mm.layout = arch.MmapLayout{
 		MinAddr:      p.MinUserAddress(),
 		MaxAddr:      p.MaxUserAddress(),
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 35cd55fef..4b23f7803 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -81,12 +81,6 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 	// Save the death message, which will be thrown.
 	c.dieState.message = msg
 
-	// Reload all registers to have an accurate stack trace when we return
-	// to host mode. This means that the stack should be unwound correctly.
-	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
-		throw(msg)
-	}
-
 	// Setup the trampoline.
 	dieArchSetup(c, context, &c.dieState.guestRegs)
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index a63a6a071..99cac665d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -31,6 +31,12 @@ import (
 //
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+	// Reload all registers to have an accurate stack trace when we return
+	// to host mode. This means that the stack should be unwound correctly.
+	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
+		throw(c.dieState.message)
+	}
+
 	// If the vCPU is in user mode, we set the stack to the stored stack
 	// value in the vCPU itself. We don't want to unwind the user stack.
 	if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 8076c7529..f1afc74dc 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -329,10 +329,12 @@ func (m *machine) Destroy() {
 }
 
 // Get gets an available vCPU.
+//
+// This will return with the OS thread locked.
 func (m *machine) Get() *vCPU {
+	m.mu.RLock()
 	runtime.LockOSThread()
 	tid := procid.Current()
-	m.mu.RLock()
 
 	// Check for an exact match.
 	if c := m.vCPUs[tid]; c != nil {
@@ -343,8 +345,22 @@ func (m *machine) Get() *vCPU {
 
 	// The happy path failed. We now proceed to acquire an exclusive lock
 	// (because the vCPU map may change), and scan all available vCPUs.
+	// In this case, we first unlock the OS thread. Otherwise, if mu is
+	// not available, the current system thread will be parked and a new
+	// system thread spawned. We avoid this situation by simply refreshing
+	// tid after relocking the system thread.
 	m.mu.RUnlock()
+	runtime.UnlockOSThread()
 	m.mu.Lock()
+	runtime.LockOSThread()
+	tid = procid.Current()
+
+	// Recheck for an exact match.
+	if c := m.vCPUs[tid]; c != nil {
+		c.lock()
+		m.mu.Unlock()
+		return c
+	}
 
 	for {
 		// Scan for an available vCPU.
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1c8384e6b..b531f2f85 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,30 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// setMemoryRegion initializes a region.
-//
-// This may be called from bluepillHandler, and therefore returns an errno
-// directly (instead of wrapping in an error) to avoid allocations.
-//
-//go:nosplit
-func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
-	userRegion := userMemoryRegion{
-		slot:          uint32(slot),
-		flags:         0,
-		guestPhysAddr: uint64(physical),
-		memorySize:    uint64(length),
-		userspaceAddr: uint64(virtual),
-	}
-
-	// Set the region.
-	_, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(m.fd),
-		_KVM_SET_USER_MEMORY_REGION,
-		uintptr(unsafe.Pointer(&userRegion)))
-	return errno
-}
-
 type kvmVcpuInit struct {
 	target   uint32
 	features [7]uint32
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index e99798c56..cd74945e7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -183,13 +184,76 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
-	return append(rules, seccomp.RuleSet{
-		Rules: seccomp.SyscallRules{
-			syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-				{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
+	rules = append(rules,
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
 			},
-		},
-		Action: linux.SECCOMP_RET_ALLOW,
-	})
+			Action:   linux.SECCOMP_RET_TRAP,
+			Vsyscall: true,
+		})
+	if defaultAction != linux.SECCOMP_RET_ALLOW {
+		rules = append(rules,
+			seccomp.RuleSet{
+				Rules: seccomp.SyscallRules{
+					syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+						{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+					},
+				},
+				Action: linux.SECCOMP_RET_ALLOW,
+			})
+	}
+	return rules
+}
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because kernels have be backported behavior.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 7b975137f..7f5c393f0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -160,6 +160,15 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
 	return rules
 }
+
+// probeSeccomp returns true if seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8.
+//
+// On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so
+// probeSeccomp can always return true.
+func probeSeccomp() bool {
+	return true
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 74968dfdf..2ce528601 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"syscall"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
@@ -30,54 +29,6 @@ import (
 
 const syscallEvent syscall.Signal = 0x80
 
-// probeSeccomp returns true iff seccomp is run after ptrace notifications,
-// which is generally the case for kernel version >= 4.8. This check is dynamic
-// because kernels have be backported behavior.
-//
-// See createStub for more information.
-//
-// Precondition: the runtime OS thread must be locked.
-func probeSeccomp() bool {
-	// Create a completely new, destroyable process.
-	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
-	if err != nil {
-		panic(fmt.Sprintf("seccomp probe failed: %v", err))
-	}
-	defer t.destroy()
-
-	// Set registers to the yield system call. This call is not allowed
-	// by the filters specified in the attachThread function.
-	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
-	if err := t.setRegs(&regs); err != nil {
-		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
-	}
-
-	for {
-		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
-			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
-		}
-
-		sig := t.wait(stopped)
-		if sig == (syscallEvent | syscall.SIGTRAP) {
-			// Did the seccomp errno hook already run? This would
-			// indicate that seccomp is first in line and we're
-			// less than 4.8.
-			if err := t.getRegs(&regs); err != nil {
-				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
-			}
-			if _, err := syscallReturnValue(&regs); err == nil {
-				// The seccomp errno mode ran first, and reset
-				// the error in the registers.
-				return false
-			}
-			// The seccomp hook did not run yet, and therefore it
-			// is safe to use RET_KILL mode for dispatched calls.
-			return true
-		}
-	}
-}
-
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
@@ -123,18 +74,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 	// stub and all its children. This is used to create child stubs
 	// (below), so we must include the ability to fork, but otherwise lock
 	// down available calls only to what is needed.
-	rules := []seccomp.RuleSet{
-		// Rules for trapping vsyscall access.
-		{
-			Rules: seccomp.SyscallRules{
-				syscall.SYS_GETTIMEOFDAY: {},
-				syscall.SYS_TIME:         {},
-				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
-			},
-			Action:   linux.SECCOMP_RET_TRAP,
-			Vsyscall: true,
-		},
-	}
+	rules := []seccomp.RuleSet{}
 	if defaultAction != linux.SECCOMP_RET_ALLOW {
 		rules = append(rules, seccomp.RuleSet{
 			Rules: seccomp.SyscallRules{
@@ -173,9 +113,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			},
 			Action: linux.SECCOMP_RET_ALLOW,
 		})
-
-		rules = appendArchSeccompRules(rules)
 	}
+	rules = appendArchSeccompRules(rules, defaultAction)
 	instrs, err := seccomp.BuildProgram(rules, defaultAction)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index baa6c4910..d42eda37b 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -25,10 +25,14 @@
 // not available for calls.
 //
 
+// ERET returns using the ELR and SPSR for the current exception level.
 #define ERET() \
   WORD $0xd69f03e0
 
+// RSV_REG is a register that holds el1 information temporarily.
 #define RSV_REG 	R18_PLATFORM
+
+// RSV_REG_APP is a register that holds el0 information temporarily.
 #define RSV_REG_APP 	R9
 
 #define FPEN_NOTRAP 	0x3
@@ -36,6 +40,12 @@
 
 #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
 
+// Saves a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not saved: R9, R18.
 #define REGISTERS_SAVE(reg, offset) \
   MOVD R0, offset+PTRACE_R0(reg); \
   MOVD R1, offset+PTRACE_R1(reg); \
@@ -67,6 +77,12 @@
   MOVD R29, offset+PTRACE_R29(reg); \
   MOVD R30, offset+PTRACE_R30(reg);
 
+// Loads a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not loaded: R9, R18.
 #define REGISTERS_LOAD(reg, offset) \
   MOVD offset+PTRACE_R0(reg), R0; \
   MOVD offset+PTRACE_R1(reg), R1; \
@@ -98,7 +114,7 @@
   MOVD offset+PTRACE_R29(reg), R29; \
   MOVD offset+PTRACE_R30(reg), R30;
 
-//NOP
+// NOP-s
 #define nop31Instructions() \
         WORD $0xd503201f; \
         WORD $0xd503201f; \
@@ -254,6 +270,7 @@
 #define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
 
+// LOAD_KERNEL_ADDRESS loads a kernel address.
 #define LOAD_KERNEL_ADDRESS(from, to) \
 	MOVD from, to; \
 	ORR $0xffff000000000000, to, to;
@@ -263,15 +280,18 @@
 	LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
 	MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
 	MOVD RSV_REG, RSP; \
+	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
 #define SWITCH_TO_APP_PAGETABLE(from) \
 	MOVD CPU_TTBR0_APP(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
 #define SWITCH_TO_KVM_PAGETABLE(from) \
 	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
@@ -294,6 +314,7 @@
 	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
 	ISB $15;
 
+// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1.
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -315,19 +336,22 @@
 	WORD $0xd5384103; \      //  MRS SP_EL0, R3
 	MOVD R3, PTRACE_SP(RSV_REG_APP);
 
+// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
 #define KERNEL_ENTRY_FROM_EL1 \
 	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
-	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// save sentry context
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// Save sentry context.
 	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
 	WORD $0xd5384004; \    //    MRS SPSR_EL1, R4
 	MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
 	MRS ELR_EL1, R4; \
 	MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \
 	MOVD RSP, R4; \
-	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG);
+	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
+	LOAD_KERNEL_STACK(RSV_REG);  // Load the temporary stack.
 
+// Halt halts execution.
 TEXT ·Halt(SB),NOSPLIT,$0
-	// clear bluepill.
+	// Clear bluepill.
 	WORD $0xd538d092   //MRS   TPIDR_EL1, R18
 	CMP RSV_REG, R9
 	BNE mmio_exit
@@ -341,8 +365,22 @@ mmio_exit:
 	// MMIO_EXIT.
 	MOVD $0, R9
 	MOVD R0, 0xffff000000001000(R9)
-	B ·kernelExitToEl1(SB)
+	RET
+
+// HaltAndResume halts execution and point the pointer to the resume function.
+TEXT ·HaltAndResume(SB),NOSPLIT,$0
+	BL ·Halt(SB)
+	B ·kernelExitToEl1(SB) // Resume.
 
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0
+	WORD $0xd538d092            // MRS TPIDR_EL1, R18
+	MOVD CPU_SELF(RSV_REG), R3  // Load vCPU.
+	MOVD R3, 8(RSP)             // First argument (vCPU).
+	CALL ·kernelSyscall(SB)     // Call the trampoline.
+	B ·kernelExitToEl1(SB)      // Resume.
+
+// Shutdown stops the guest.
 TEXT ·Shutdown(SB),NOSPLIT,$0
 	// PSCI EVENT.
 	MOVD $0x84000009, R0
@@ -429,6 +467,7 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 	ERET()
 
+// Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
 	IRQ_DISABLE
 	MOVD R8, RSV_REG
@@ -437,18 +476,23 @@ TEXT ·Start(SB),NOSPLIT,$0
 
 	B ·kernelExitToEl1(SB)
 
+// El1_sync_invalid is the handler for an invalid EL1_sync.
 TEXT ·El1_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_irq_invalid is the handler for an invalid El1_irq.
 TEXT ·El1_irq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq_invalid is the handler for an invalid El1_fiq.
 TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error_invalid is the handler for an invalid El1_error.
 TEXT ·El1_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_sync is the handler for El1_sync.
 TEXT ·El1_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL1
 	WORD $0xd5385219        // MRS ESR_EL1, R25
@@ -484,10 +528,10 @@ el1_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_ia:
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_sp_pc:
 	B ·Shutdown(SB)
@@ -496,7 +540,9 @@ el1_undef:
 	B ·Shutdown(SB)
 
 el1_svc:
-	B ·Halt(SB)
+	MOVD $0, CPU_ERROR_CODE(RSV_REG)
+	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
+	B ·HaltEl1SvcAndResume(SB)
 
 el1_dbg:
 	B ·Shutdown(SB)
@@ -508,15 +554,19 @@ el1_fpsimd_acc:
 el1_invalid:
 	B ·Shutdown(SB)
 
+// El1_irq is the handler for El1_irq.
 TEXT ·El1_irq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq is the handler for El1_fiq.
 TEXT ·El1_fiq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error is the handler for El1_error.
 TEXT ·El1_error(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El0_sync is the handler for El0_sync.
 TEXT ·El0_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL0
 	WORD $0xd5385219	// MRS ESR_EL1, R25
@@ -554,7 +604,7 @@ el0_svc:
 	MOVD $Syscall, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_da:
 	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
@@ -568,7 +618,7 @@ el0_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_ia:
 	B ·Shutdown(SB)
@@ -613,7 +663,7 @@ TEXT ·El0_error(SB),NOSPLIT,$0
 	MOVD $VirtualizationException, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
@@ -627,6 +677,7 @@ TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0
 TEXT ·El0_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// Vectors implements exception vector table.
 TEXT ·Vectors(SB),NOSPLIT,$0
 	B ·El1_sync_invalid(SB)
 	nop31Instructions()
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index c3d341998..ccacaea6b 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -16,6 +16,14 @@
 
 package ring0
 
+// HaltAndResume halts execution and point the pointer to the resume function.
+//go:nosplit
+func HaltAndResume()
+
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+//go:nosplit
+func HaltEl1SvcAndResume()
+
 // init initializes architecture-specific state.
 func (k *Kernel) init(opts KernelOpts) {
 	// Save the root page tables.
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 4f2406ce3..581841555 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -80,7 +80,7 @@ go_library(
         "pagetables_amd64.go",
         "pagetables_arm64.go",
         "pagetables_x86.go",
-        "pcids_x86.go",
+        "pcids.go",
         "walker_amd64.go",
         "walker_arm64.go",
         "walker_empty.go",
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids.go
index e199bae18..9206030bf 100644
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pcids.go
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
-
 package pagetables
 
 import (
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e187276c5..13a9a60b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,14 +712,44 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
 	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// In function AddressAndFamily, we check the address length which is
+	// not needed for AF_PACKET bind.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
@@ -2637,7 +2667,9 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 		}
 
 		// Add bytes removed from the endpoint but not yet sent to the caller.
+		s.readMu.Lock()
 		v += len(s.readView)
+		s.readMu.Unlock()
 
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51e6d81b2..c0512de89 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -632,4 +632,13 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
 		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
 	},
+	linux.SOL_NETLINK: {
+		linux.NETLINK_BROADCAST_ERROR:  "NETLINK_BROADCAST_ERROR",
+		linux.NETLINK_CAP_ACK:          "NETLINK_CAP_ACK",
+		linux.NETLINK_DUMP_STRICT_CHK:  "NETLINK_DUMP_STRICT_CHK",
+		linux.NETLINK_EXT_ACK:          "NETLINK_EXT_ACK",
+		linux.NETLINK_LIST_MEMBERSHIPS: "NETLINK_LIST_MEMBERSHIPS",
+		linux.NETLINK_NO_ENOBUFS:       "NETLINK_NO_ENOBUFS",
+		linux.NETLINK_PKTINFO:          "NETLINK_PKTINFO",
+	},
 }
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index c7883e68e..0d24fd3c4 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,6 +42,8 @@ go_library(
         "sys_socket.go",
         "sys_splice.go",
         "sys_stat.go",
+        "sys_stat_amd64.go",
+        "sys_stat_arm64.go",
         "sys_sync.go",
         "sys_sysinfo.go",
         "sys_syslog.go",
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index fbef5b376..3ab93fbde 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // EpollCreate1 implements the epoll_create1(2) linux syscall.
 func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	flags := args[0].Int()
@@ -164,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	return EpollWait(t, args)
 }
+
+// LINT.ThenChange(vfs2/epoll.go)
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 421845ebb..d10a9bed8 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
 	return path, dirPath, nil
 }
 
+// LINT.IfChange
+
 func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Ioctl implements linux syscall ioctl(2).
 func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/ioctl.go)
+
+// LINT.IfChange
+
 // Getcwd implements the linux syscall getcwd(2).
 func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/fscontext.go)
+
+// LINT.IfChange
+
 // Close implements linux syscall close(2).
 func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/fd.go)
+
 const (
 	_FADV_NORMAL     = 0
 	_FADV_RANDOM     = 1
@@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.IfChange
+
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1218,7 +1236,7 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTEMPTY
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
@@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	return n, nil, err
 }
 
+// LINT.ThenChange(vfs2/stat.go)
+
+// LINT.IfChange
+
 func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1491,7 +1517,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTDIR
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
@@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, unlinkAt(t, dirFD, addr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Truncate implements linux syscall truncate(2).
 func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
 // Umask implements linux syscall umask(2).
 func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	mask := args[0].ModeT()
@@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	return uintptr(mask), nil, nil
 }
 
+// LINT.IfChange
+
 // Change ownership of a file.
 //
 // uid and gid may be -1, in which case they will not be changed.
@@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
+// LINT.IfChange
+
 func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
 	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
 	if err != nil {
@@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
 // Fallocate implements linux system call fallocate(2).
 func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index f66f4ffde..b126fecc0 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -27,6 +27,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Getdents implements linux syscall getdents(2) for 64bit systems.
 func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
 func (ds *direntSerializer) Written() int {
 	return ds.written
 }
+
+// LINT.ThenChange(vfs2/getdents.go)
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 297e920c4..3f7691eae 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Lseek implements linux syscall lseek(2).
 func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 	return uintptr(offset), nil, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 9959f6e61..91694d374 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 	return uintptr(addr), nil, nil
 }
 
+// LINT.IfChange
+
 // Mmap implements linux syscall mmap(2).
 func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	prot := args[2].Int()
@@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(rv), nil, err
 }
 
+// LINT.ThenChange(vfs2/mmap.go)
+
 // Munmap implements linux syscall munmap(2).
 func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 227692f06..78a2cb750 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskRead contains events that can be triggered on reads.
 	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
@@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 11f25e00d..9bd2df104 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,23 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
-	return linux.Stat{
-		Dev:     sattr.DeviceID,
-		Ino:     sattr.InodeID,
-		Nlink:   uattr.Links,
-		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
-		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
-		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
-		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
-		Size:    uattr.Size,
-		Blksize: sattr.BlockSize,
-		Blocks:  uattr.Usage / 512,
-		ATime:   uattr.AccessTime.Timespec(),
-		MTime:   uattr.ModificationTime.Timespec(),
-		CTime:   uattr.StatusChangeTime.Timespec(),
-	}
-}
+// LINT.IfChange
 
 // Stat implements linux syscall stat(2).
 func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
@@ -297,3 +281,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
 	_, err = t.CopyOut(addr, &statfs)
 	return err
 }
+
+// LINT.ThenChange(vfs2/stat.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..0a04a6113
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_amd64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..5a3b1bfad
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uint32(uattr.Links),
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: int32(sattr.BlockSize),
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_arm64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 3e55235bd..5ad465ae3 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Sync implements linux system call sync(2).
 func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	t.MountNamespace().SyncAll(t)
@@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 
 	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
 }
+
+// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index aba892939..506ee54ce 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskWrite contains events that can be triggered on writes.
 	//
@@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 9d8140b8a..2de5e3422 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return getXattrFromPath(t, args, true)
@@ -418,3 +420,5 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
 
 	return d.Inode.RemoveXattr(t, d, name)
 }
+
+// LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 6b8a00b6e..e7695e995 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,18 +5,46 @@ package(licenses = ["notice"])
 go_library(
     name = "vfs2",
     srcs = [
+        "epoll.go",
+        "epoll_unsafe.go",
+        "execve.go",
+        "fd.go",
+        "filesystem.go",
+        "fscontext.go",
+        "getdents.go",
+        "ioctl.go",
         "linux64.go",
         "linux64_override_amd64.go",
         "linux64_override_arm64.go",
-        "sys_read.go",
+        "mmap.go",
+        "path.go",
+        "poll.go",
+        "read_write.go",
+        "setstat.go",
+        "stat.go",
+        "stat_amd64.go",
+        "stat_arm64.go",
+        "sync.go",
+        "xattr.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
new file mode 100644
index 000000000..d6cb0e79a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -0,0 +1,225 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EpollCreate1 implements Linux syscall epoll_create1(2).
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^linux.EPOLL_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements Linux syscall epoll_create(2).
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+
+	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
+	// than zero" - epoll_create(2)
+	if size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements Linux syscall epoll_ctl(2).
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	op := args[1].Int()
+	fd := args[2].Int()
+	eventAddr := args[3].Pointer()
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	if epfile == file {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var event linux.EpollEvent
+	switch op {
+	case linux.EPOLL_CTL_ADD:
+		if err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.AddInterest(file, fd, event)
+	case linux.EPOLL_CTL_DEL:
+		return 0, nil, ep.DeleteInterest(file, fd)
+	case linux.EPOLL_CTL_MOD:
+		if err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.ModifyInterest(file, fd, event)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
+		return 0, nil, syserror.EINVAL
+	}
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
+	// maxEvents), so that the buffer can be allocated on the stack.
+	var (
+		events       [16]linux.EpollEvent
+		total        int
+		ch           chan struct{}
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	for {
+		batchEvents := len(events)
+		if batchEvents > maxEvents {
+			batchEvents = maxEvents
+		}
+		n := ep.ReadEvents(events[:batchEvents])
+		maxEvents -= n
+		if n != 0 {
+			// Copy what we read out.
+			copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n])
+			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
+			total += copiedEvents
+			if err != nil {
+				if total != 0 {
+					return uintptr(total), nil, nil
+				}
+				return 0, nil, err
+			}
+			// If we've filled the application's event buffer, we're done.
+			if maxEvents == 0 {
+				return uintptr(total), nil, nil
+			}
+			// Loop if we read a full batch, under the expectation that there
+			// may be more events to read.
+			if n == batchEvents {
+				continue
+			}
+		}
+		// We get here if n != batchEvents. If we read any number of events
+		// (just now, or in a previous iteration of this loop), or if timeout
+		// is 0 (such that epoll_wait should be non-blocking), return the
+		// events we've read so far to the application.
+		if total != 0 || timeout == 0 {
+			return uintptr(total), nil, nil
+		}
+		// In the first iteration of this loop, register with the epoll
+		// instance for readability events, but then immediately continue the
+		// loop since we need to retry ReadEvents() before blocking. In all
+		// subsequent iterations, block until events are available, the timeout
+		// expires, or an interrupt arrives.
+		if ch == nil {
+			var w waiter.Entry
+			w, ch = waiter.NewChannelEntry(nil)
+			epfile.EventRegister(&w, waiter.EventIn)
+			defer epfile.EventUnregister(&w)
+		} else {
+			// Set up the timer if a timeout was specified.
+			if timeout > 0 && !haveDeadline {
+				timeoutDur := time.Duration(timeout) * time.Millisecond
+				deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+				haveDeadline = true
+			}
+			if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+				if err == syserror.ETIMEDOUT {
+					err = nil
+				}
+				// total must be 0 since otherwise we would have returned
+				// above.
+				return 0, nil, err
+			}
+		}
+	}
+}
+
+// EpollPwait implements Linux syscall epoll_pwait(2).
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	maskAddr := args[4].Pointer()
+	maskSize := uint(args[5].Uint())
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
new file mode 100644
index 000000000..825f325bf
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
@@ -0,0 +1,44 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"reflect"
+	"runtime"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{}))
+
+func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) {
+	if len(events) == 0 {
+		return 0, nil
+	}
+	// Cast events to a byte slice for copying.
+	var eventBytes []byte
+	eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes))
+	eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0])))
+	eventBytesHdr.Len = len(events) * sizeofEpollEvent
+	eventBytesHdr.Cap = len(events) * sizeofEpollEvent
+	copiedBytes, err := t.CopyOutBytes(addr, eventBytes)
+	runtime.KeepAlive(events)
+	copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
+	return copiedEvents, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
new file mode 100644
index 000000000..aef0078a8
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathnameAddr := args[0].Pointer()
+	argvAddr := args[1].Pointer()
+	envvAddr := args[2].Pointer()
+	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+	if err != nil {
+		return 0, nil, err
+	}
+	var argv, envv []string
+	if argvAddr != 0 {
+		var err error
+		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+	if envvAddr != 0 {
+		var err error
+		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	var executable fsbridge.File
+	closeOnExec := false
+	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
+		// We must open the executable ourselves since dirfd is used as the
+		// starting point while resolving path, but the task working directory
+		// is used as the starting point while resolving interpreters (Linux:
+		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
+		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
+		// incapable of handling this correctly.
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd)
+		if dirfile == nil {
+			return 0, nil, syserror.EBADF
+		}
+		start := dirfile.VirtualDentry()
+		start.IncRef()
+		dirfile.DecRef()
+		closeOnExec = dirfileFlags.CloseOnExec
+		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		}, &vfs.OpenOptions{
+			Flags:    linux.O_RDONLY,
+			FileExec: true,
+		})
+		start.DecRef()
+		if err != nil {
+			return 0, nil, err
+		}
+		defer file.DecRef()
+		executable = fsbridge.NewVFSFile(file)
+	}
+
+	// Load the new TaskContext.
+	mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
+	defer mntns.DecRef()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	defer wd.DecRef()
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Opener:              fsbridge.NewVFSLookup(mntns, root, wd),
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		Filename:            pathname,
+		File:                executable,
+		CloseOnExec:         closeOnExec,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+	if se != nil {
+		return 0, nil, se.ToError()
+	}
+
+	ctrl, err := t.Execve(tc)
+	return 0, ctrl, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
new file mode 100644
index 000000000..3afcea665
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -0,0 +1,147 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Close implements Linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	// Note that Remove provides a reference on the file that we may use to
+	// flush. It is still active until we drop the final reference below
+	// (and other reference-holding operations complete).
+	_, file := t.FDTable().Remove(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := file.OnClose(t)
+	return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
+}
+
+// Dup implements Linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, syserror.EMFILE
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements Linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+
+	if oldfd == newfd {
+		// As long as oldfd is valid, dup2() does nothing and returns newfd.
+		file := t.GetFileVFS2(oldfd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		file.DecRef()
+		return uintptr(newfd), nil, nil
+	}
+
+	return dup3(t, oldfd, newfd, 0)
+}
+
+// Dup3 implements Linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return dup3(t, oldfd, newfd, flags)
+}
+
+func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^linux.O_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(oldfd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	cmd := args[1].Int()
+
+	file, flags := t.FDTable().GetVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		minfd := args[2].Int()
+		fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{
+			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+		})
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+		return 0, nil, nil
+	case linux.F_GETFL:
+		return uintptr(file.StatusFlags()), nil, nil
+	case linux.F_SETFL:
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	default:
+		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
new file mode 100644
index 000000000..fc5ceea4c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -0,0 +1,326 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Link implements Linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Linkat implements Linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+		return syserror.ENOENT
+	}
+
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
+}
+
+// Mkdir implements Linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements Linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, mkdirat(t, dirfd, addr, mode)
+}
+
+func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
+		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+}
+
+// Mknod implements Linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	dev := args[2].Uint()
+	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+}
+
+// Mknodat implements Linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	dev := args[3].Uint()
+	return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+}
+
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	major, minor := linux.DecodeDeviceID(dev)
+	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
+		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
+		DevMajor: uint32(major),
+		DevMinor: minor,
+	})
+}
+
+// Open implements Linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+	mode := args[2].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, flags, mode)
+}
+
+// Openat implements Linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+	mode := args[3].ModeT()
+	return openat(t, dirfd, addr, flags, mode)
+}
+
+// Creat implements Linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
+}
+
+func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
+		Flags: flags,
+		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	return uintptr(fd), nil, err
+}
+
+// Rename implements Linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Renameat implements Linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
+}
+
+// Renameat2 implements Linux syscall renameat2(2).
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Uint()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
+		Flags: flags,
+	})
+}
+
+// Rmdir implements Linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlink implements Linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlinkat implements Linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirat(t, dirfd, pathAddr)
+	}
+	return 0, nil, unlinkat(t, dirfd, pathAddr)
+}
+
+// Symlink implements Linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	linkpathAddr := args[1].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
+}
+
+// Symlinkat implements Linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	newdirfd := args[1].Int()
+	linkpathAddr := args[2].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
+}
+
+func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
+	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
+	if err != nil {
+		return err
+	}
+	linkpath, err := copyInPath(t, linkpathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
new file mode 100644
index 000000000..317409a18
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getcwd implements Linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+
+	root := t.FSContext().RootDirectoryVFS2()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
+	root.DecRef()
+	wd.DecRef()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(s)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Construct a byte slice containing a NUL terminator.
+	buf := t.CopyScratchBuffer(len(s) + 1)
+	copy(buf, s)
+	buf[len(buf)-1] = 0
+
+	// Write the pathname slice.
+	n, err := t.CopyOutBytes(addr, buf)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Chdir implements Linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Fchdir implements Linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Chroot implements Linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetRootDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
new file mode 100644
index 000000000..ddc140b65
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Getdents implements Linux syscall getdents(2).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, false /* isGetdents64 */)
+}
+
+// Getdents64 implements Linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, true /* isGetdents64 */)
+}
+
+func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	cb := getGetdentsCallback(t, addr, size, isGetdents64)
+	err := file.IterDirents(t, cb)
+	n := size - cb.remaining
+	putGetdentsCallback(cb)
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+type getdentsCallback struct {
+	t            *kernel.Task
+	addr         usermem.Addr
+	remaining    int
+	isGetdents64 bool
+}
+
+var getdentsCallbackPool = sync.Pool{
+	New: func() interface{} {
+		return &getdentsCallback{}
+	},
+}
+
+func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
+	cb := getdentsCallbackPool.Get().(*getdentsCallback)
+	*cb = getdentsCallback{
+		t:            t,
+		addr:         addr,
+		remaining:    size,
+		isGetdents64: isGetdents64,
+	}
+	return cb
+}
+
+func putGetdentsCallback(cb *getdentsCallback) {
+	cb.t = nil
+	getdentsCallbackPool.Put(cb)
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
+	var buf []byte
+	if cb.isGetdents64 {
+		// struct linux_dirent64 {
+		//     ino64_t        d_ino;    /* 64-bit inode number */
+		//     off64_t        d_off;    /* 64-bit offset to next structure */
+		//     unsigned short d_reclen; /* Size of this dirent */
+		//     unsigned char  d_type;   /* File type */
+		//     char           d_name[]; /* Filename (null-terminated) */
+		// };
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		if size < cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		buf[18] = dirent.Type
+		copy(buf[19:], dirent.Name)
+		buf[size-1] = 0 // NUL terminator
+	} else {
+		// struct linux_dirent {
+		//     unsigned long  d_ino;     /* Inode number */
+		//     unsigned long  d_off;     /* Offset to next linux_dirent */
+		//     unsigned short d_reclen;  /* Length of this linux_dirent */
+		//     char           d_name[];  /* Filename (null-terminated) */
+		//                       /* length is actually (d_reclen - 2 -
+		//                          offsetof(struct linux_dirent, d_name)) */
+		//     /*
+		//     char           pad;       // Zero padding byte
+		//     char           d_type;    // File type (only since Linux
+		//                               // 2.6.4); offset is (d_reclen - 1)
+		//     */
+		// };
+		if cb.t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
+		}
+		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
+		if size < cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		copy(buf[18:], dirent.Name)
+		buf[size-3] = 0 // NUL terminator
+		buf[size-2] = 0 // zero padding byte
+		buf[size-1] = dirent.Type
+	}
+	n, err := cb.t.CopyOutBytes(cb.addr, buf)
+	if err != nil {
+		// Don't report partially-written dirents by advancing cb.addr or
+		// cb.remaining.
+		return err
+	}
+	cb.addr += usermem.Addr(n)
+	cb.remaining -= n
+	return nil
+}
diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 4d54b8b8f..5a2418da9 100644
--- a/pkg/sentry/kernel/pipe/buffer_test.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,21 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package pipe
+package vfs2
 
 import (
-	"testing"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func TestBufferSize(t *testing.T) {
-	bufferSize := unsafe.Sizeof(buffer{})
-	if bufferSize < usermem.PageSize {
-		t.Errorf("buffer is less than a page")
-	}
-	if bufferSize > (2 * usermem.PageSize) {
-		t.Errorf("buffer is greater than two pages")
+// Ioctl implements Linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
 	}
+	defer file.DecRef()
+
+	ret, err := file.Ioctl(t, t.MemoryManager(), args)
+	return ret, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index e0ac32b33..7d220bc20 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package vfs2
 
 import (
@@ -22,110 +24,142 @@ import (
 // Override syscall table to add syscalls implementations from this package.
 func Override(table map[uintptr]kernel.Syscall) {
 	table[0] = syscalls.Supported("read", Read)
-
-	// Remove syscalls that haven't been converted yet. It's better to get ENOSYS
-	// rather than a SIGSEGV deep in the stack.
-	delete(table, 1)   // write
-	delete(table, 2)   // open
-	delete(table, 3)   // close
-	delete(table, 4)   // stat
-	delete(table, 5)   // fstat
-	delete(table, 6)   // lstat
-	delete(table, 7)   // poll
-	delete(table, 8)   // lseek
-	delete(table, 9)   // mmap
-	delete(table, 16)  // ioctl
-	delete(table, 17)  // pread64
-	delete(table, 18)  // pwrite64
-	delete(table, 19)  // readv
-	delete(table, 20)  // writev
-	delete(table, 21)  // access
-	delete(table, 22)  // pipe
-	delete(table, 32)  // dup
-	delete(table, 33)  // dup2
-	delete(table, 40)  // sendfile
-	delete(table, 59)  // execve
-	delete(table, 72)  // fcntl
-	delete(table, 73)  // flock
-	delete(table, 74)  // fsync
-	delete(table, 75)  // fdatasync
-	delete(table, 76)  // truncate
-	delete(table, 77)  // ftruncate
-	delete(table, 78)  // getdents
-	delete(table, 79)  // getcwd
-	delete(table, 80)  // chdir
-	delete(table, 81)  // fchdir
-	delete(table, 82)  // rename
-	delete(table, 83)  // mkdir
-	delete(table, 84)  // rmdir
-	delete(table, 85)  // creat
-	delete(table, 86)  // link
-	delete(table, 87)  // unlink
-	delete(table, 88)  // symlink
-	delete(table, 89)  // readlink
-	delete(table, 90)  // chmod
-	delete(table, 91)  // fchmod
-	delete(table, 92)  // chown
-	delete(table, 93)  // fchown
-	delete(table, 94)  // lchown
-	delete(table, 133) // mknod
-	delete(table, 137) // statfs
-	delete(table, 138) // fstatfs
-	delete(table, 161) // chroot
-	delete(table, 162) // sync
+	table[1] = syscalls.Supported("write", Write)
+	table[2] = syscalls.Supported("open", Open)
+	table[3] = syscalls.Supported("close", Close)
+	table[4] = syscalls.Supported("stat", Stat)
+	table[5] = syscalls.Supported("fstat", Fstat)
+	table[6] = syscalls.Supported("lstat", Lstat)
+	table[7] = syscalls.Supported("poll", Poll)
+	table[8] = syscalls.Supported("lseek", Lseek)
+	table[9] = syscalls.Supported("mmap", Mmap)
+	table[16] = syscalls.Supported("ioctl", Ioctl)
+	table[17] = syscalls.Supported("pread64", Pread64)
+	table[18] = syscalls.Supported("pwrite64", Pwrite64)
+	table[19] = syscalls.Supported("readv", Readv)
+	table[20] = syscalls.Supported("writev", Writev)
+	table[21] = syscalls.Supported("access", Access)
+	delete(table, 22) // pipe
+	table[23] = syscalls.Supported("select", Select)
+	table[32] = syscalls.Supported("dup", Dup)
+	table[33] = syscalls.Supported("dup2", Dup2)
+	delete(table, 40) // sendfile
+	delete(table, 41) // socket
+	delete(table, 42) // connect
+	delete(table, 43) // accept
+	delete(table, 44) // sendto
+	delete(table, 45) // recvfrom
+	delete(table, 46) // sendmsg
+	delete(table, 47) // recvmsg
+	delete(table, 48) // shutdown
+	delete(table, 49) // bind
+	delete(table, 50) // listen
+	delete(table, 51) // getsockname
+	delete(table, 52) // getpeername
+	delete(table, 53) // socketpair
+	delete(table, 54) // setsockopt
+	delete(table, 55) // getsockopt
+	table[59] = syscalls.Supported("execve", Execve)
+	table[72] = syscalls.Supported("fcntl", Fcntl)
+	delete(table, 73) // flock
+	table[74] = syscalls.Supported("fsync", Fsync)
+	table[75] = syscalls.Supported("fdatasync", Fdatasync)
+	table[76] = syscalls.Supported("truncate", Truncate)
+	table[77] = syscalls.Supported("ftruncate", Ftruncate)
+	table[78] = syscalls.Supported("getdents", Getdents)
+	table[79] = syscalls.Supported("getcwd", Getcwd)
+	table[80] = syscalls.Supported("chdir", Chdir)
+	table[81] = syscalls.Supported("fchdir", Fchdir)
+	table[82] = syscalls.Supported("rename", Rename)
+	table[83] = syscalls.Supported("mkdir", Mkdir)
+	table[84] = syscalls.Supported("rmdir", Rmdir)
+	table[85] = syscalls.Supported("creat", Creat)
+	table[86] = syscalls.Supported("link", Link)
+	table[87] = syscalls.Supported("unlink", Unlink)
+	table[88] = syscalls.Supported("symlink", Symlink)
+	table[89] = syscalls.Supported("readlink", Readlink)
+	table[90] = syscalls.Supported("chmod", Chmod)
+	table[91] = syscalls.Supported("fchmod", Fchmod)
+	table[92] = syscalls.Supported("chown", Chown)
+	table[93] = syscalls.Supported("fchown", Fchown)
+	table[94] = syscalls.Supported("lchown", Lchown)
+	table[132] = syscalls.Supported("utime", Utime)
+	table[133] = syscalls.Supported("mknod", Mknod)
+	table[137] = syscalls.Supported("statfs", Statfs)
+	table[138] = syscalls.Supported("fstatfs", Fstatfs)
+	table[161] = syscalls.Supported("chroot", Chroot)
+	table[162] = syscalls.Supported("sync", Sync)
 	delete(table, 165) // mount
 	delete(table, 166) // umount2
-	delete(table, 172) // iopl
-	delete(table, 173) // ioperm
 	delete(table, 187) // readahead
-	delete(table, 188) // setxattr
-	delete(table, 189) // lsetxattr
-	delete(table, 190) // fsetxattr
-	delete(table, 191) // getxattr
-	delete(table, 192) // lgetxattr
-	delete(table, 193) // fgetxattr
+	table[188] = syscalls.Supported("setxattr", Setxattr)
+	table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
+	table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
+	table[191] = syscalls.Supported("getxattr", Getxattr)
+	table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
+	table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
+	table[194] = syscalls.Supported("listxattr", Listxattr)
+	table[195] = syscalls.Supported("llistxattr", Llistxattr)
+	table[196] = syscalls.Supported("flistxattr", Flistxattr)
+	table[197] = syscalls.Supported("removexattr", Removexattr)
+	table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
+	table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
 	delete(table, 206) // io_setup
 	delete(table, 207) // io_destroy
 	delete(table, 208) // io_getevents
 	delete(table, 209) // io_submit
 	delete(table, 210) // io_cancel
-	delete(table, 213) // epoll_create
-	delete(table, 214) // epoll_ctl_old
-	delete(table, 215) // epoll_wait_old
-	delete(table, 216) // remap_file_pages
-	delete(table, 217) // getdents64
-	delete(table, 232) // epoll_wait
-	delete(table, 233) // epoll_ctl
+	table[213] = syscalls.Supported("epoll_create", EpollCreate)
+	table[217] = syscalls.Supported("getdents64", Getdents64)
+	delete(table, 221) // fdavise64
+	table[232] = syscalls.Supported("epoll_wait", EpollWait)
+	table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
+	table[235] = syscalls.Supported("utimes", Utimes)
 	delete(table, 253) // inotify_init
 	delete(table, 254) // inotify_add_watch
 	delete(table, 255) // inotify_rm_watch
-	delete(table, 257) // openat
-	delete(table, 258) // mkdirat
-	delete(table, 259) // mknodat
-	delete(table, 260) // fchownat
-	delete(table, 261) // futimesat
-	delete(table, 262) // fstatat
-	delete(table, 263) // unlinkat
-	delete(table, 264) // renameat
-	delete(table, 265) // linkat
-	delete(table, 266) // symlinkat
-	delete(table, 267) // readlinkat
-	delete(table, 268) // fchmodat
-	delete(table, 269) // faccessat
-	delete(table, 270) // pselect
-	delete(table, 271) // ppoll
+	table[257] = syscalls.Supported("openat", Openat)
+	table[258] = syscalls.Supported("mkdirat", Mkdirat)
+	table[259] = syscalls.Supported("mknodat", Mknodat)
+	table[260] = syscalls.Supported("fchownat", Fchownat)
+	table[261] = syscalls.Supported("futimens", Futimens)
+	table[262] = syscalls.Supported("newfstatat", Newfstatat)
+	table[263] = syscalls.Supported("unlinkat", Unlinkat)
+	table[264] = syscalls.Supported("renameat", Renameat)
+	table[265] = syscalls.Supported("linkat", Linkat)
+	table[266] = syscalls.Supported("symlinkat", Symlinkat)
+	table[267] = syscalls.Supported("readlinkat", Readlinkat)
+	table[268] = syscalls.Supported("fchmodat", Fchmodat)
+	table[269] = syscalls.Supported("faccessat", Faccessat)
+	table[270] = syscalls.Supported("pselect", Pselect)
+	table[271] = syscalls.Supported("ppoll", Ppoll)
+	delete(table, 275) // splice
+	delete(table, 276) // tee
+	table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
+	table[280] = syscalls.Supported("utimensat", Utimensat)
+	table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
+	delete(table, 282) // signalfd
+	delete(table, 283) // timerfd_create
+	delete(table, 284) // eventfd
 	delete(table, 285) // fallocate
-	delete(table, 291) // epoll_create1
-	delete(table, 292) // dup3
+	delete(table, 286) // timerfd_settime
+	delete(table, 287) // timerfd_gettime
+	delete(table, 288) // accept4
+	delete(table, 289) // signalfd4
+	delete(table, 290) // eventfd2
+	table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
+	table[292] = syscalls.Supported("dup3", Dup3)
 	delete(table, 293) // pipe2
 	delete(table, 294) // inotify_init1
-	delete(table, 295) // preadv
-	delete(table, 296) // pwritev
-	delete(table, 306) // syncfs
-	delete(table, 316) // renameat2
+	table[295] = syscalls.Supported("preadv", Preadv)
+	table[296] = syscalls.Supported("pwritev", Pwritev)
+	delete(table, 299) // recvmmsg
+	table[306] = syscalls.Supported("syncfs", Syncfs)
+	delete(table, 307) // sendmmsg
+	table[316] = syscalls.Supported("renameat2", Renameat2)
 	delete(table, 319) // memfd_create
-	delete(table, 322) // execveat
-	delete(table, 327) // preadv2
-	delete(table, 328) // pwritev2
-	delete(table, 332) // statx
+	table[322] = syscalls.Supported("execveat", Execveat)
+	table[327] = syscalls.Supported("preadv2", Preadv2)
+	table[328] = syscalls.Supported("pwritev2", Pwritev2)
+	table[332] = syscalls.Supported("statx", Statx)
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
index 6af5c400f..a6b367468 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package vfs2
 
 import (
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
new file mode 100644
index 000000000..60a43f0a0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mmap implements Linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()
+	flags := args[3].Int()
+	fd := args[4].Int()
+	fixed := flags&linux.MAP_FIXED != 0
+	private := flags&linux.MAP_PRIVATE != 0
+	shared := flags&linux.MAP_SHARED != 0
+	anon := flags&linux.MAP_ANONYMOUS != 0
+	map32bit := flags&linux.MAP_32BIT != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if private == shared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := memmap.MMapOpts{
+		Length:   args[1].Uint64(),
+		Offset:   args[5].Uint64(),
+		Addr:     args[0].Pointer(),
+		Fixed:    fixed,
+		Unmap:    fixed,
+		Map32Bit: map32bit,
+		Private:  private,
+		Perms: usermem.AccessType{
+			Read:    linux.PROT_READ&prot != 0,
+			Write:   linux.PROT_WRITE&prot != 0,
+			Execute: linux.PROT_EXEC&prot != 0,
+		},
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
+	defer func() {
+		if opts.MappingIdentity != nil {
+			opts.MappingIdentity.DecRef()
+		}
+	}()
+
+	if !anon {
+		// Convert the passed FD to a file reference.
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// mmap unconditionally requires that the FD is readable.
+		if !file.IsReadable() {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if shared && !file.IsWritable() {
+			opts.MaxPerms.Write = false
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	rv, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(rv), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
new file mode 100644
index 000000000..97da6c647
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
+	pathname, err := t.CopyInString(addr, linux.PATH_MAX)
+	if err != nil {
+		return fspath.Path{}, err
+	}
+	return fspath.Parse(pathname), nil
+}
+
+type taskPathOperation struct {
+	pop          vfs.PathOperation
+	haveStartRef bool
+}
+
+func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
+	root := t.FSContext().RootDirectoryVFS2()
+	start := root
+	haveStartRef := false
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			root.DecRef()
+			return taskPathOperation{}, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			haveStartRef = true
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				root.DecRef()
+				return taskPathOperation{}, syserror.EBADF
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			haveStartRef = true
+			dirfile.DecRef()
+		}
+	}
+	return taskPathOperation{
+		pop: vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+		},
+		haveStartRef: haveStartRef,
+	}, nil
+}
+
+func (tpop *taskPathOperation) Release() {
+	tpop.pop.Root.DecRef()
+	if tpop.haveStartRef {
+		tpop.pop.Start.DecRef()
+		tpop.haveStartRef = false
+	}
+}
+
+type shouldAllowEmptyPath bool
+
+const (
+	disallowEmptyPath shouldAllowEmptyPath = false
+	allowEmptyPath    shouldAllowEmptyPath = true
+)
+
+type shouldFollowFinalSymlink bool
+
+const (
+	nofollowFinalSymlink shouldFollowFinalSymlink = false
+	followFinalSymlink   shouldFollowFinalSymlink = true
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
new file mode 100644
index 000000000..dbf4882da
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -0,0 +1,584 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. This has no
+// equivalent in Linux; it exists in gVisor since allocation failure in Go is
+// unrecoverable.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+const (
+	// selectReadEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLIN_SET.
+	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+	// selectWriteEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLOUT_SET.
+	selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+	// selectExceptEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLEX_SET.
+	selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file description and waiter of a PollFD.
+type pollState struct {
+	file   *vfs.FileDescription
+	waiter waiter.Entry
+}
+
+// initReadiness gets the current ready mask for the file represented by the FD
+// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
+// used to register with the file for event notifications, and a reference to
+// the file is stored in "state".
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+	if pfd.FD < 0 {
+		pfd.REvents = 0
+		return
+	}
+
+	file := t.GetFileVFS2(pfd.FD)
+	if file == nil {
+		pfd.REvents = linux.POLLNVAL
+		return
+	}
+
+	if ch == nil {
+		defer file.DecRef()
+	} else {
+		state.file = file
+		state.waiter, _ = waiter.NewChannelEntry(ch)
+		file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	}
+
+	r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	pfd.REvents = int16(r.ToLinux()) & pfd.Events
+}
+
+// releaseState releases all the pollState in "state".
+func releaseState(state []pollState) {
+	for i := range state {
+		if state[i].file != nil {
+			state[i].file.EventUnregister(&state[i].waiter)
+			state[i].file.DecRef()
+		}
+	}
+}
+
+// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
+// when "timeout" is greater than zero.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
+// positive if interrupted by a signal.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+	var ch chan struct{}
+	if timeout != 0 {
+		ch = make(chan struct{}, 1)
+	}
+
+	// Register for event notification in the files involved if we may
+	// block (timeout not zero). Once we find a file that has a non-zero
+	// result, we stop registering for events but still go through all files
+	// to get their ready masks.
+	state := make([]pollState, len(pfd))
+	defer releaseState(state)
+	n := uintptr(0)
+	for i := range pfd {
+		initReadiness(t, &pfd[i], &state[i], ch)
+		if pfd[i].REvents != 0 {
+			n++
+			ch = nil
+		}
+	}
+
+	if timeout == 0 {
+		return timeout, n, nil
+	}
+
+	haveTimeout := timeout >= 0
+
+	for n == 0 {
+		var err error
+		// Wait for a notification.
+		timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
+		if err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = nil
+			}
+			return timeout, 0, err
+		}
+
+		// We got notified, count how many files are ready. If none,
+		// then this was a spurious notification, and we just go back
+		// to sleep with the remaining timeout.
+		for i := range state {
+			if state[i].file == nil {
+				continue
+			}
+
+			r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+			rl := int16(r.ToLinux()) & pfd[i].Events
+			if rl != 0 {
+				pfd[i].REvents = rl
+				n++
+			}
+		}
+	}
+
+	return timeout, n, nil
+}
+
+// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
+func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+		return nil, syserror.EINVAL
+	}
+
+	pfd := make([]linux.PollFD, nfds)
+	if nfds > 0 {
+		if _, err := t.CopyIn(addr, &pfd); err != nil {
+			return nil, err
+		}
+	}
+
+	return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	pfd, err := copyInPollFDs(t, addr, nfds)
+	if err != nil {
+		return timeout, 0, err
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+	}
+	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+	err = syserror.ConvertIntr(err, syserror.EINTR)
+
+	// The poll entries are copied out regardless of whether
+	// any are set or not. This aligns with the Linux behavior.
+	if nfds > 0 && err == nil {
+		if _, err := t.CopyOut(addr, pfd); err != nil {
+			return remainingTimeout, 0, err
+		}
+	}
+
+	return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+
+	if addr != 0 {
+		if _, err := t.CopyIn(addr, &set); err != nil {
+			return nil, err
+		}
+		// If we only use part of the last byte, mask out the extraneous bits.
+		//
+		// N.B. This only works on little-endian architectures.
+		if nBitsInLastPartialByte != 0 {
+			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+		}
+	}
+	return set, nil
+}
+
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || nfds > fileCap {
+		return 0, syserror.EINVAL
+	}
+
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+
+	// Count how many FDs are actually being requested so that we can build
+	// a PollFD array.
+	fdCount := 0
+	for i := 0; i < nBytes; i++ {
+		v := r[i] | w[i] | e[i]
+		for v != 0 {
+			v &= (v - 1)
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]linux.PollFD, 0, fdCount)
+	var fd int32
+	for i := 0; i < nBytes; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		v := rV | wV | eV
+		m := byte(1)
+		for j := 0; j < 8; j++ {
+			if (v & m) != 0 {
+				// Make sure the fd is valid and decrement the reference
+				// immediately to ensure we don't leak. Note, another thread
+				// might be about to close fd. This is racy, but that's
+				// OK. Linux is racy in the same way.
+				file := t.GetFileVFS2(fd)
+				if file == nil {
+					return 0, syserror.EBADF
+				}
+				file.DecRef()
+
+				var mask int16
+				if (rV & m) != 0 {
+					mask |= selectReadEvents
+				}
+
+				if (wV & m) != 0 {
+					mask |= selectWriteEvents
+				}
+
+				if (eV & m) != 0 {
+					mask |= selectExceptEvents
+				}
+
+				pfd = append(pfd, linux.PollFD{
+					FD:     fd,
+					Events: mask,
+				})
+			}
+
+			fd++
+			m <<= 1
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+		return 0, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	for idx := range pfd {
+		events := pfd[idx].REvents
+		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+		m := byte(1) << j
+		if r[i]&m != 0 {
+			if (events & selectReadEvents) != 0 {
+				bitSetCount++
+			} else {
+				r[i] &^= m
+			}
+		}
+		if w[i]&m != 0 {
+			if (events & selectWriteEvents) != 0 {
+				bitSetCount++
+			} else {
+				w[i] &^= m
+			}
+		}
+		if e[i]&m != 0 {
+			if (events & selectExceptEvents) != 0 {
+				bitSetCount++
+			} else {
+				e[i] &^= m
+			}
+		}
+	}
+
+	// Copy updated vectors back.
+	if readFDs != 0 {
+		if _, err := t.CopyOut(readFDs, r); err != nil {
+			return 0, err
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyOut(writeFDs, w); err != nil {
+			return 0, err
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining returns the amount of time remaining for the specified
+// timeout or 0 if it has elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	now := t.Kernel().MonotonicClock().Now()
+	remaining := timeout - now.Sub(startNs)
+	if remaining < 0 {
+		remaining = 0
+	}
+	return remaining
+}
+
+// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
+	return tsRemaining.CopyOut(t, timespecAddr)
+}
+
+// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
+	return tvRemaining.CopyOut(t, timevalAddr)
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// +stateify savable
+type pollRestartBlock struct {
+	pfdAddr usermem.Addr
+	nfds    uint
+	timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return poll(t, p.pfdAddr, p.nfds, p.timeout)
+}
+
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	// On an interrupt poll(2) is restarted with the remaining timeout.
+	if err == syserror.EINTR {
+		t.SetSyscallRestartBlock(&pollRestartBlock{
+			pfdAddr: pfdAddr,
+			nfds:    nfds,
+			timeout: remainingTimeout,
+		})
+		return 0, kernel.ERESTART_RESTARTBLOCK
+	}
+	return n, err
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timeout := time.Duration(args[2].Int()) * time.Millisecond
+	n, err := poll(t, pfdAddr, nfds, timeout)
+	return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timespecAddr := args[2].Pointer()
+	maskAddr := args[3].Pointer()
+	maskSize := uint(args[4].Uint())
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
+	// if interrupted by something other than a signal handled by the
+	// application (i.e. returns ERESTARTNOHAND). However, if
+	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
+	// wrong timeout, so the error should be left as EINTR.
+	//
+	// Note that this means that if err is nil but copyErr is not, copyErr is
+	// ignored. This is consistent with Linux.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timevalAddr := args[4].Pointer()
+
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timevalAddr != 0 {
+		var timeval linux.Timeval
+		if err := timeval.CopyIn(t, timevalAddr); err != nil {
+			return 0, nil, err
+		}
+		if timeval.Sec < 0 || timeval.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(timeval.ToNsecCapped())
+	}
+	startNs := t.Kernel().MonotonicClock().Now()
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+	maskWithSizeAddr := args[5].Pointer()
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		if t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
+		}
+		var maskStruct sigSetWithSize
+		if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+			return 0, nil, err
+		}
+		if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// +marshal
+type sigSetWithSize struct {
+	sigsetAddr   uint64
+	sizeofSigset uint64
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// If the Timespec is larger than what can be represented in a Duration, the
+// returned value is the maximum that Duration will allow.
+//
+// If timespecAddr is NULL, the returned value is negative.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timespecAddr != 0 {
+		var timespec linux.Timespec
+		if err := timespec.CopyIn(t, timespecAddr); err != nil {
+			return 0, err
+		}
+		if !timespec.Valid() {
+			return 0, syserror.EINVAL
+		}
+		timeout = time.Duration(timespec.ToNsecCapped())
+	}
+	return timeout, nil
+}
+
+func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
+	if maskAddr == 0 {
+		return nil
+	}
+	if maskSize != linux.SignalSetSize {
+		return syserror.EINVAL
+	}
+	var mask linux.SignalSet
+	if err := mask.CopyIn(t, maskAddr); err != nil {
+		return err
+	}
+	mask &^= kernel.UnblockableSignals
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask)
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
new file mode 100644
index 000000000..35f6308d6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -0,0 +1,511 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	eventMaskRead  = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+	eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements Linux syscall read(2).
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readv implements Linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.Read(t, dst, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Read(t, dst, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Pread64 implements Linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Preadv implements Linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements Linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.ReadOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = read(t, file, dst, opts)
+	} else {
+		n, err = pread(t, file, dst, offset, opts)
+	}
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.PRead(t, dst, offset, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.PRead(t, dst, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Write implements Linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+}
+
+// Writev implements Linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+}
+
+func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.Write(t, src, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Write(t, src, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Pwrite64 implements Linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+}
+
+// Pwritev implements Linux syscall pwritev(2).
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements Linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.WriteOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = write(t, file, src, opts)
+	} else {
+		n, err = pwrite(t, file, src, offset, opts)
+	}
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.PWrite(t, src, offset, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.PWrite(t, src, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Lseek implements Linux syscall lseek(2).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	whence := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newoff, err := file.Seek(t, offset, whence)
+	return uintptr(newoff), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
new file mode 100644
index 000000000..9250659ff
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -0,0 +1,380 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+// Chmod implements Linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
+}
+
+// Fchmodat implements Linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
+}
+
+func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Fchmod implements Linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].ModeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Chown implements Linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
+}
+
+// Lchown implements Linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	owner := args[2].Int()
+	group := args[3].Int()
+	flags := args[4].Int()
+	return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
+}
+
+func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
+	userns := t.UserNamespace()
+	if owner != -1 {
+		kuid := userns.MapToKUID(auth.UID(owner))
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_UID
+		opts.Stat.UID = uint32(kuid)
+	}
+	if group != -1 {
+		kgid := userns.MapToKGID(auth.GID(group))
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_GID
+		opts.Stat.GID = uint32(kgid)
+	}
+	return nil
+}
+
+// Fchown implements Linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	owner := args[1].Int()
+	group := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, file.SetStat(t, opts)
+}
+
+// Truncate implements Linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+}
+
+// Ftruncate implements Linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+}
+
+// Utime implements Linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times linux.Utime
+		if err := times.CopyIn(t, timesAddr); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime.Sec = times.Actime
+		opts.Stat.Mtime.Sec = times.Modtime
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimes implements Linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Usec * 1000),
+		}
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Usec * 1000),
+		}
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimensat implements Linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Futimens implements Linux syscall futimens(2).
+func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	timesAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.SetStat(t, opts)
+}
+
+func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var times [2]linux.Timespec
+	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		return err
+	}
+	if times[0].Nsec != linux.UTIME_OMIT {
+		opts.Stat.Mask |= linux.STATX_ATIME
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Nsec),
+		}
+	}
+	if times[1].Nsec != linux.UTIME_OMIT {
+		opts.Stat.Mask |= linux.STATX_MTIME
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Nsec),
+		}
+	}
+	return nil
+}
+
+func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.SetStat() instead of
+				// VirtualFilesystem.SetStatAt(), since the former may be able
+				// to use opened file state to expedite the SetStat.
+				err := dirfile.SetStat(t, *opts)
+				dirfile.DecRef()
+				return err
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+	}, opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
new file mode 100644
index 000000000..12c532310
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -0,0 +1,323 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Stat implements Linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */)
+}
+
+// Lstat implements Linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+	return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags)
+}
+
+func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return err
+				}
+				var stat linux.Stat
+				convertStatxToUserStat(t, &statx, &stat)
+				return stat.CopyOut(t, statAddr)
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	return stat.CopyOut(t, statAddr)
+}
+
+func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
+	return linux.Timespec{
+		Sec:  sxts.Sec,
+		Nsec: int64(sxts.Nsec),
+	}
+}
+
+// Fstat implements Linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	statAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	statx, err := file.Stat(t, vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	return 0, nil, stat.CopyOut(t, statAddr)
+}
+
+// Statx implements Linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+	mask := args[3].Uint()
+	statxAddr := args[4].Pointer()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: mask,
+		Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return 0, nil, syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for statx(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return 0, nil, err
+				}
+				userifyStatx(t, &statx)
+				return 0, nil, statx.CopyOut(t, statxAddr)
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	userifyStatx(t, &statx)
+	return 0, nil, statx.CopyOut(t, statxAddr)
+}
+
+func userifyStatx(t *kernel.Task, statx *linux.Statx) {
+	userns := t.UserNamespace()
+	statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow())
+	statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow())
+}
+
+// Readlink implements Linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
+}
+
+// Access implements Linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// FIXME(jamieliu): actually implement
+	return 0, nil, nil
+}
+
+// Faccessat implements Linux syscall access(2).
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// FIXME(jamieliu): actually implement
+	return 0, nil, nil
+}
+
+// Readlinkat implements Linux syscall mknodat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
+}
+
+func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
+	if int(size) <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
+	// call operates on the symbolic link referred to by dirfd ..." -
+	// readlinkat(2)
+	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if len(target) > int(size) {
+		target = target[:size]
+	}
+	n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Statfs implements Linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, statfs.CopyOut(t, bufAddr)
+}
+
+// Fstatfs implements Linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, statfs.CopyOut(t, bufAddr)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
new file mode 100644
index 000000000..2da538fc6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
new file mode 100644
index 000000000..88b9c7627
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint32(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int32(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
new file mode 100644
index 000000000..365250b0b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements Linux syscall sync(2).
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t)
+}
+
+// Syncfs implements Linux syscall syncfs(2).
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SyncFS(t)
+}
+
+// Fsync implements Linux syscall fsync(2).
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.Sync(t)
+}
+
+// Fdatasync implements Linux syscall fdatasync(2).
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
+	return Fsync(t, args)
+}
+
+// SyncFileRange implements Linux syscall sync_file_range(2).
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	nbytes := args[2].Int64()
+	flags := args[3].Uint()
+
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if nbytes < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
+	// [offset, offset+nbytes).
+	return 0, nil, file.Sync(t)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
deleted file mode 100644
index 7667524c7..000000000
--- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs2
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-const (
-	// EventMaskRead contains events that can be triggered on reads.
-	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
-)
-
-// Read implements linux syscall read(2).  Note that we try to get a buffer that
-// is exactly the size requested because some applications like qemu expect
-// they can do large reads all at once.  Bug for bug.  Same for other read
-// calls below.
-func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	fd := args[0].Int()
-	addr := args[1].Pointer()
-	size := args[2].SizeT()
-
-	file := t.GetFileVFS2(fd)
-	if file == nil {
-		return 0, nil, syserror.EBADF
-	}
-	defer file.DecRef()
-
-	// Check that the size is legitimate.
-	si := int(size)
-	if si < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
-	// Get the destination of the read.
-	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-	if err != nil {
-		return 0, nil, err
-	}
-
-	n, err := read(t, file, dst, vfs.ReadOptions{})
-	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
-}
-
-func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	n, err := file.Read(t, dst, opts)
-	if err != syserror.ErrWouldBlock {
-		return n, err
-	}
-
-	// Register for notifications.
-	w, ch := waiter.NewChannelEntry(nil)
-	file.EventRegister(&w, EventMaskRead)
-
-	total := n
-	for {
-		// Shorten dst to reflect bytes previously read.
-		dst = dst.DropFirst(int(n))
-
-		// Issue the request and break out if it completes with anything other than
-		// "would block".
-		n, err := file.Read(t, dst, opts)
-		total += n
-		if err != syserror.ErrWouldBlock {
-			break
-		}
-		if err := t.Block(ch); err != nil {
-			break
-		}
-	}
-	file.EventUnregister(&w)
-
-	return total, err
-}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
new file mode 100644
index 000000000..89e9ff4d7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -0,0 +1,353 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"bytes"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Listxattr implements Linux syscall listxattr(2).
+func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, followFinalSymlink)
+}
+
+// Llistxattr implements Linux syscall llistxattr(2).
+func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, nofollowFinalSymlink)
+}
+
+func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Flistxattr implements Linux syscall flistxattr(2).
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	names, err := file.Listxattr(t)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Getxattr implements Linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, followFinalSymlink)
+}
+
+// Lgetxattr implements Linux syscall lgetxattr(2).
+func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, nofollowFinalSymlink)
+}
+
+func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Fgetxattr implements Linux syscall fgetxattr(2).
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := file.Getxattr(t, name)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Setxattr implements Linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, followFinalSymlink)
+}
+
+// Lsetxattr implements Linux syscall lsetxattr(2).
+func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, nofollowFinalSymlink)
+}
+
+func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Fsetxattr implements Linux syscall fsetxattr(2).
+func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Removexattr implements Linux syscall removexattr(2).
+func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, followFinalSymlink)
+}
+
+// Lremovexattr implements Linux syscall lremovexattr(2).
+func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, nofollowFinalSymlink)
+}
+
+func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+}
+
+// Fremovexattr implements Linux syscall fremovexattr(2).
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Removexattr(t, name)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
+	if size > linux.XATTR_LIST_MAX {
+		size = linux.XATTR_LIST_MAX
+	}
+	var buf bytes.Buffer
+	for _, name := range names {
+		buf.WriteString(name)
+		buf.WriteByte(0)
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the list.
+		return buf.Len(), nil
+	}
+	if buf.Len() > int(size) {
+		if size >= linux.XATTR_LIST_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(listAddr, buf.Bytes())
+}
+
+func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		return "", syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+		return "", err
+	}
+	return gohacks.StringFromImmutableBytes(buf), nil
+}
+
+func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		size = linux.XATTR_SIZE_MAX
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the value.
+		return len(value), nil
+	}
+	if len(value) > int(size) {
+		if size >= linux.XATTR_SIZE_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 0b4f18ab5..07c8383e6 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -43,6 +43,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index eed41139b..3da45d744 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
 	// Add epi to file.epolls so that it is removed when the last
 	// FileDescription reference is dropped.
 	file.epollMu.Lock()
+	if file.epolls == nil {
+		file.epolls = make(map[*epollInterest]struct{})
+	}
 	file.epolls[epi] = struct{}{}
 	file.epollMu.Unlock()
 
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 9912df799..31a4e5480 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -139,6 +139,23 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount is not associated with any MountNamespace
+// and is not connected to any other Mounts. References are taken on fs and
+// root.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+	fs.IncRef()
+	if root != nil {
+		root.IncRef()
+	}
+	return &Mount{
+		vfs:  vfs,
+		fs:   fs,
+		root: root,
+		refs: 1,
+	}, nil
+}
+
 // MountAt creates and mounts a Filesystem configured by the given arguments.
 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
 	rft := vfs.getFilesystemType(fsTypeName)
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 1fe766a44..bc7581698 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,6 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -160,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
 // Lookup may be called even if there are concurrent mutators of mt.
 func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
 	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
-	hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+	hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
 
 loop:
 	for {
@@ -361,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
 
 //go:linkname rand32 runtime.fastrand
 func rand32() uint32
-
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
-	x := uintptr(p)
-	return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8a0b382f6..eb4ebb511 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() {
 		rp.pit = next
 	} else { // at end of path segment, continue with next one
 		rp.curPart--
-		rp.pit = rp.parts[rp.curPart-1]
+		rp.pit = rp.parts[rp.curPart]
 	}
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8f29031b2..bde81e1ef 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -126,17 +126,23 @@ func (vfs *VirtualFilesystem) Init() error {
 	// Construct vfs.anonMount.
 	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
 	if err != nil {
-		return err
+		// This shouldn't be possible since anonBlockDevMinorNext was
+		// initialized to 1 above (no device numbers have been allocated yet).
+		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
 	}
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
 	}
 	anonfs.vfsfs.Init(vfs, &anonfs)
-	vfs.anonMount = &Mount{
-		vfs:  vfs,
-		fs:   &anonfs.vfsfs,
-		refs: 1,
+	defer anonfs.vfsfs.DecRef()
+	anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+	if err != nil {
+		// We should not be passing any MountOptions that would cause
+		// construction of this mount to fail.
+		panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
 	}
+	vfs.anonMount = anonMount
+
 	return nil
 }
 
@@ -385,15 +391,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 				// Only a regular file can be executed.
 				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
 				if err != nil {
+					fd.DecRef()
 					return nil, err
 				}
-				if stat.Mask&linux.STATX_TYPE != 0 {
-					// This shouldn't happen, but if type can't be retrieved, file can't
-					// be executed.
-					return nil, syserror.EACCES
-				}
-				if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
-					ctx.Infof("%q is not a regular file: %v", pop.Path, t)
+				if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+					fd.DecRef()
 					return nil, syserror.EACCES
 				}
 			}
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index bfb2fac26..f7d6009a0 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -221,7 +221,7 @@ func (w *Watchdog) waitForStart() {
 		return
 	}
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog.Start() not called within %s:\n")
+	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
 	w.doAction(w.StartupTimeoutAction, false, &buf)
 }
 
@@ -325,7 +325,7 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog goroutine is stuck:\n")
+	buf.WriteString("Watchdog goroutine is stuck:")
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
@@ -359,7 +359,7 @@ func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
 		case <-metricsEmitted:
 		case <-time.After(1 * time.Second):
 		}
-		panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String()))
+		panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String()))
 	default:
 		panic(fmt.Sprintf("Unknown watchdog action %v", action))