268 files changed, 13054 insertions, 2470 deletions
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 343f81f59..fd95eb2d2 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -17,7 +17,6 @@
 package arch
 
 import (
-	"encoding/binary"
 	"fmt"
 	"io"
 
@@ -29,7 +28,14 @@ import (
 )
 
 // Registers represents the CPU registers for this architecture.
-type Registers = linux.PtraceRegs
+//
+// +stateify savable
+type Registers struct {
+	linux.PtraceRegs
+
+	// TPIDR_EL0 is the EL0 Read/Write Software Thread ID Register.
+	TPIDR_EL0 uint64
+}
 
 const (
 	// SyscallWidth is the width of insturctions.
@@ -49,9 +55,14 @@ const ARMTrapFlag = uint64(1) << 21
 type aarch64FPState []byte
 
 // initAarch64FPState sets up initial state.
+//
+// Related code in Linux kernel: fpsimd_flush_thread().
+// FPCR = FPCR_RM_RN (0x0 << 22).
+//
+// Currently, aarch64FPState is only a 0x210-byte buffer holding the fpstate.
+// The fp head is not used by sentry/ptrace/kvm.
+//
 func initAarch64FPState(data aarch64FPState) {
-	binary.LittleEndian.PutUint32(data, fpsimdMagic)
-	binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
 }
 
 func newAarch64FPStateSlice() []byte {
@@ -97,9 +108,6 @@ type State struct {
 	// Our floating point state.
 	aarch64FPState `state:"wait"`
 
-	// TLS pointer
-	TPValue uint64
-
 	// FeatureSet is a pointer to the currently active feature set.
 	FeatureSet *cpuid.FeatureSet
 
@@ -153,7 +161,6 @@ func (s *State) Fork() State {
 	return State{
 		Regs:           s.Regs,
 		aarch64FPState: s.aarch64FPState.fork(),
-		TPValue:        s.TPValue,
 		FeatureSet:     s.FeatureSet,
 		OrigR0:         s.OrigR0,
 	}
@@ -237,18 +244,18 @@ func (s *State) ptraceGetRegs() Registers {
 	return s.Regs
 }
 
-var registersSize = (*Registers)(nil).SizeBytes()
+var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
 
 // PtraceSetRegs implements Context.PtraceSetRegs.
 func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
 	var regs Registers
-	buf := make([]byte, registersSize)
+	buf := make([]byte, ptraceRegistersSize)
 	if _, err := io.ReadFull(src, buf); err != nil {
 		return 0, err
 	}
 	regs.UnmarshalUnsafe(buf)
 	s.Regs = regs
-	return registersSize, nil
+	return ptraceRegistersSize, nil
 }
 
 // PtraceGetFPRegs implements Context.PtraceGetFPRegs.
@@ -274,7 +281,7 @@ const (
 func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < registersSize {
+		if maxlen < ptraceRegistersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceGetRegs(dst)
@@ -287,7 +294,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
 func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < registersSize {
+		if maxlen < ptraceRegistersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 3b3a0a272..1c3e3c14c 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -300,7 +300,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
 	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
 	// u_debugreg, returning 0 or silently no-oping for other fields
 	// respectively.
-	if addr < uintptr(registersSize) {
+	if addr < uintptr(ptraceRegistersSize) {
 		regs := c.ptraceGetRegs()
 		buf := make([]byte, regs.SizeBytes())
 		regs.MarshalUnsafe(buf)
@@ -315,7 +315,7 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
 	if addr&7 != 0 || addr >= userStructSize {
 		return syscall.EIO
 	}
-	if addr < uintptr(registersSize) {
+	if addr < uintptr(ptraceRegistersSize) {
 		regs := c.ptraceGetRegs()
 		buf := make([]byte, regs.SizeBytes())
 		regs.MarshalUnsafe(buf)
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index ada7ac7b8..cabbf60e0 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -142,7 +142,7 @@ func (c *context64) SetStack(value uintptr) {
 
 // TLS returns the current TLS pointer.
 func (c *context64) TLS() uintptr {
-	return uintptr(c.TPValue)
+	return uintptr(c.Regs.TPIDR_EL0)
 }
 
 // SetTLS sets the current TLS pointer. Returns false if value is invalid.
@@ -151,7 +151,7 @@ func (c *context64) SetTLS(value uintptr) bool {
 		return false
 	}
 
-	c.TPValue = uint64(value)
+	c.Regs.TPIDR_EL0 = uint64(value)
 	return true
 }
 
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index dc458b37f..b9405b320 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -31,7 +31,11 @@ import (
 )
 
 // Registers represents the CPU registers for this architecture.
-type Registers = linux.PtraceRegs
+//
+// +stateify savable
+type Registers struct {
+	linux.PtraceRegs
+}
 
 // System-related constants for x86.
 const (
@@ -311,12 +315,12 @@ func (s *State) ptraceGetRegs() Registers {
 	return regs
 }
 
-var registersSize = (*Registers)(nil).SizeBytes()
+var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
 
 // PtraceSetRegs implements Context.PtraceSetRegs.
 func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
 	var regs Registers
-	buf := make([]byte, registersSize)
+	buf := make([]byte, ptraceRegistersSize)
 	if _, err := io.ReadFull(src, buf); err != nil {
 		return 0, err
 	}
@@ -374,7 +378,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
 	}
 	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
 	s.Regs = regs
-	return registersSize, nil
+	return ptraceRegistersSize, nil
 }
 
 // isUserSegmentSelector returns true if the given segment selector specifies a
@@ -543,7 +547,7 @@ const (
 func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < registersSize {
+		if maxlen < ptraceRegistersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceGetRegs(dst)
@@ -563,7 +567,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
 func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < registersSize {
+		if maxlen < ptraceRegistersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e74275d2d..2c5d14be5 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,14 +16,12 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/context",
         "//pkg/fd",
-        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/fdimport",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
-        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fs/user",
         "//pkg/sentry/fsimpl/host",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -35,7 +33,6 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
-        "//pkg/syserror",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go
index 811f24324..8a500a515 100644
--- a/pkg/sentry/control/logging.go
+++ b/pkg/sentry/control/logging.go
@@ -70,8 +70,8 @@ type LoggingArgs struct {
 type Logging struct{}
 
 // Change will change the log level and strace arguments. Although
-// this functions signature requires an error it never acctually
-// return san error. It's required by the URPC interface.
+// this function's signature requires an error it never actually
+// returns an error. It's required by the URPC interface.
 // Additionally, it may look odd that this is the only method
 // attached to an empty struct but this is also part of how
 // URPC dispatches.
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 2ed17ee09..1bae7cfaf 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -18,7 +18,6 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 	"text/tabwriter"
@@ -26,13 +25,10 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
-	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
 	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -40,7 +36,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -108,6 +103,9 @@ type ExecArgs struct {
 
 // String prints the arguments as a string.
 func (args ExecArgs) String() string {
+	if len(args.Argv) == 0 {
+		return args.Filename
+	}
 	a := make([]string, len(args.Argv))
 	copy(a, args.Argv)
 	if args.Filename != "" {
@@ -180,42 +178,30 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 	}
 	ctx := initArgs.NewContext(proc.Kernel)
 
-	if initArgs.Filename == "" {
-		if kernel.VFS2Enabled {
-			// Get the full path to the filename from the PATH env variable.
-			if initArgs.MountNamespaceVFS2 == nil {
-				// Set initArgs so that 'ctx' returns the namespace.
-				//
-				// MountNamespaceVFS2 adds a reference to the namespace, which is
-				// transferred to the new process.
-				initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
-			}
+	if kernel.VFS2Enabled {
+		// Get the full path to the filename from the PATH env variable.
+		if initArgs.MountNamespaceVFS2 == nil {
+			// Set initArgs so that 'ctx' returns the namespace.
+			//
+			// MountNamespaceVFS2 adds a reference to the namespace, which is
+			// transferred to the new process.
+			initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+		}
+	} else {
+		if initArgs.MountNamespace == nil {
+			// Set initArgs so that 'ctx' returns the namespace.
+			initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
 
-			paths := fs.GetPath(initArgs.Envv)
-			vfsObj := proc.Kernel.VFS()
-			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-			if err != nil {
-				return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
-			}
-			initArgs.File = fsbridge.NewVFSFile(file)
-		} else {
-			// Get the full path to the filename from the PATH env variable.
-			paths := fs.GetPath(initArgs.Envv)
-			if initArgs.MountNamespace == nil {
-				// Set initArgs so that 'ctx' returns the namespace.
-				initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
-
-				// initArgs must hold a reference on MountNamespace, which will
-				// be donated to the new process in CreateProcess.
-				initArgs.MountNamespace.IncRef()
-			}
-			f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-			if err != nil {
-				return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
-			}
-			initArgs.Filename = f
+			// initArgs must hold a reference on MountNamespace, which will
+			// be donated to the new process in CreateProcess.
+			initArgs.MountNamespace.IncRef()
 		}
 	}
+	resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
+	if err != nil {
+		return nil, 0, nil, nil, err
+	}
+	initArgs.Filename = resolved
 
 	fds := make([]int, len(args.FilePayload.Files))
 	for i, file := range args.FilePayload.Files {
@@ -428,67 +414,3 @@ func ttyName(tty *kernel.TTY) string {
 	}
 	return fmt.Sprintf("pts/%d", tty.Index)
 }
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
-	root := vfs.RootFromContext(ctx)
-	defer root.DecRef()
-	creds := auth.CredentialsFromContext(ctx)
-
-	// Absolute paths can be used directly.
-	if path.IsAbs(name) {
-		return openExecutable(ctx, vfsObj, creds, root, name)
-	}
-
-	// Paths with '/' in them should be joined to the working directory, or
-	// to the root if working directory is not set.
-	if strings.IndexByte(name, '/') > 0 {
-		if len(wd) == 0 {
-			wd = "/"
-		}
-		if !path.IsAbs(wd) {
-			return nil, fmt.Errorf("working directory %q must be absolute", wd)
-		}
-		return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name))
-	}
-
-	// Otherwise, we must lookup the name in the paths, starting from the
-	// calling context's root directory.
-	for _, p := range paths {
-		if !path.IsAbs(p) {
-			// Relative paths aren't safe, no one should be using them.
-			log.Warningf("Skipping relative path %q in $PATH", p)
-			continue
-		}
-
-		binPath := path.Join(p, name)
-		f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
-		if err != nil {
-			return nil, err
-		}
-		if f == nil {
-			continue // Not found/no access.
-		}
-		return f, nil
-	}
-	return nil, syserror.ENOENT
-}
-
-func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root, // binPath is absolute, Start can be anything.
-		Path:               fspath.Parse(path),
-		FollowFinalSymlink: true,
-	}
-	opts := &vfs.OpenOptions{
-		Flags:    linux.O_RDONLY,
-		FileExec: true,
-	}
-	f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
-	if err == syserror.ENOENT || err == syserror.EACCES {
-		return nil, nil
-	}
-	return f, err
-}
diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go
index 69e71e322..f45b2bd2b 100644
--- a/pkg/sentry/device/device.go
+++ b/pkg/sentry/device/device.go
@@ -188,6 +188,9 @@ type MultiDevice struct {
 
 // String stringifies MultiDevice.
 func (m *MultiDevice) String() string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
 	buf := bytes.NewBuffer(nil)
 	buf.WriteString("cache{")
 	for k, v := range m.cache {
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index c7e197691..af66fe4dc 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -42,6 +42,7 @@ type fullFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 33d060d02..92d3d71be 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -43,6 +43,7 @@ type nullFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index acfa23149..6b81da5ef 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -48,6 +48,7 @@ type randomFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// off is the "file offset". off is accessed using atomic memory
 	// operations.
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 3b1372b9e..c6f15054d 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -44,6 +44,7 @@ type zeroFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD
new file mode 100644
index 000000000..12e49b58a
--- /dev/null
+++ b/pkg/sentry/devices/ttydev/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "ttydev",
+    srcs = ["ttydev.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/vfs",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
new file mode 100644
index 000000000..fbb7fd92c
--- /dev/null
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -0,0 +1,91 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ttydev implements devices for /dev/tty and (eventually)
+// /dev/console.
+//
+// TODO(b/159623826): Support /dev/console.
+package ttydev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	// See drivers/tty/tty_io.c:tty_init().
+	ttyDevMinor     = 0
+	consoleDevMinor = 1
+)
+
+// ttyDevice implements vfs.Device for /dev/tty.
+type ttyDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &ttyFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// ttyFD implements vfs.FileDescriptionImpl for /dev/tty.
+type ttyFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *ttyFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	return vfsObj.RegisterDevice(vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, ttyDevice{}, &vfs.RegisterDeviceOptions{
+		GroupName: "tty",
+	})
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+	return dev.CreateDeviceFile(ctx, "tty", vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, 0666 /* mode */)
+}
diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD
new file mode 100644
index 000000000..71c59287c
--- /dev/null
+++ b/pkg/sentry/devices/tundev/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "tundev",
+    srcs = ["tundev.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/socket/netstack",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/tcpip/link/tun",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go
new file mode 100644
index 000000000..dfbd069af
--- /dev/null
+++ b/pkg/sentry/devices/tundev/tundev.go
@@ -0,0 +1,178 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tundev implements the /dev/net/tun device.
+package tundev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	netTunDevMajor = 10
+	netTunDevMinor = 200
+)
+
+// tunDevice implements vfs.Device for /dev/net/tun.
+type tunDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &tunFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// tunFD implements vfs.FileDescriptionImpl for /dev/net/tun.
+type tunFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	device tun.Device
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	request := args[1].Uint()
+	data := args[2].Pointer()
+
+	switch request {
+	case linux.TUNSETIFF:
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic("Ioctl should be called from a task context")
+		}
+		if !t.HasCapability(linux.CAP_NET_ADMIN) {
+			return 0, syserror.EPERM
+		}
+		stack, ok := t.NetworkContext().(*netstack.Stack)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+
+		var req linux.IFReq
+		if _, err := usermem.CopyObjectIn(ctx, uio, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		flags := usermem.ByteOrder.Uint16(req.Data[:])
+		return 0, fd.device.SetIff(stack.Stack, req.Name(), flags)
+
+	case linux.TUNGETIFF:
+		var req linux.IFReq
+
+		copy(req.IFName[:], fd.device.Name())
+
+		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+		// there is no sk_filter. See __tun_chr_ioctl() in drivers/net/tun.c.
+		flags := fd.device.Flags() | linux.IFF_NOFILTER
+		usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+		_, err := usermem.CopyObjectOut(ctx, uio, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *tunFD) Release() {
+	fd.device.Release()
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.Read(ctx, dst, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	data, err := fd.device.Read()
+	if err != nil {
+		return 0, err
+	}
+	n, err := dst.CopyOut(ctx, data)
+	if n > 0 && n < len(data) {
+		// Not an error for partial copying. Packet truncated.
+		err = nil
+	}
+	return int64(n), err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.Write(ctx, src, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	data := make([]byte, src.NumBytes())
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+	return fd.device.Write(data)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fd.device.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *tunFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *tunFD) EventUnregister(e *waiter.Entry) {
+	fd.device.EventUnregister(e)
+}
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{})
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+	return dev.CreateDeviceFile(ctx, "net/tun", vfs.CharDevice, netTunDevMajor, netTunDevMinor, 0666 /* mode */)
+}
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
index a4199f9e9..b8686adb4 100644
--- a/pkg/sentry/fdimport/fdimport.go
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -15,6 +15,8 @@
 package fdimport
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
@@ -84,6 +86,9 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 
 func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
 	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		return nil, fmt.Errorf("cannot find kernel from context")
+	}
 
 	var ttyFile *vfs.FileDescription
 	for appFD, hostFD := range stdioFDs {
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 846252c89..ca41520b4 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -146,7 +146,7 @@ func (f *File) DecRef() {
 	f.DecRefWithDestructor(func() {
 		// Drop BSD style locks.
 		lockRng := lock.LockRange{Start: 0, End: lock.LockEOF}
-		f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng)
+		f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng)
 
 		// Release resources held by the FileOperations.
 		f.FileOperations.Release()
@@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
 	if !f.mu.Lock(ctx) {
 		return 0, syserror.ErrInterrupted
 	}
-
 	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	// Handle append mode.
 	if f.Flags().Append {
@@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	// offset."
 	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	defer unlockAppendMu()
-
 	if f.Flags().Append {
 		if err := f.offsetForAppend(ctx, &offset); err != nil {
 			return 0, err
@@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	return f.FileOperations.Write(ctx, f, src, offset)
 }
 
-// offsetForAppend sets the given offset to the end of the file.
+// offsetForAppend atomically sets the given offset to the end of the file.
 //
-// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing.
+// Precondition: the file.Dirent.Inode.appendMu mutex should be held for
+// writing.
 func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
 	if err != nil {
@@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	}
 
 	// Update the offset.
-	*offset = uattr.Size
+	atomic.StoreInt64(offset, uattr.Size)
 
 	return nil
 }
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index beba0f771..f5537411e 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -160,6 +160,7 @@ type FileOperations interface {
 	// refer.
 	//
 	// Preconditions: The AddressSpace (if any) that io refers to is activated.
+	// Must only be called from a task goroutine.
 	Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error)
 }
 
diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go
index 084da2a8d..d41f30bbb 100644
--- a/pkg/sentry/fs/filesystems.go
+++ b/pkg/sentry/fs/filesystems.go
@@ -87,20 +87,6 @@ func RegisterFilesystem(f Filesystem) {
 	filesystems.registered[f.Name()] = f
 }
 
-// UnregisterFilesystem removes a file system from the global set. To keep the
-// file system set compatible with save/restore, UnregisterFilesystem must be
-// called before save/restore methods.
-//
-// For instance, packages may unregister their file system after it is mounted.
-// This makes sense for pseudo file systems that should not be visible or
-// mountable. See whitelistfs in fs/host/fs.go for one example.
-func UnregisterFilesystem(name string) {
-	filesystems.mu.Lock()
-	defer filesystems.mu.Unlock()
-
-	delete(filesystems.registered, name)
-}
-
 // FindFilesystem returns a Filesystem registered at name or (nil, false) if name
 // is not a file system type that can be found in /proc/filesystems.
 func FindFilesystem(name string) (Filesystem, bool) {
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index bdba6efe5..d2dbff268 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -42,9 +42,10 @@
 //     Dirent.dirMu
 //       Dirent.mu
 //         DirentCache.mu
-//         Locks in InodeOperations implementations or overlayEntry
 //         Inode.Watches.mu (see `Inotify` for other lock ordering)
 //         MountSource.mu
+//         Inode.appendMu
+//           Locks in InodeOperations implementations or overlayEntry
 //
 // If multiple Dirent or MountSource locks must be taken, locks in the parent must be
 // taken before locks in their children.
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 789369220..5fb419bcd 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -8,7 +8,6 @@ go_template_instance(
     out = "dirty_set_impl.go",
     imports = {
         "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
-        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
     },
     package = "fsutil",
     prefix = "Dirty",
@@ -25,14 +24,14 @@ go_template_instance(
     name = "frame_ref_set_impl",
     out = "frame_ref_set_impl.go",
     imports = {
-        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+        "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
     },
     package = "fsutil",
     prefix = "FrameRef",
     template = "//pkg/segment:generic_set",
     types = {
         "Key": "uint64",
-        "Range": "platform.FileRange",
+        "Range": "memmap.FileRange",
         "Value": "uint64",
         "Functions": "FrameRefSetFunctions",
     },
@@ -43,7 +42,6 @@ go_template_instance(
     out = "file_range_set_impl.go",
     imports = {
         "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
-        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
     },
     package = "fsutil",
     prefix = "FileRange",
@@ -86,7 +84,6 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
-        "//pkg/sentry/platform",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/state",
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go
index c6cd45087..2c9446c1d 100644
--- a/pkg/sentry/fs/fsutil/dirty_set.go
+++ b/pkg/sentry/fs/fsutil/dirty_set.go
@@ -20,7 +20,6 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) {
 // repeatedly until all bytes have been written. max is the true size of the
 // cached object; offsets beyond max will not be passed to writeAt, even if
 // they are marked dirty.
-func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
 	var changedDirty bool
 	defer func() {
 		if changedDirty {
@@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet
 // successful partial write, SyncDirtyAll will call it repeatedly until all
 // bytes have been written. max is the true size of the cached object; offsets
 // beyond max will not be passed to writeAt, even if they are marked dirty.
-func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
 	dseg := dirty.FirstSegment()
 	for dseg.Ok() {
 		if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil {
@@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max
 }
 
 // Preconditions: mr must be page-aligned.
-func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
 	for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() {
 		wbr := cseg.Range().Intersect(mr)
 		if max < wbr.Start {
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 5643cdac9..bbafebf03 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -23,13 +23,12 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // FileRangeSet maps offsets into a memmap.Mappable to offsets into a
-// platform.File. It is used to implement Mappables that store data in
+// memmap.File. It is used to implement Mappables that store data in
 // sparsely-allocated memory.
 //
 // type FileRangeSet <generated by go_generics>
@@ -65,20 +64,20 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli
 }
 
 // FileRange returns the FileRange mapped by seg.
-func (seg FileRangeIterator) FileRange() platform.FileRange {
+func (seg FileRangeIterator) FileRange() memmap.FileRange {
 	return seg.FileRangeOf(seg.Range())
 }
 
 // FileRangeOf returns the FileRange mapped by mr.
 //
 // Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
-func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange {
+func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
 	frstart := seg.Value() + (mr.Start - seg.Start())
-	return platform.FileRange{frstart, frstart + mr.Length()}
+	return memmap.FileRange{frstart, frstart + mr.Length()}
 }
 
 // Fill attempts to ensure that all memmap.Mappable offsets in required are
-// mapped to a platform.File offset, by allocating from mf with the given
+// mapped to a memmap.File offset, by allocating from mf with the given
 // memory usage kind and invoking readAt to store data into memory. (If readAt
 // returns a successful partial read, Fill will call it repeatedly until all
 // bytes have been read.) EOF is handled consistently with the requirements of
@@ -141,7 +140,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
 }
 
 // Drop removes segments for memmap.Mappable offsets in mr, freeing the
-// corresponding platform.FileRanges.
+// corresponding memmap.FileRanges.
 //
 // Preconditions: mr must be page-aligned.
 func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
@@ -154,7 +153,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
 }
 
 // DropAll removes all segments in mr, freeing the corresponding
-// platform.FileRanges.
+// memmap.FileRanges.
 func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) {
 	for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
 		mf.DecRef(seg.FileRange())
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index dd6f5aba6..a808894df 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -17,7 +17,7 @@ package fsutil
 import (
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 )
 
@@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) {
 	if val1 != val2 {
 		return 0, false
 	}
@@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
 }
 
 // Split implements segment.Functions.Split.
-func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) {
 	return val, val
 }
 
 // IncRefAndAccount adds a reference on the range fr. All newly inserted segments
 // are accounted as host page cache memory mappings.
-func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
+func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) {
 	seg, gap := refs.Find(fr.Start)
 	for {
 		switch {
@@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
 
 // DecRefAndAccount removes a reference on the range fr and untracks segments
 // that are removed from memory accounting.
-func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
+func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) {
 	seg := refs.FindSegment(fr.Start)
 
 	for seg.Ok() && seg.Start() < fr.End {
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index e82afd112..ef0113b52 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -126,7 +125,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
 // offsets in fr or until the next call to UnmapAll.
 //
 // Preconditions: The caller must hold a reference on all offsets in fr.
-func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
+func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
 	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
 	f.mapsMu.Lock()
 	defer f.mapsMu.Unlock()
@@ -146,7 +145,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool)
 }
 
 // Preconditions: f.mapsMu must be locked.
-func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error {
+func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error {
 	prot := syscall.PROT_READ
 	if write {
 		prot |= syscall.PROT_WRITE
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index 78fec553e..c15d8a946 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -21,18 +21,17 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// HostMappable implements memmap.Mappable and platform.File over a
+// HostMappable implements memmap.Mappable and memmap.File over a
 // CachedFileObject.
 //
 // Lock order (compare the lock order model in mm/mm.go):
 //   truncateMu ("fs locks")
 //     mu ("memmap.Mappable locks not taken by Translate")
-//       ("platform.File locks")
+//       ("memmap.File locks")
 //   	     backingFile ("CachedFileObject locks")
 //
 // +stateify savable
@@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error {
 	return nil
 }
 
-// MapInternal implements platform.File.MapInternal.
-func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write)
 }
 
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
 func (h *HostMappable) FD() int {
 	return h.backingFile.FD()
 }
 
-// IncRef implements platform.File.IncRef.
-func (h *HostMappable) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (h *HostMappable) IncRef(fr memmap.FileRange) {
 	mr := memmap.MappableRange{Start: fr.Start, End: fr.End}
 	h.hostFileMapper.IncRefOn(mr)
 }
 
-// DecRef implements platform.File.DecRef.
-func (h *HostMappable) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (h *HostMappable) DecRef(fr memmap.FileRange) {
 	mr := memmap.MappableRange{Start: fr.Start, End: fr.End}
 	h.hostFileMapper.DecRefOn(mr)
 }
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 800c8b4e1..fe8b0b6ac 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -26,7 +26,6 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -934,7 +933,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
 
 // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
 func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
-	// Whether we have a host fd (and consequently what platform.File is
+	// Whether we have a host fd (and consequently what memmap.File is
 	// mapped) can change across save/restore, so invalidate all translations
 	// unconditionally.
 	c.mapsMu.Lock()
@@ -999,10 +998,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable
 	}
 }
 
-// IncRef implements platform.File.IncRef. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// IncRef implements memmap.File.IncRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
 // during translation.
-func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
+func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) {
 	// Hot path. Avoid defers.
 	c.dataMu.Lock()
 	seg, gap := c.refs.Find(fr.Start)
@@ -1024,10 +1023,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
 	}
 }
 
-// DecRef implements platform.File.DecRef. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// DecRef implements memmap.File.DecRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
 // during translation.
-func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
+func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) {
 	// Hot path. Avoid defers.
 	c.dataMu.Lock()
 	seg := c.refs.FindSegment(fr.Start)
@@ -1046,15 +1045,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
 	c.dataMu.Unlock()
 }
 
-// MapInternal implements platform.File.MapInternal. This is used when we
+// MapInternal implements memmap.File.MapInternal. This is used when we
 // directly map an underlying host fd and CachingInodeOperations is used as the
-// platform.File during translation.
-func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// memmap.File during translation.
+func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
 }
 
-// FD implements platform.File.FD. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// FD implements memmap.File.FD. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
 // during translation.
 func (c *CachingInodeOperations) FD() int {
 	return c.backingFile.FD()
diff --git a/pkg/sentry/fs/g3doc/.gitignore b/pkg/sentry/fs/g3doc/.gitignore
new file mode 100644
index 000000000..2d19fc766
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
new file mode 100644
index 000000000..2ca84dd74
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -0,0 +1,263 @@
+# Foreword
+
+This document describes an on-going project to support FUSE filesystems within
+the sentry. This is intended to become the final documentation for this
+subsystem, and is therefore written in the present tense. However, FUSE support
+is currently incomplete and the document will be updated as things progress.
+
+# FUSE: Filesystem in Userspace
+
+The sentry supports dispatching filesystem operations to a FUSE server, allowing
+FUSE filesystems to be used within a sandbox.
+
+## Overview
+
+FUSE has two main components:
+
+1.  A client kernel driver (canonically `fuse.ko` in Linux), which forwards
+    filesystem operations (usually initiated by syscalls) to the server.
+
+2.  A server, which is a userspace daemon that implements the actual filesystem.
+
+The sentry implements the client component, which allows a server daemon running
+within the sandbox to implement a filesystem within the sandbox.
+
+A FUSE filesystem is initialized with `mount(2)`, typically with the help of a
+utility like `fusermount(1)`. Various mount options exist for establishing
+ownership and access permissions on the filesystem, but the most important mount
+option is a file descriptor used to establish communication between the client
+and server.
+
+The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation,
+the client and server use the FUSE protocol described in `fuse(4)` to service
+filesystem operations. See the "Protocol" section below for more information
+about this protocol. The core of the sentry support for FUSE is the client-side
+implementation of this protocol.
+
+## FUSE in the Sentry
+
+The sentry's FUSE client targets VFS2 and has the following components:
+
+-   An implementation of `/dev/fuse`.
+
+-   A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
+    VFS2, one point of contention may be the lack of inodes in VFS2. We can
+    tentatively implement a kernfs-based filesystem to bridge the gap in APIs.
+    The kernfs base functionality can serve the role of the Linux inode cache,
+    and the filesystem can map VFS2 syscalls to kernfs inode operations; see
+    the `kernfs.Inode` interface.
+
+The FUSE protocol lends itself well to marshaling with `go_marshal`. The various
+request and response packets can be defined in the ABI package and converted to
+and from the wire format using `go_marshal`.
+
+### Design Goals
+
+-   While filesystem performance is always important, the sentry's FUSE support
+    is primarily concerned with compatibility, with performance as a secondary
+    concern.
+
+-   Avoiding deadlocks from a hung server daemon.
+
+-   Consider the potential for denial of service from a malicious server daemon.
+    Protecting itself from userspace is already a design goal for the sentry,
+    but needs additional consideration for FUSE. Normally, an operating system
+    doesn't rely on userspace to make progress with filesystem operations. Since
+    this changes with FUSE, it opens up the possibility of creating a chain of
+    dependencies controlled by userspace, which could affect an entire sandbox.
+    For example: a FUSE op can block a syscall, which could be holding a
+    subsystem lock, which can then block another task goroutine.
+
+### Milestones
+
+Below are some broad goals to aim for while implementing FUSE in the sentry.
+Many FUSE ops can be grouped into broad categories of functionality, and most
+ops can be implemented in parallel.
+
+#### Minimal client that can mount a trivial FUSE filesystem.
+
+-   Implement `/dev/fuse` - a character device used to establish an FD for
+    communication between the sentry and the server daemon.
+
+-   Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+
+#### Read-only mount with basic file operations
+
+-   Implement the majority of file, directory and file descriptor FUSE ops. For
+    this milestone, we can skip uncommon or complex operations like mmap, mknod,
+    file locking, poll, and extended attributes. We can stub these out along
+    with any ops that modify the filesystem. The exact list of required ops is
+    to be determined, but the goal is to mount a real filesystem as read-only,
+    and be able to read contents from the filesystem in the sentry.
+
+#### Full read-write support
+
+-   Implement the remaining FUSE ops and decide if we can omit rarely used
+    operations like ioctl.
+
+# Appendix
+
+## FUSE Protocol
+
+The FUSE protocol is a request-response protocol. All requests are initiated by
+the client. The wire-format for the protocol is raw C structs serialized to
+memory.
+
+All FUSE requests begin with the following request header:
+
+```c
+struct fuse_in_header {
+  uint32_t len;       // Length of the request, including this header.
+  uint32_t opcode;    // Requested operation.
+  uint64_t unique;    // A unique identifier for this request.
+  uint64_t nodeid;    // ID of the filesystem object being operated on.
+  uint32_t uid;       // UID of the requesting process.
+  uint32_t gid;       // GID of the requesting process.
+  uint32_t pid;       // PID of the requesting process.
+  uint32_t padding;
+};
+```
+
+The request is then followed by a payload specific to the `opcode`.
+
+All responses begin with this response header:
+
+```c
+struct fuse_out_header {
+  uint32_t len;       // Length of the response, including this header.
+  int32_t  error;     // Status of the request, 0 if success.
+  uint64_t unique;    // The unique identifier from the corresponding request.
+};
+```
+
+The response payload also depends on the request `opcode`. If `error != 0`, the
+response payload must be empty.
+
+### Operations
+
+The following is a list of all FUSE operations used in `fuse_in_header.opcode`
+as of Linux v4.4, and a brief description of their purpose. These are defined in
+`uapi/linux/fuse.h`. Many of these have a corresponding request and response
+payload struct; `fuse(4)` has details for some of these. We also note how these
+operations map to the sentry virtual filesystem.
+
+#### FUSE meta-operations
+
+These operations are specific to FUSE and don't have a corresponding action in a
+generic filesystem.
+
+-   `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
+    first message sent by the client after mount. This is used for version and
+    feature negotiation. This is related to `mount(2)`.
+-   `FUSE_DESTROY`: Tears down a FUSE filesystem; related to `umount(2)`.
+-   `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
+    `fuse_in_header.unique` value provided in the corresponding request header.
+    The client can send at most one of these per request, and will enter an
+    uninterruptible wait for a reply. The server is expected to reply promptly.
+-   `FUSE_FORGET`: A hint to the server that it should evict the indicated
+    node from any caches. This is wired up to `(struct
+    super_operations).evict_inode` in Linux, which is in turn hooked into the
+    inode cache shrinker, which is typically triggered by system memory pressure.
+-   `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
+
+#### Filesystem Syscalls
+
+These FUSE ops map directly to an equivalent filesystem syscall, or family of
+syscalls. The relevant syscalls have a similar name to the operation, unless
+otherwise noted.
+
+Node creation:
+
+-   `FUSE_MKNOD`
+-   `FUSE_MKDIR`
+-   `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
+    atomically create and open a node.
+
+Node attributes and extended attributes:
+
+-   `FUSE_GETATTR`
+-   `FUSE_SETATTR`
+-   `FUSE_SETXATTR`
+-   `FUSE_GETXATTR`
+-   `FUSE_LISTXATTR`
+-   `FUSE_REMOVEXATTR`
+
+Node link manipulation:
+
+-   `FUSE_READLINK`
+-   `FUSE_LINK`
+-   `FUSE_SYMLINK`
+-   `FUSE_UNLINK`
+
+Directory operations:
+
+-   `FUSE_RMDIR`
+-   `FUSE_RENAME`
+-   `FUSE_RENAME2`
+-   `FUSE_OPENDIR`: `open(2)` for directories.
+-   `FUSE_RELEASEDIR`: `close(2)` for directories.
+-   `FUSE_READDIR`
+-   `FUSE_READDIRPLUS`
+-   `FUSE_FSYNCDIR`: `fsync(2)` for directories.
+-   `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
+    reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
+    component to a node. However the returned identifier is opaque to the
+    client. The server must remember this mapping, as this is how the client
+    will reference the node in the future.
+
+File operations:
+
+-   `FUSE_OPEN`: `open(2)` for files.
+-   `FUSE_RELEASE`: `close(2)` for files.
+-   `FUSE_FSYNC`
+-   `FUSE_FALLOCATE`
+-   `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
+-   `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
+
+File locking:
+
+-   `FUSE_GETLK`
+-   `FUSE_SETLK`
+-   `FUSE_SETLKW`
+-   `FUSE_COPY_FILE_RANGE`
+
+File descriptor operations:
+
+-   `FUSE_IOCTL`
+-   `FUSE_POLL`
+-   `FUSE_LSEEK`
+
+Filesystem operations:
+
+-   `FUSE_STATFS`
+
+#### Permissions
+
+-   `FUSE_ACCESS` is used to check if a node is accessible, as part of many
+    syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the
+    sentry.
+
+#### I/O Operations
+
+These ops are used to read and write file pages. They're used to implement I/O
+syscalls like `read(2)` and `write(2)`, as well as `mmap(2)`.
+
+-   `FUSE_READ`
+-   `FUSE_WRITE`
+
+#### Miscellaneous
+
+-   `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
+    closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)`
+    syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the
+    sentry.
+-   `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
+-   `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
+
+# References
+
+-   [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html)
+-   [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html)
+-   [The reference implementation of the Linux FUSE (Filesystem in Userspace)
+    interface](https://github.com/libfuse/libfuse)
+-   [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index a016c896e..51d7368a1 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -640,7 +640,7 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
 
 // WriteOut implements fs.InodeOperations.WriteOut.
 func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
-	if !i.session().cachePolicy.cacheUAttrs(inode) {
+	if inode.MountSource.Flags.ReadOnly || !i.session().cachePolicy.cacheUAttrs(inode) {
 		return nil
 	}
 
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index aabce6cc9..d41d23a43 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
+        "//pkg/iovec",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 62f1246aa..fbfba1b58 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -368,6 +368,9 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
 
 // WriteOut implements fs.InodeOperations.WriteOut.
 func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+	if inode.MountSource.Flags.ReadOnly {
+		return nil
+	}
 	// Have we been using host kernel metadata caches?
 	if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
 		// Then the metadata is already up to date on the host.
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index b6e94583e..cfb089e43 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -39,11 +38,6 @@ import (
 
 // LINT.IfChange
 
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
 // ConnectedEndpoint is a host FD backed implementation of
 // transport.ConnectedEndpoint and transport.Receiver.
 //
@@ -103,10 +97,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
 	if err != nil {
 		return syserr.FromError(err)
 	}
-	if sndbuf > maxSendBufferSize {
-		log.Warningf("Socket send buffer too large: %d", sndbuf)
-		return syserr.ErrInvalidEndpointState
-	}
 
 	c.stype = linux.SockType(stype)
 	c.sndbuf = int64(sndbuf)
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
index 5c18dbd5e..905afb50d 100644
--- a/pkg/sentry/fs/host/socket_iovec.go
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -17,15 +17,12 @@ package host
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/iovec"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // LINT.IfChange
 
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
 // copyToMulti copies as many bytes from src to dst as possible.
 func copyToMulti(dst [][]byte, src []byte) {
 	for _, d := range dst {
@@ -76,7 +73,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
 		}
 	}
 
-	if iovsRequired > maxIovs {
+	if iovsRequired > iovec.MaxIovs {
 		// The kernel will reject our call if we pass this many iovs.
 		// Use a single intermediate buffer instead.
 		b := make([]byte, stopLen)
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index cb91355ab..82a02fcb2 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -308,9 +308,9 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		// No task? Linux does not have an analog for this case, but
-		// tty_check_change is more of a blacklist of cases than a
-		// whitelist, and is surprisingly permissive. Allowing the
-		// change seems most appropriate.
+		// tty_check_change only blocks specific cases and is
+		// surprisingly permissive. Allowing the change seems
+		// appropriate.
 		return nil
 	}
 
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 926538d90..8a5d9c7eb 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -62,7 +62,7 @@ import (
 type LockType int
 
 // UniqueID is a unique identifier of the holder of a regional file lock.
-type UniqueID uint64
+type UniqueID interface{}
 
 const (
 	// ReadLock describes a POSIX regional file lock to be taken
@@ -98,12 +98,7 @@ type Lock struct {
 	// If len(Readers) > 0 then HasWriter must be false.
 	Readers map[UniqueID]bool
 
-	// HasWriter indicates that this is a write lock held by a single
-	// UniqueID.
-	HasWriter bool
-
-	// Writer is only valid if HasWriter is true.  It identifies a
-	// single write lock holder.
+	// Writer holds the writer unique ID. It's nil if there are no writers.
 	Writer UniqueID
 }
 
@@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock {
 	case ReadLock:
 		value.Readers[uid] = true
 	case WriteLock:
-		value.HasWriter = true
 		value.Writer = uid
 	default:
 		panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
@@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock {
 
 // isHeld returns true if uid is a holder of Lock.
 func (l Lock) isHeld(uid UniqueID) bool {
-	if l.HasWriter && l.Writer == uid {
-		return true
-	}
-	return l.Readers[uid]
+	return l.Writer == uid || l.Readers[uid]
 }
 
 // lock sets uid as a holder of a typed lock on Lock.
@@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
 		}
 		// We cannot downgrade a write lock to a read lock unless the
 		// uid is the same.
-		if l.HasWriter {
+		if l.Writer != nil {
 			if l.Writer != uid {
 				panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
 			}
 			// Ensure that there is only one reader if upgrading.
 			l.Readers = make(map[UniqueID]bool)
 			// Ensure that there is no longer a writer.
-			l.HasWriter = false
+			l.Writer = nil
 		}
 		l.Readers[uid] = true
 		return
 	case WriteLock:
 		// If we are already the writer, then this is a no-op.
-		if l.HasWriter && l.Writer == uid {
+		if l.Writer == uid {
 			return
 		}
 		// We can only upgrade a read lock to a write lock if there
@@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
 		}
 		// Ensure that there is only a writer.
 		l.Readers = make(map[UniqueID]bool)
-		l.HasWriter = true
 		l.Writer = uid
 	default:
 		panic(fmt.Sprintf("lock: invalid lock type %d", t))
@@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
 	switch t {
 	case ReadLock:
 		return l.lockable(r, func(value Lock) bool {
-			// If there is no writer, there's no problem adding
-			// another reader.
-			if !value.HasWriter {
+			// If there is no writer, there's no problem adding another reader.
+			if value.Writer == nil {
 				return true
 			}
 			// If there is a writer, then it must be the same uid
@@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
 	case WriteLock:
 		return l.lockable(r, func(value Lock) bool {
 			// If there are only readers.
-			if !value.HasWriter {
-				// Then this uid can only take a write lock if
-				// this is a private upgrade, meaning that the
-				// only reader is uid.
+			if value.Writer == nil {
+				// Then this uid can only take a write lock if this is a private
+				// upgrade, meaning that the only reader is uid.
 				return len(value.Readers) == 1 && value.Readers[uid]
 			}
 			// If the uid is already a writer on this region, then
@@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
 	}
 }
 
-// lock returns true if uid took a lock of type t on the entire range of LockRange.
+// lock returns true if uid took a lock of type t on the entire range of
+// LockRange.
 //
 // Preconditions: r.Start <= r.End (will panic otherwise).
 func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
@@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
 			seg, _ = l.SplitUnchecked(seg, r.End)
 		}
 
-		// Set the lock on the segment.  This is guaranteed to
+		// Set the lock on the segment. This is guaranteed to
 		// always be safe, given canLock above.
 		value := seg.ValuePtr()
 		value.lock(uid, t)
@@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) {
 
 		value := seg.Value()
 		var remove bool
-		if value.HasWriter && value.Writer == uid {
+		if value.Writer == uid {
 			// If we are unlocking a writer, then since there can
 			// only ever be one writer and no readers, then this
 			// lock should always be removed from the set.
diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go
index 8a3ace0c1..50a16e662 100644
--- a/pkg/sentry/fs/lock/lock_set_functions.go
+++ b/pkg/sentry/fs/lock/lock_set_functions.go
@@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock)
 			return Lock{}, false
 		}
 	}
-	if val1.HasWriter != val2.HasWriter {
+	if val1.Writer != val2.Writer {
 		return Lock{}, false
 	}
-	if val1.HasWriter {
-		if val1.Writer != val2.Writer {
-			return Lock{}, false
-		}
-	}
 	return val1, true
 }
 
@@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock)
 	for k, v := range val.Readers {
 		val0.Readers[k] = v
 	}
-	val0.HasWriter = val.HasWriter
 	val0.Writer = val.Writer
 
 	return val, val0
diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go
index ba002aeb7..fad90984b 100644
--- a/pkg/sentry/fs/lock/lock_test.go
+++ b/pkg/sentry/fs/lock/lock_test.go
@@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool {
 		if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) {
 			return false
 		}
-		if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter {
-			return false
-		}
 		if e0[i].Lock.Writer != e1[i].Lock.Writer {
 			return false
 		}
@@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) {
 			LockRange: LockRange{2048, 3072},
 		},
 		{
-			Lock:      Lock{HasWriter: true, Writer: 1},
+			Lock:      Lock{Writer: 1},
 			LockRange: LockRange{3072, 4096},
 		},
 	})
@@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) {
 			// 0                                  max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) {
 					LockRange: LockRange{0, 4096},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{4096, LockEOF},
 				},
 			},
@@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) {
 			// 0          4096                    max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 4096},
 				},
 				{
@@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) {
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, 4096},
 				},
 				{
@@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) {
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, 3072},
 				},
 				{
@@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) {
 			// 0            1024        2048        4096   max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) {
 			// 0            1024                     max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, LockEOF},
 				},
 			},
@@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		l := fill(test.before)
+		t.Run(test.name, func(t *testing.T) {
+			l := fill(test.before)
 
-		r := LockRange{Start: test.start, End: test.end}
-		success := l.lock(test.uid, test.lockType, r)
-		var got []entry
-		for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-			got = append(got, entry{
-				Lock:      seg.Value(),
-				LockRange: seg.Range(),
-			})
-		}
+			r := LockRange{Start: test.start, End: test.end}
+			success := l.lock(test.uid, test.lockType, r)
+			var got []entry
+			for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+				got = append(got, entry{
+					Lock:      seg.Value(),
+					LockRange: seg.Range(),
+				})
+			}
 
-		if success != test.success {
-			t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success)
-			continue
-		}
+			if success != test.success {
+				t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success)
+				return
+			}
 
-		if success {
-			if !equals(got, test.after) {
-				t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
+			if success {
+				if !equals(got, test.after) {
+					t.Errorf("got set %+v, want %+v", got, test.after)
+				}
 			}
-		}
+		})
 	}
 }
 
@@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) {
 			// 0     4096                    max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{4096, LockEOF},
 				},
 			},
@@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) {
 			// 0                          4096
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 4096},
 				},
 			},
@@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) {
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, 4096},
 				},
 				{
@@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) {
 			// 0        1024    4096            max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -991,7 +990,7 @@ func TestUnlock(t *testing.T) {
 			// 0           8    4096            max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 8},
 				},
 				{
@@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) {
 			// 0        1024    4096            max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) {
 			// 0       1024     4096        8192      max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		l := fill(test.before)
+		t.Run(test.name, func(t *testing.T) {
+			l := fill(test.before)
 
-		r := LockRange{Start: test.start, End: test.end}
-		l.unlock(test.uid, r)
-		var got []entry
-		for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-			got = append(got, entry{
-				Lock:      seg.Value(),
-				LockRange: seg.Range(),
-			})
-		}
-		if !equals(got, test.after) {
-			t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
-		}
+			r := LockRange{Start: test.start, End: test.end}
+			l.unlock(test.uid, r)
+			var got []entry
+			for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+				got = append(got, entry{
+					Lock:      seg.Value(),
+					LockRange: seg.Range(),
+				})
+			}
+			if !equals(got, test.after) {
+				t.Errorf("got set %+v, want %+v", got, test.after)
+			}
+		})
 	}
 }
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index b414ddaee..3f2bd0e87 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -17,13 +17,9 @@ package fs
 import (
 	"fmt"
 	"math"
-	"path"
-	"strings"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -625,71 +621,3 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) {
 	defer mns.mu.Unlock()
 	mns.root.SyncAll(ctx)
 }
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) {
-	// Absolute paths can be used directly.
-	if path.IsAbs(name) {
-		return name, nil
-	}
-
-	// Paths with '/' in them should be joined to the working directory, or
-	// to the root if working directory is not set.
-	if strings.IndexByte(name, '/') > 0 {
-		if wd == "" {
-			wd = "/"
-		}
-		if !path.IsAbs(wd) {
-			return "", fmt.Errorf("working directory %q must be absolute", wd)
-		}
-		return path.Join(wd, name), nil
-	}
-
-	// Otherwise, We must lookup the name in the paths, starting from the
-	// calling context's root directory.
-	root := RootFromContext(ctx)
-	if root == nil {
-		// Caller has no root. Don't bother traversing anything.
-		return "", syserror.ENOENT
-	}
-	defer root.DecRef()
-	for _, p := range paths {
-		binPath := path.Join(p, name)
-		traversals := uint(linux.MaxSymlinkTraversals)
-		d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
-		if err == syserror.ENOENT || err == syserror.EACCES {
-			// Didn't find it here.
-			continue
-		}
-		if err != nil {
-			return "", err
-		}
-		defer d.DecRef()
-
-		// Check that it is a regular file.
-		if !IsRegular(d.Inode.StableAttr) {
-			continue
-		}
-
-		// Check whether we can read and execute the found file.
-		if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil {
-			log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
-			continue
-		}
-		return path.Join("/", p, name), nil
-	}
-	return "", syserror.ENOENT
-}
-
-// GetPath returns the PATH as a slice of strings given the environment
-// variables.
-func GetPath(env []string) []string {
-	const prefix = "PATH="
-	for _, e := range env {
-		if strings.HasPrefix(e, prefix) {
-			return strings.Split(strings.TrimPrefix(e, prefix), ":")
-		}
-	}
-	return nil
-}
diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD
index f37f979f1..66e949c95 100644
--- a/pkg/sentry/fs/user/BUILD
+++ b/pkg/sentry/fs/user/BUILD
@@ -4,15 +4,21 @@ package(licenses = ["notice"])
 
 go_library(
     name = "user",
-    srcs = ["user.go"],
+    srcs = [
+        "path.go",
+        "user.go",
+    ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
         "//pkg/sentry/fs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
+        "//pkg/syserror",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
new file mode 100644
index 000000000..397e96045
--- /dev/null
+++ b/pkg/sentry/fs/user/path.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package user
+
+import (
+	"fmt"
+	"path"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ResolveExecutablePath returns the path of the executable named by
+// args.Filename (or args.Argv[0] if Filename is empty), searching PATH if needed.
+func ResolveExecutablePath(ctx context.Context, args *kernel.CreateProcessArgs) (string, error) {
+	name := args.Filename
+	if len(name) == 0 {
+		if len(args.Argv) == 0 {
+			return "", fmt.Errorf("no filename or command provided")
+		}
+		name = args.Argv[0]
+	}
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(name) {
+		return name, nil
+	}
+
+	// Names containing a '/' are joined to the working directory, or to the
+	// root if the working directory is not set.
+	if strings.IndexByte(name, '/') > 0 {
+		wd := args.WorkingDirectory
+		if wd == "" {
+			wd = "/"
+		}
+		if !path.IsAbs(wd) {
+			return "", fmt.Errorf("working directory %q must be absolute", wd)
+		}
+		return path.Join(wd, name), nil
+	}
+
+	// Otherwise, we must look up the name in the directories listed in PATH.
+	paths := getPath(args.Envv)
+	if kernel.VFS2Enabled {
+		f, err := resolveVFS2(ctx, args.Credentials, args.MountNamespaceVFS2, paths, name)
+		if err != nil {
+			return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+		}
+		return f, nil
+	}
+
+	f, err := resolve(ctx, args.MountNamespace, paths, name)
+	if err != nil {
+		return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+	}
+	return f, nil
+}
+
+func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name string) (string, error) {
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// Caller has no root. Don't bother traversing anything.
+		return "", syserror.ENOENT
+	}
+	defer root.DecRef()
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe; no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		traversals := uint(linux.MaxSymlinkTraversals)
+		d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Missing or inaccessible here; try the next PATH entry.
+			continue
+		}
+		if err != nil {
+			return "", err
+		}
+		defer d.DecRef() // Runs when resolve returns, not per iteration.
+
+		// Only regular files qualify as executables.
+		if !fs.IsRegular(d.Inode.StableAttr) {
+			continue
+		}
+
+		// Check whether we can read and execute the found file.
+		if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil {
+			log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
+			continue
+		}
+		return path.Join("/", p, name), nil
+	}
+
+	// No PATH entry yielded a usable executable.
+	return "", syserror.ENOENT
+}
+
+func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
+	root := mns.Root()
+	defer root.DecRef()
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe; no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		pop := &vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               fspath.Parse(binPath),
+			FollowFinalSymlink: true,
+		}
+		opts := &vfs.OpenOptions{
+			FileExec: true,
+			Flags:    linux.O_RDONLY,
+		}
+		dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Missing or inaccessible here; try the next PATH entry.
+			continue
+		}
+		if err != nil {
+			return "", err
+		}
+		dentry.DecRef() // The open was only a probe; its result is not used.
+
+		return binPath, nil
+	}
+
+	// No PATH entry yielded a match.
+	return "", syserror.ENOENT
+}
+
+// getPath returns the colon-separated entries of the first PATH= variable in
+// env, or nil if env has no PATH variable.
+func getPath(env []string) []string {
+	const prefix = "PATH="
+	for _, e := range env {
+		if strings.HasPrefix(e, prefix) {
+			return strings.Split(strings.TrimPrefix(e, prefix), ":")
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index fe7f67c00..f4d525523 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package user contains methods for resolving filesystem paths based on the
+// user and their environment.
 package user
 
 import (
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 585764223..93512c9b6 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -18,6 +18,7 @@ go_library(
         "//pkg/context",
         "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index c03c65445..e6fda2b4f 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -116,6 +116,8 @@ type rootInode struct {
 	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
 
+	locks vfs.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -183,7 +185,7 @@ func (i *rootInode) masterClose(t *Terminal) {
 
 // Open implements kernfs.Inode.Open.
 func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 7a7ce5d81..1081fff52 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
@@ -34,6 +35,8 @@ type masterInode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks vfs.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -55,6 +58,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf
 		inode: mi,
 		t:     t,
 	}
+	fd.LockFD.Init(&mi.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		mi.DecRef()
 		return nil, err
@@ -63,8 +67,8 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf
 }
 
 // Stat implements kernfs.Inode.Stat.
-func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := mi.InodeAttrs.Stat(vfsfs, opts)
+func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
@@ -85,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 type masterFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	inode *masterInode
 	t     *Terminal
@@ -181,7 +186,17 @@ func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatO
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return mfd.inode.Stat(fs, opts)
+	return mfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX by forwarding to mfd.Locks().
+func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by forwarding to mfd.Locks().
+func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence)
 }
 
 // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index 526cd406c..a91cae3ef 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -33,6 +34,8 @@ type slaveInode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks vfs.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -51,6 +54,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs
 	fd := &slaveFileDescription{
 		inode: si,
 	}
+	fd.LockFD.Init(&si.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		si.DecRef()
 		return nil, err
@@ -69,8 +73,8 @@ func (si *slaveInode) Valid(context.Context) bool {
 }
 
 // Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := si.InodeAttrs.Stat(vfsfs, opts)
+func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
@@ -91,6 +95,7 @@ func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 type slaveFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	inode *slaveInode
 }
@@ -127,7 +132,7 @@ func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequen
 	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
 }
 
-// Ioctl implements vfs.FileDescripionImpl.Ioctl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
@@ -178,5 +183,15 @@ func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOp
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.Stat(fs, opts)
+	return sfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX by forwarding to sfd.Locks().
+func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by forwarding to sfd.Locks().
+func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 142ee53b0..d0e06cdc0 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -136,6 +136,8 @@ func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation {
 // CreateDeviceFile creates a device special file at the given pathname in the
 // devtmpfs instance accessed by the Accessor.
 func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error {
+	actx := a.wrapContext(ctx)
+
 	mode := (linux.FileMode)(perms)
 	switch kind {
 	case vfs.BlockDevice:
@@ -145,12 +147,24 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 	default:
 		panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind))
 	}
+
+	// Create any parent directories. See
+	// devtmpfs.c:handle_create()=>path_create().
+	for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
+		pop := a.pathOperationAt(it.String())
+		if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
+			Mode: 0755,
+		}); err != nil {
+			return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
+		}
+	}
+
 	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
 	// create, which it recognizes by storing a pointer to the kdevtmpfs struct
 	// thread in struct inode::i_private. Accessor doesn't yet support deletion
 	// of files at all, and probably won't as long as we don't need to support
 	// kernel modules, so this is moot for now.
-	return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
+	return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
 		Mode:     mode,
 		DevMajor: major,
 		DevMinor: minor,
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index c573d7935..d12d78b84 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -37,6 +37,7 @@ type EventFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// queue is used to notify interested parties when the event object
 	// becomes readable or writable.
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index ff861d0fe..abc610ef3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -54,6 +54,7 @@ go_library(
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
@@ -95,7 +96,7 @@ go_test(
         "//pkg/syserror",
         "//pkg/test/testutil",
         "//pkg/usermem",
-        "@com_github_google_go-cmp//cmp:go_default_library",
-        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index a2d8c3ad6..8bb104ff0 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -58,15 +58,16 @@ var _ io.ReaderAt = (*blockMapFile)(nil)
 
 // newBlockMapFile is the blockMapFile constructor. It initializes the file to
 // physical blocks map with (at most) the first 12 (direct) blocks.
-func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
-	file := &blockMapFile{regFile: regFile}
+func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
+	file := &blockMapFile{}
 	file.regFile.impl = file
+	file.regFile.inode.init(args, &file.regFile)
 
 	for i := uint(0); i < 4; i++ {
-		file.coverage[i] = getCoverage(regFile.inode.blkSize, i)
+		file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i)
 	}
 
-	blkMap := regFile.inode.diskInode.Data()
+	blkMap := file.regFile.inode.diskInode.Data()
 	binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
 	binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
 	binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 181727ef7..6fa84e7aa 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -85,20 +85,6 @@ func (n *blkNumGen) next() uint32 {
 // the inode covers and that is written to disk.
 func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
-	regFile := regularFile{
-		inode: inode{
-			fs: &filesystem{
-				dev: bytes.NewReader(mockDisk),
-			},
-			diskInode: &disklayout.InodeNew{
-				InodeOld: disklayout.InodeOld{
-					SizeLo: getMockBMFileFize(),
-				},
-			},
-			blkSize: uint64(mockBMBlkSize),
-		},
-	}
-
 	var fileData []byte
 	blkNums := newBlkNumGen()
 	var data []byte
@@ -125,9 +111,20 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
 	fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
 
-	copy(regFile.inode.diskInode.Data(), data)
+	args := inodeArgs{
+		fs: &filesystem{
+			dev: bytes.NewReader(mockDisk),
+		},
+		diskInode: &disklayout.InodeNew{
+			InodeOld: disklayout.InodeOld{
+				SizeLo: getMockBMFileFize(),
+			},
+		},
+		blkSize: uint64(mockBMBlkSize),
+	}
+	copy(args.diskInode.Data(), data)
 
-	mockFile, err := newBlockMapFile(regFile)
+	mockFile, err := newBlockMapFile(args)
 	if err != nil {
 		t.Fatalf("newBlockMapFile failed: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index bfbd7c3d4..55902322a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -60,3 +60,20 @@ func (d *dentry) DecRef() {
 	// inode.decRef().
 	d.inode.decRef()
 }
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) OnZeroWatches() {}
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 12b875c8f..357512c7e 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -54,16 +55,15 @@ type directory struct {
 }
 
 // newDirectory is the directory constructor.
-func newDirectory(inode inode, newDirent bool) (*directory, error) {
+func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
 	file := &directory{
-		inode:      inode,
 		childCache: make(map[string]*dentry),
 		childMap:   make(map[string]*dirent),
 	}
-	file.inode.impl = file
+	file.inode.init(args, file)
 
 	// Initialize childList by reading dirents from the underlying file.
-	if inode.diskInode.Flags().Index {
+	if args.diskInode.Flags().Index {
 		// TODO(b/134676337): Support hash tree directories. Currently only the '.'
 		// and '..' entries are read in.
 
@@ -74,7 +74,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
 
 	// The dirents are organized in a linear array in the file data.
 	// Extract the file data and decode the dirents.
-	regFile, err := newRegularFile(inode)
+	regFile, err := newRegularFile(args)
 	if err != nil {
 		return nil, err
 	}
@@ -82,7 +82,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
 	// buf is used as scratch space for reading in dirents from disk and
 	// unmarshalling them into dirent structs.
 	buf := make([]byte, disklayout.DirentSize)
-	size := inode.diskInode.Size()
+	size := args.diskInode.Size()
 	for off, inc := uint64(0), uint64(0); off < size; off += inc {
 		toRead := size - off
 		if toRead > disklayout.DirentSize {
@@ -306,3 +306,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 	fd.off = offset
 	return offset, nil
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 11dcc0346..c36225a7c 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -38,9 +38,10 @@ var _ io.ReaderAt = (*extentFile)(nil)
 // newExtentFile is the extent file constructor. It reads the entire extent
 // tree into memory.
 // TODO(b/134676337): Build extent tree on demand to reduce memory usage.
-func newExtentFile(regFile regularFile) (*extentFile, error) {
-	file := &extentFile{regFile: regFile}
+func newExtentFile(args inodeArgs) (*extentFile, error) {
+	file := &extentFile{}
 	file.regFile.impl = file
+	file.regFile.inode.init(args, &file.regFile)
 	err := file.buildExtTree()
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index a2382daa3..cd10d46ee 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -177,21 +177,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 	t.Helper()
 
 	mockDisk := make([]byte, mockExtentBlkSize*10)
-	mockExtentFile := &extentFile{
-		regFile: regularFile{
-			inode: inode{
-				fs: &filesystem{
-					dev: bytes.NewReader(mockDisk),
-				},
-				diskInode: &disklayout.InodeNew{
-					InodeOld: disklayout.InodeOld{
-						SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
-					},
-				},
-				blkSize: mockExtentBlkSize,
+	mockExtentFile := &extentFile{}
+	args := inodeArgs{
+		fs: &filesystem{
+			dev: bytes.NewReader(mockDisk),
+		},
+		diskInode: &disklayout.InodeNew{
+			InodeOld: disklayout.InodeOld{
+				SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
 			},
 		},
+		blkSize: mockExtentBlkSize,
 	}
+	mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile)
 
 	fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
 
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 92f7da40d..90b086468 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,6 +26,7 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 485f86f4b..30636cf66 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -54,6 +54,8 @@ type inode struct {
 	// diskInode gives us access to the inode struct on disk. Immutable.
 	diskInode disklayout.Inode
 
+	locks vfs.FileLocks
+
 	// This is immutable. The first field of the implementations must have inode
 	// as the first field to ensure temporality.
 	impl interface{}
@@ -115,7 +117,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	}
 
 	// Build the inode based on its type.
-	inode := inode{
+	args := inodeArgs{
 		fs:        fs,
 		inodeNum:  inodeNum,
 		blkSize:   blkSize,
@@ -124,19 +126,19 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 
 	switch diskInode.Mode().FileType() {
 	case linux.ModeSymlink:
-		f, err := newSymlink(inode)
+		f, err := newSymlink(args)
 		if err != nil {
 			return nil, err
 		}
 		return &f.inode, nil
 	case linux.ModeRegular:
-		f, err := newRegularFile(inode)
+		f, err := newRegularFile(args)
 		if err != nil {
 			return nil, err
 		}
 		return &f.inode, nil
 	case linux.ModeDirectory:
-		f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType)
 		if err != nil {
 			return nil, err
 		}
@@ -147,6 +149,21 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	}
 }
 
+type inodeArgs struct {
+	fs        *filesystem
+	inodeNum  uint32
+	blkSize   uint64
+	diskInode disklayout.Inode
+}
+
+func (in *inode) init(args inodeArgs, impl interface{}) {
+	in.fs = args.fs
+	in.inodeNum = args.inodeNum
+	in.blkSize = args.blkSize
+	in.diskInode = args.diskInode
+	in.impl = impl
+}
+
 // open creates and returns a file description for the dentry passed in.
 func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
@@ -157,6 +174,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
+		fd.LockFD.Init(&in.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -168,6 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
+		fd.LockFD.Init(&in.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -178,6 +197,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
+		fd.LockFD.Init(&in.locks)
 		fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index 30135ddb0..66d14bb95 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -43,28 +44,19 @@ type regularFile struct {
 
 // newRegularFile is the regularFile constructor. It figures out what kind of
 // file this is and initializes the fileReader.
-func newRegularFile(inode inode) (*regularFile, error) {
-	regFile := regularFile{
-		inode: inode,
-	}
-
-	inodeFlags := inode.diskInode.Flags()
-
-	if inodeFlags.Extents {
-		file, err := newExtentFile(regFile)
+func newRegularFile(args inodeArgs) (*regularFile, error) {
+	if args.diskInode.Flags().Extents {
+		file, err := newExtentFile(args)
 		if err != nil {
 			return nil, err
 		}
-
-		file.regFile.inode.impl = &file.regFile
 		return &file.regFile, nil
 	}
 
-	file, err := newBlockMapFile(regFile)
+	file, err := newBlockMapFile(args)
 	if err != nil {
 		return nil, err
 	}
-	file.regFile.inode.impl = &file.regFile
 	return &file.regFile, nil
 }
 
@@ -77,6 +69,7 @@ func (in *inode) isRegular() bool {
 // vfs.FileDescriptionImpl.
 type regularFileFD struct {
 	fileDescription
+	vfs.LockFD
 
 	// off is the file offset. off is accessed using atomic memory operations.
 	off int64
@@ -157,3 +150,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
 	// TODO(b/134676337): Implement mmap(2).
 	return syserror.ENODEV
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 1447a4dc1..62efd4095 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -30,18 +30,17 @@ type symlink struct {
 
 // newSymlink is the symlink constructor. It reads out the symlink target from
 // the inode (however it might have been stored).
-func newSymlink(inode inode) (*symlink, error) {
-	var file *symlink
+func newSymlink(args inodeArgs) (*symlink, error) {
 	var link []byte
 
 	// If the symlink target is lesser than 60 bytes, its stores in inode.Data().
 	// Otherwise either extents or block maps will be used to store the link.
-	size := inode.diskInode.Size()
+	size := args.diskInode.Size()
 	if size < 60 {
-		link = inode.diskInode.Data()[:size]
+		link = args.diskInode.Data()[:size]
 	} else {
 		// Create a regular file out of this inode and read out the target.
-		regFile, err := newRegularFile(inode)
+		regFile, err := newRegularFile(args)
 		if err != nil {
 			return nil, err
 		}
@@ -52,8 +51,8 @@ func newSymlink(inode inode) (*symlink, error) {
 		}
 	}
 
-	file = &symlink{inode: inode, target: string(link)}
-	file.inode.impl = file
+	file := &symlink{target: string(link)}
+	file.inode.init(args, file)
 	return file, nil
 }
 
@@ -67,6 +66,7 @@ func (in *inode) isSymlink() bool {
 // O_PATH. For this reason most of the functions return EBADF.
 type symlinkFD struct {
 	fileDescription
+	vfs.NoLockFD
 }
 
 // Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
new file mode 100644
index 000000000..999111deb
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -0,0 +1,63 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "request_list",
+    out = "request_list.go",
+    package = "fuse",
+    prefix = "request",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Request",
+        "Linker": "*Request",
+    },
+)
+
+go_library(
+    name = "fuse",
+    srcs = [
+        "connection.go",
+        "dev.go",
+        "fusefs.go",
+        "init.go",
+        "register.go",
+        "request_list.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "fuse_test",
+    size = "small",
+    srcs = ["dev_test.go"],
+    library = ":fuse",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
new file mode 100644
index 000000000..6df2728ab
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -0,0 +1,437 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+const (
+	// fuseDefaultMaxBackground is the default value for MaxBackground.
+	fuseDefaultMaxBackground = 12
+
+	// fuseDefaultCongestionThreshold is the default value for CongestionThreshold,
+	// and is 75% of the default value of MaxBackground.
+	fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4)
+
+	// fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq.
+	fuseDefaultMaxPagesPerReq = 32
+)
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+	requestEntry
+
+	id   linux.FUSEOpID
+	hdr  *linux.FUSEHeaderIn
+	data []byte
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+	opcode linux.FUSEOpcode
+	hdr    linux.FUSEHeaderOut
+	data   []byte
+}
+
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
+type connection struct {
+	fd *DeviceFD
+
+	// The following FUSE_INIT flags are currently unsupported by this implementation:
+	// - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
+	// - FUSE_EXPORT_SUPPORT
+	// - FUSE_HANDLE_KILLPRIV
+	// - FUSE_POSIX_LOCKS: requires POSIX locks
+	// - FUSE_FLOCK_LOCKS: requires POSIX locks
+	// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
+	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
+	// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
+	// - FUSE_ASYNC_DIO
+	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+
+	// initialized after receiving FUSE_INIT reply.
+	// Until it's set, suspend sending FUSE requests.
+	// Use SetInitialized() and IsInitialized() for atomic access.
+	initialized int32
+
+	// initializedChan is used to block requests before initialization.
+	initializedChan chan struct{}
+
+	// blocked when there are too many outstanding background requests (NumBackground == MaxBackground).
+	// TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
+	blocked bool
+
+	// connected (connection established) when a new FUSE file system is created.
+	// Set to false when:
+	//   umount,
+	//   connection abort,
+	//   device release.
+	connected bool
+
+	// aborted via sysfs.
+	// TODO(gvisor.dev/issue/3185): abort all queued requests.
+	aborted bool
+
+	// connInitError if FUSE_INIT encountered error (major version mismatch).
+	// Only set in INIT.
+	connInitError bool
+
+	// connInitSuccess if FUSE_INIT is successful.
+	// Only set in INIT.
+	// Used for destroy.
+	connInitSuccess bool
+
+	// TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
+
+	// numBackground is the number of requests in the background.
+	numBackground uint16
+
+	// congestionThreshold for NumBackground.
+	// Negotiated in FUSE_INIT.
+	congestionThreshold uint16
+
+	// maxBackground is the maximum number of NumBackground.
+	// Block connection when it is reached.
+	// Negotiated in FUSE_INIT.
+	maxBackground uint16
+
+	// numActiveBackground is the number of background requests that have been marked as active.
+	numActiveBackground uint16
+
+	// numWaiting is the number of requests waiting for completion.
+	numWaiting uint32
+
+	// TODO(gvisor.dev/issue/3185): BgQueue
+	// some queue for background queued requests.
+
+	// bgLock protects:
+	// MaxBackground, CongestionThreshold, NumBackground,
+	// NumActiveBackground, BgQueue, Blocked.
+	bgLock sync.Mutex
+
+	// maxRead is the maximum size of a read buffer in bytes.
+	maxRead uint32
+
+	// maxWrite is the maximum size of a write buffer in bytes.
+	// Negotiated in FUSE_INIT.
+	maxWrite uint32
+
+	// maxPages is the maximum number of pages for a single request to use.
+	// Negotiated in FUSE_INIT.
+	maxPages uint16
+
+	// minor version of the FUSE protocol.
+	// Negotiated and only set in INIT.
+	minor uint32
+
+	// asyncRead if read pages asynchronously.
+	// Negotiated and only set in INIT.
+	asyncRead bool
+
+	// abortErr is true if the kernel needs to return a unique read error after abort.
+	// Negotiated and only set in INIT.
+	abortErr bool
+
+	// writebackCache is true for write-back cache policy,
+	// false for write-through policy.
+	// Negotiated and only set in INIT.
+	writebackCache bool
+
+	// cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
+	// Negotiated and only set in INIT.
+	cacheSymlinks bool
+
+	// bigWrites if doing multi-page cached writes.
+	// Negotiated and only set in INIT.
+	bigWrites bool
+
+	// dontMask if filesystem does not apply umask to creation modes.
+	// Negotiated in INIT.
+	dontMask bool
+}
+
+// newFUSEConnection creates a FUSE connection to fd.
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+	// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
+	// mount a FUSE filesystem.
+	fuseFD := fd.Impl().(*DeviceFD)
+	fuseFD.mounted = true
+
+	// Create the writeBuf for the header to be stored in.
+	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+	fuseFD.writeBuf = make([]byte, hdrLen)
+	fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
+	fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+	fuseFD.writeCursor = 0
+
+	return &connection{
+		fd:                  fuseFD,
+		maxBackground:       fuseDefaultMaxBackground,
+		congestionThreshold: fuseDefaultCongestionThreshold,
+		maxPages:            fuseDefaultMaxPagesPerReq,
+		initializedChan:     make(chan struct{}),
+		connected:           true,
+	}, nil
+}
+
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+	// Unblock the requests sent before INIT.
+	close(conn.initializedChan)
+
+	// Close the channel first to avoid the non-atomic situation
+	// where conn.initialized is true but there are
+	// tasks being blocked on the channel.
+	// And it prevents the newer tasks from gaining
+	// unnecessary higher chance to be issued before the blocked one.
+
+	atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// Initialized atomically checks whether the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+	return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+	hdr := linux.FUSEHeaderIn{
+		Len:    uint32(hdrLen + payload.SizeBytes()),
+		Opcode: opcode,
+		Unique: conn.fd.nextOpID,
+		NodeID: ino,
+		UID:    uint32(creds.EffectiveKUID),
+		GID:    uint32(creds.EffectiveKGID),
+		PID:    pid,
+	}
+
+	buf := make([]byte, hdr.Len)
+	hdr.MarshalUnsafe(buf[:hdrLen])
+	payload.MarshalUnsafe(buf[hdrLen:])
+
+	return &Request{
+		id:   hdr.Unique,
+		hdr:  &hdr,
+		data: buf,
+	}, nil
+}
+
+// Call makes a request to the server and blocks the invoking task until a
+// server responds with a response. Task should never be nil.
+// Requests will not be sent before the connection is initialized.
+// For async tasks, use CallAsync().
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
+	// Block requests sent before connection is initialized.
+	if !conn.Initialized() {
+		if err := t.Block(conn.initializedChan); err != nil {
+			return nil, err
+		}
+	}
+
+	return conn.call(t, r)
+}
+
+// CallAsync makes an async (aka background) request.
+// Those requests either do not expect a response (e.g. release) or
+// the response should be handled by others (e.g. init).
+// Return immediately unless the connection is blocked (before initialization).
+// Async call example: init, release, forget, aio, interrupt.
+// When the Request is FUSE_INIT, it will not be blocked before initialization.
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+	// Block requests sent before connection is initialized.
+	if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
+		if err := t.Block(conn.initializedChan); err != nil {
+			return err
+		}
+	}
+
+	// This should be the only place that invokes call() with a nil task.
+	_, err := conn.call(nil, r)
+	return err
+}
+
+// call makes a call without blocking checks.
+func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
+	if !conn.connected {
+		return nil, syserror.ENOTCONN
+	}
+
+	if conn.connInitError {
+		return nil, syserror.ECONNREFUSED
+	}
+
+	fut, err := conn.callFuture(t, r)
+	if err != nil {
+		return nil, err
+	}
+
+	return fut.resolve(t)
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+	errno := r.hdr.Error
+	if errno >= 0 {
+		return nil
+	}
+
+	sysErrNo := syscall.Errno(-errno)
+	return error(sysErrNo)
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+	hdrLen := r.hdr.SizeBytes()
+	haveDataLen := r.hdr.Len - uint32(hdrLen)
+	wantDataLen := uint32(m.SizeBytes())
+
+	if haveDataLen < wantDataLen {
+		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
+	}
+
+	m.UnmarshalUnsafe(r.data[hdrLen:])
+	return nil
+}
+
+// callFuture makes a request to the server and returns a future response.
+// Call resolve() when the response needs to be fulfilled.
+func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+
+	// Is the queue full?
+	//
+	// We must busy wait here until the request can be queued. We don't
+	// block on the fd.fullQueueCh with a lock - so after being signalled,
+	// before we acquire the lock, it is possible that a barging task enters
+	// and queues a request. As a result, upon acquiring the lock we must
+	// again check if the room is available.
+	//
+	// This can potentially starve a request forever but this can only happen
+	// if there are always too many ongoing requests all the time. The
+	// supported maxActiveRequests setting should be really high to avoid this.
+	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+		if t == nil {
+			// Since there is no task that is waiting, we must error out.
+			return nil, errors.New("FUSE request queue full")
+		}
+
+		log.Infof("Blocking request %v from being queued. Too many active requests: %v",
+			r.id, conn.fd.numActiveRequests)
+		conn.fd.mu.Unlock()
+		err := t.Block(conn.fd.fullQueueCh)
+		conn.fd.mu.Lock()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return conn.callFutureLocked(t, r)
+}
+
+// callFutureLocked makes a request to the server and returns a future response.
+func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+	conn.fd.queue.PushBack(r)
+	conn.fd.numActiveRequests += 1
+	fut := newFutureResponse(r.hdr.Opcode)
+	conn.fd.completions[r.id] = fut
+
+	// Signal the readers that there is something to read.
+	conn.fd.waitQueue.Notify(waiter.EventIn)
+
+	return fut, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+	opcode linux.FUSEOpcode
+	ch     chan struct{}
+	hdr    *linux.FUSEHeaderOut
+	data   []byte
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
+	return &futureResponse{
+		opcode: opcode,
+		ch:     make(chan struct{}),
+	}
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+	// If there is no Task associated with this request  - then we don't try to resolve
+	// the response.  Instead, the task writing the response (proxy to the server) will
+	// process the response on our behalf.
+	if t == nil {
+		log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
+		return nil, nil
+	}
+
+	if err := t.Block(f.ch); err != nil {
+		return nil, err
+	}
+
+	return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+	return &Response{
+		opcode: f.opcode,
+		hdr:    *f.hdr,
+		data:   f.data,
+	}
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
new file mode 100644
index 000000000..2225076bc
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -0,0 +1,397 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const fuseDevMinor = 229 // presumably the device minor /dev/fuse is registered under — confirm at the registration site
+
+// fuseDevice implements vfs.Device for /dev/fuse. It carries no state of its own.
+type fuseDevice struct{}
+
+// Open implements vfs.Device.Open; it fails with ENOENT while FUSE is disabled.
+func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if !kernel.FUSEEnabled {
+		return nil, syserror.ENOENT
+	}
+
+	devFD := &DeviceFD{}
+	fdOpts := vfs.FileDescriptionOptions{UseDentryMetadata: true}
+	initErr := devFD.vfsfd.Init(devFD, opts.Flags, mnt, vfsd, &fdOpts)
+	if initErr != nil {
+		return nil, initErr
+	}
+	return &devFD.vfsfd, nil
+}
+
+// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+type DeviceFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
+	mounted bool
+
+	// nextOpID is used to create new requests.
+	nextOpID linux.FUSEOpID
+
+	// queue is the list of requests that need to be processed by the FUSE server.
+	queue requestList
+
+	// numActiveRequests is the number of requests made by the Sentry that have
+	// yet to be responded to.
+	numActiveRequests uint64
+
+	// completions is used to map a request to its response. A Writer will use this
+	// to notify the caller of a completed response.
+	completions map[linux.FUSEOpID]*futureResponse
+
+	writeCursor uint32 // bytes of the response currently being written that have been consumed so far
+
+	// writeBuf is the memory buffer used to copy in the FUSE out header from
+	// userspace.
+	writeBuf []byte
+
+	// writeCursorFR is the futureResponse currently being copied in from the server; nil between responses.
+	writeCursorFR *futureResponse
+
+	// mu protects all the queues, maps, buffers, cursors and nextOpID.
+	mu sync.Mutex
+
+	// waitQueue is used to notify interested parties when the device becomes
+	// readable or writable.
+	waitQueue waiter.Queue
+
+	// fullQueueCh is a channel used to synchronize the readers with the writers.
+	// Writers (inbound requests to the filesystem) block if there are too many
+	// unprocessed in-flight requests.
+	fullQueueCh chan struct{}
+
+	// fs is the FUSE filesystem that this FD is being used for; set by NewFUSEFilesystem.
+	fs *filesystem
+}
+
+// Release implements vfs.FileDescriptionImpl.Release; fd.fs is nil if the FD was never mounted.
+func (fd *DeviceFD) Release() {
+	if fd.fs != nil {
+		fd.fs.conn.connected = false
+	}
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead: EPERM until mounted, then ENOSYS.
+func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+	return 0, syserror.ENOSYS
+}
+
+// Read implements vfs.FileDescriptionImpl.Read. It validates the size of dst
+// and then serves the next queued request through readLocked under fd.mu.
+func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	// The read buffer must hold the fixed parts of any request (Linux sizes
+	// this as the request header plus the FUSEWriteIn header) followed by up
+	// to the negotiated MaxWrite bytes of data, and is never allowed to be
+	// smaller than FUSE_MIN_READ_BUFFER.
+	hdrBytes := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+	hdrBytes += uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
+	required := linux.FUSE_MIN_READ_BUFFER
+	if negotiated := hdrBytes + fd.fs.conn.maxWrite; required < negotiated {
+		required = negotiated
+	}
+
+	if dst.NumBytes() < int64(required) {
+		// The read buffer is too small, error out.
+		return 0, syserror.EINVAL
+	}
+
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.readLocked(ctx, dst, opts)
+}
+
+// readLocked copies the request at the front of the queue into dst while locked with DeviceFD.mu.
+func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	if fd.queue.Empty() {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var readCursor uint32
+	var bytesRead int64
+	for {
+		req := fd.queue.Front()
+		if dst.NumBytes() < int64(req.hdr.Len) {
+			// The request does not fit in the reader's buffer and cannot be processed. All
+			// requests must be smaller than the negotiated size as specified by
+			// Connection.MaxWrite set as part of the FUSE_INIT handshake.
+			errno := -int32(syscall.EIO)
+			if req.hdr.Opcode == linux.FUSE_SETXATTR {
+				errno = -int32(syscall.E2BIG)
+			}
+
+			// Fail the request: complete its future with the error for the calling task.
+			if err := fd.sendError(ctx, errno, req); err != nil {
+				return 0, err
+			}
+
+			// We're done with this request.
+			fd.queue.Remove(req)
+
+			// Restart the read with the next queued request (ErrWouldBlock if none is left).
+			log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
+			return fd.readLocked(ctx, dst, opts)
+		}
+
+		n, err := dst.CopyOut(ctx, req.data[readCursor:]) // NOTE(review): dst is not advanced between iterations — confirm a partial copy cannot happen
+		if err != nil {
+			return 0, err
+		}
+		readCursor += uint32(n)
+		bytesRead += int64(n)
+
+		if readCursor >= req.hdr.Len {
+			// Fully done with this req, remove it from the queue.
+			fd.queue.Remove(req)
+			break
+		}
+	}
+
+	return bytesRead, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite. Positional writes are not
+// supported: EPERM before a mount exists, ENOSYS afterwards.
+func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if fd.mounted {
+		return 0, syserror.ENOSYS
+	}
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	return 0, syserror.EPERM
+}
+
+// Write implements vfs.FileDescriptionImpl.Write by running writeLocked with fd.mu held.
+func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.writeLocked(ctx, src, opts)
+}
+
+// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
+// src carries a response from the FUSE server: a FUSEHeaderOut followed by the
+// payload, hdr.Len bytes in total. A response may be split across several
+// writes; fd.writeCursor and fd.writeCursorFR carry the progress between calls.
+// A response is handed to sendResponse once all of its bytes have arrived.
+// It returns the number of bytes consumed from src, or 0 and an error.
+func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	var n int64
+	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+
+	for src.NumBytes() > 0 {
+		if fd.writeCursorFR != nil {
+			// Already have common header, and we're now copying the payload.
+			wantBytes := fd.writeCursorFR.hdr.Len
+
+			// data is sized for the whole response; the payload lands after
+			// the first hdrLen bytes, which are never written here.
+			if fd.writeCursorFR.data == nil {
+				fd.writeCursorFR.data = make([]byte, wantBytes)
+			}
+
+			bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
+			if err != nil {
+				return 0, err
+			}
+			src = src.DropFirst(bytesCopied)
+
+			n += int64(bytesCopied)
+			fd.writeCursor += uint32(bytesCopied)
+			if fd.writeCursor == wantBytes {
+				// Done reading this full response.
+				break
+			}
+
+			// Check if we have more data in src.
+			continue
+		}
+
+		// Assert that the header isn't read into the writeBuf yet.
+		if fd.writeCursor >= hdrLen {
+			return 0, syserror.EINVAL
+		}
+
+		// We don't have the full common response header yet. Resume filling
+		// writeBuf where the previous write stopped; the header ends at hdrLen.
+		bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:hdrLen])
+		if err != nil {
+			return 0, err
+		}
+		src = src.DropFirst(bytesCopied)
+
+		n += int64(bytesCopied)
+		fd.writeCursor += uint32(bytesCopied)
+		if fd.writeCursor == hdrLen {
+			// Have full header in the writeBuf. Use it to fetch the actual futureResponse
+			// from the device's completions map.
+			var hdr linux.FUSEHeaderOut
+			hdr.UnmarshalBytes(fd.writeBuf)
+
+			fut, ok := fd.completions[hdr.Unique]
+			if !ok || hdr.Len < hdrLen {
+				// Either the server answered a request we never sent, or it
+				// claims a response shorter than its own header. Discard the
+				// buffered header so that the next write starts a fresh
+				// response instead of failing the assertion above forever.
+				fd.writeCursor = 0
+				return 0, syserror.EINVAL
+			}
+			delete(fd.completions, hdr.Unique)
+
+			// Copy over the header into the future response. The payload, if
+			// any, is copied into the FR's data by the next iteration.
+			fut.hdr = &hdr
+			fd.writeCursorFR = fut
+		}
+	}
+
+	// Deliver the response only once all hdr.Len bytes have been received. A
+	// response whose payload has not fully arrived stays pending, with the
+	// cursors kept, so that a later write can supply the remainder.
+	if fut := fd.writeCursorFR; fut != nil && fd.writeCursor == fut.hdr.Len {
+		if err := fd.sendResponse(ctx, fut); err != nil {
+			return 0, err
+		}
+		// Ready the device for the next request.
+		fd.writeCursorFR = nil
+		fd.writeCursor = 0
+	}
+
+	return n, nil
+}
+
+// Readiness implements vfs.FileDescriptionImpl.Readiness. The device always
+// reports writable, and readable whenever requests are queued.
+func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// FD is always writable.
+	events := waiter.EventMask(waiter.EventOut)
+	if !fd.queue.Empty() {
+		// Have reqs available, FD is readable.
+		events |= waiter.EventIn
+	}
+	return events & mask
+}
+
+// EventRegister implements waiter.Waitable.EventRegister by registering e on the device's wait queue.
+func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.waitQueue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister by removing e from the device's wait queue.
+func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
+	fd.waitQueue.EventUnregister(e)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. Seeking is not supported:
+// it fails with EPERM before a mount exists and with ENOSYS afterwards.
+func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if fd.mounted {
+		return 0, syserror.ENOSYS
+	}
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	return 0, syserror.EPERM
+}
+
+// sendResponse finishes fut: it runs any receiver-less action, frees a request slot and wakes the waiter.
+func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
+	// fut has just been fully written, so getResponse yields a populated
+	// response; let the running task act on it first if nobody else will.
+	resp := fut.getResponse()
+	if err := fd.noReceiverAction(ctx, resp); err != nil {
+		return err
+	}
+
+	// Signal, without blocking, that the queue is no longer full.
+	select {
+	case fd.fullQueueCh <- struct{}{}:
+	default:
+	}
+	fd.numActiveRequests--
+
+	// Wake the task waiting on this response.
+	close(fut.ch)
+	return nil
+}
+
+// sendError completes the pending future of req with a header-only response
+// carrying errno, so that the waiting task (if any) sees the failure. It
+// returns EINVAL when no future is registered for req.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+	// Claim the future registered under the request's unique ID.
+	unique := req.hdr.Unique
+	fut, found := fd.completions[unique]
+	if !found {
+		// There is no pending future for this request.
+		return syserror.EINVAL
+	}
+	delete(fd.completions, unique)
+
+	// Build the error reply: a bare out-header whose Len covers only the
+	// header itself, since no payload follows.
+	fut.hdr = &linux.FUSEHeaderOut{
+		Len:    uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()),
+		Error:  errno,
+		Unique: unique,
+	}
+
+	// Return the error to the calling task.
+	return fd.sendResponse(ctx, fut)
+}
+
+// noReceiverAction lets the calling kernel.Task act on a response when it is
+// known that no receiver is going to wait on the future's channel. At present
+// only FUSE_INIT responses are handled; everything else is a no-op.
+func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
+	if r.opcode != linux.FUSE_INIT {
+		return nil
+	}
+	creds := auth.CredentialsFromContext(ctx)
+	rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
+	hasSysAdmin := creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)
+	return fd.fs.conn.InitRecv(r, hasSysAdmin)
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
new file mode 100644
index 000000000..84c222ad6
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// echoTestOpcode is the Opcode used during testing. The server used in tests
+// will simply echo the payload back with the appropriate headers.
+const echoTestOpcode linux.FUSEOpcode = 1000
+
+type testPayload struct {
+	data uint32 // value echoed by the test server; marshalled as 4 bytes (see SizeBytes)
+}
+
+// TestFUSECommunication tests that the communication layer between the Sentry and the
+// FUSE server daemon works as expected.
+func TestFUSECommunication(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	creds := auth.CredentialsFromContext(s.Ctx)
+
+	// Create test cases with different numbers of concurrent clients and servers.
+	testCases := []struct {
+		Name              string
+		NumClients        int
+		NumServers        int
+		MaxActiveRequests uint64
+	}{
+		{
+			Name:              "SingleClientSingleServer",
+			NumClients:        1,
+			NumServers:        1,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "SingleClientMultipleServers",
+			NumClients:        1,
+			NumServers:        10,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "MultipleClientsSingleServer",
+			NumClients:        10,
+			NumServers:        1,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "MultipleClientsMultipleServers",
+			NumClients:        10,
+			NumServers:        10,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "RequestCapacityFull",
+			NumClients:        10,
+			NumServers:        1,
+			MaxActiveRequests: 1,
+		},
+		{
+			Name:              "RequestCapacityContinuouslyFull",
+			NumClients:        100,
+			NumServers:        2,
+			MaxActiveRequests: 2,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.Name, func(t *testing.T) {
+			conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests)
+			if err != nil {
+				t.Fatalf("newTestConnection: %v", err)
+			}
+
+			clientsDone := make([]chan struct{}, testCase.NumClients)
+			serversDone := make([]chan struct{}, testCase.NumServers)
+			serversKill := make([]chan struct{}, testCase.NumServers)
+
+			// FUSE clients: each runs on its own goroutine and signals clientsDone[i] when it returns.
+			for i := 0; i < testCase.NumClients; i++ {
+				clientsDone[i] = make(chan struct{})
+				go func(i int) {
+					fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i])
+				}(i)
+			}
+
+			// FUSE servers: each runs on its own goroutine and signals serversDone[j] when it returns.
+			for j := 0; j < testCase.NumServers; j++ {
+				serversDone[j] = make(chan struct{})
+				serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block.
+				go func(j int) {
+					fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j])
+				}(j)
+			}
+
+			// Tear down.
+			//
+			// Make sure all the clients are done.
+			for i := 0; i < testCase.NumClients; i++ {
+				<-clientsDone[i]
+			}
+
+			// Kill any server that is potentially waiting for a request.
+			for j := 0; j < testCase.NumServers; j++ {
+				serversKill[j] <- struct{}{}
+			}
+
+			// Make sure all the servers are done.
+			for j := 0; j < testCase.NumServers; j++ {
+				<-serversDone[j]
+			}
+		})
+	}
+}
+
+// CallTest makes a request to the server and blocks the invoking
+// goroutine until a server responds with a response. Doesn't block
+// a kernel.Task. Analogous to Connection.Call but used for testing.
+// The i parameter is not used.
+func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) {
+	conn.fd.mu.Lock()
+
+	// While the number of active requests is at the configured maximum,
+	// drop the lock, wait for the device to signal that the queue is no
+	// longer full, and then re-check under the lock.
+	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+		conn.fd.mu.Unlock()
+		<-conn.fd.fullQueueCh
+		conn.fd.mu.Lock()
+	}
+
+	// Queue the request while still holding the lock. The task is only
+	// passed along; it is never blocked here.
+	pending, err := conn.callFutureLocked(t, r)
+	conn.fd.mu.Unlock()
+
+	if err != nil {
+		return nil, err
+	}
+
+	// Park this goroutine, rather than a kernel.Task, until the device closes
+	// the future's channel.
+	<-pending.ch
+
+	// A response is ready. Resolve and return it.
+	return pending.getResponse(), nil
+}
+
+// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE
+// device, but waits on a channel instead of blocking the calling task. It
+// returns the bytes read, whether the server was killed while waiting, and the
+// error of the final DeviceFD.Read (never ErrWouldBlock, which is retried).
+func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) {
+	var err error
+	var n, total int64
+
+	dev := fd.Impl().(*DeviceFD)
+
+	// Register for notifications. Unregistering is deferred so that the entry is
+	// also removed from the wait queue when the server is killed mid-wait.
+	w, ch := waiter.NewChannelEntry(nil)
+	dev.EventRegister(&w, waiter.EventIn)
+	defer dev.EventUnregister(&w)
+	for {
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{})
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Emulate blocking while no requests are available: wait to be told to retry.
+		select {
+		case <-ch:
+		case <-killServer:
+			// Server killed by the main program.
+			return 0, true, nil
+		}
+	}
+
+	return total, false, err
+}
+
+// fuseClientRun emulates a normal FUSE request: it builds a header and a payload,
+// calls the server, waits for the response and checks it. It runs on its own
+// goroutine, so failures use t.Errorf followed by return instead of t.Fatalf.
+func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) {
+	defer func() { clientDone <- struct{}{} }()
+
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+	testObj := &testPayload{data: rand.Uint32()}
+
+	req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj)
+	if err != nil {
+		t.Errorf("NewRequest creation failed: %v", err)
+		return
+	}
+
+	// Queue up a request.
+	// Analogous to Call except it doesn't block on the task.
+	resp, err := CallTest(conn, clientTask, req, pid)
+	if err != nil {
+		t.Errorf("CallTest failed: %v", err)
+		return
+	}
+	if err = resp.Error(); err != nil {
+		t.Errorf("Server responded with an error: %v", err)
+		return
+	}
+
+	var respTestPayload testPayload
+	if err := resp.UnmarshalPayload(&respTestPayload); err != nil {
+		t.Errorf("Unmarshalling payload error: %v", err)
+		return
+	}
+	if resp.hdr.Unique != req.hdr.Unique {
+		t.Errorf("got response for another request. Expected response for req %v but got response for req %v",
+			req.hdr.Unique, resp.hdr.Unique)
+		return
+	}
+	if respTestPayload.data != testObj.data {
+		t.Errorf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data)
+	}
+}
+
+// fuseServerRun creates a task and emulates a simple FUSE server that reads a request
+// and echos the same struct back as a response using the appropriate headers. It runs
+// on its own goroutine, so failures use t.Errorf followed by return instead of t.Fatalf.
+func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) {
+	defer func() { serverDone <- struct{}{} }()
+
+	// Create the tasks that the server will be using.
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	var readPayload testPayload
+	serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+
+	// Read the request.
+	for {
+		inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+		payloadLen := uint32(readPayload.SizeBytes())
+
+		// The read buffer must meet some certain size criteria.
+		buffSize := inHdrLen + payloadLen
+		if buffSize < linux.FUSE_MIN_READ_BUFFER {
+			buffSize = linux.FUSE_MIN_READ_BUFFER
+		}
+		inBuf := make([]byte, buffSize)
+		inIOseq := usermem.BytesIOSequence(inBuf)
+
+		n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer)
+		if err != nil {
+			t.Errorf("Read failed :%v", err)
+			return
+		}
+
+		// Server should shut down. No new requests are going to be made.
+		if serverKilled {
+			break
+		}
+		if n <= 0 {
+			t.Errorf("Read read no bytes")
+			return
+		}
+
+		var readFUSEHeaderIn linux.FUSEHeaderIn
+		readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen])
+		readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen])
+		if readFUSEHeaderIn.Opcode != echoTestOpcode {
+			t.Errorf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload)
+			return
+		}
+
+		// Write the response.
+		outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+		outBuf := make([]byte, outHdrLen+payloadLen)
+		outHeader := linux.FUSEHeaderOut{
+			Len:    outHdrLen + payloadLen,
+			Error:  0,
+			Unique: readFUSEHeaderIn.Unique,
+		}
+
+		// Echo the payload back.
+		outHeader.MarshalUnsafe(outBuf[:outHdrLen])
+		readPayload.MarshalUnsafe(outBuf[outHdrLen:])
+		outIOseq := usermem.BytesIOSequence(outBuf)
+		if _, err := fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{}); err != nil {
+			t.Errorf("Write failed :%v", err)
+			return
+		}
+	}
+}
+
+// setup boots a test kernel, registers the fuse filesystem type with its VFS
+// and returns a System that uses a freshly created tmpfs mount namespace.
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+	ctx := k.SupervisorContext()
+
+	// Make the fuse filesystem type known to the VFS.
+	regOpts := vfs.RegisterFilesystemTypeOptions{AllowUserList: true, AllowUserMount: true}
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &regOpts)
+
+	creds := auth.CredentialsFromContext(ctx)
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection builds a FUSE connection for the sentry side together with
+// the device FD that the test server reads from and writes to. The filesystem
+// is created with maxActiveRequests as its active-request limit.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+	vfsObj := new(vfs.VirtualFilesystem)
+	if err := vfsObj.Init(); err != nil {
+		return nil, nil, err
+	}
+
+	// Back the device FD with an anonymous dentry instead of a /dev/fuse node.
+	dev := new(DeviceFD)
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef()
+	fdOpts := vfs.FileDescriptionOptions{}
+	if err := dev.vfsfd.Init(dev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &fdOpts); err != nil {
+		return nil, nil, err
+	}
+
+	fsopts := filesystemOptions{maxActiveRequests: maxActiveRequests}
+	fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &dev.vfsfd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return fs.conn, &dev.vfsfd, nil
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes; the payload is a single uint32.
+func (t *testPayload) SizeBytes() int {
+	return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes; it writes data into dst[:4] using usermem.ByteOrder.
+func (t *testPayload) MarshalBytes(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes; it decodes data from src[:4].
+func (t *testPayload) UnmarshalBytes(src []byte) {
+	t.data = usermem.ByteOrder.Uint32(src[:4])
+}
+
+// Packed implements marshal.Marshallable.Packed; it always reports true.
+func (t *testPayload) Packed() bool {
+	return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe by delegating to MarshalBytes.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+	t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe by delegating to UnmarshalBytes.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+	t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN. It is a stub that always panics.
+func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
+	panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut. It is a stub that always panics.
+func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn. It is a stub that always panics.
+func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo. It is a stub that always panics.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+	panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
new file mode 100644
index 000000000..200a93bbf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -0,0 +1,228 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fuse implements fusefs.
+package fuse
+
+import (
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name; FilesystemType.Name returns it.
+const Name = "fuse"
+
+// FilesystemType implements vfs.FilesystemType for FUSE filesystems.
+type FilesystemType struct{}
+
+// filesystemOptions holds the options a FUSE filesystem is created with.
+type filesystemOptions struct {
+	// userID specifies the numeric uid of the mount owner.
+	// This option should not be specified by the filesystem owner.
+	// It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see man page
+	// for fuse(8)
+	userID uint32
+
+	// groupID specifies the numeric gid of the mount owner.
+	// Like userID, it should not be specified by the filesystem owner;
+	// libfuse sets it (or the filesystem itself, if libfuse is not used).
+	// For more information, see man page for fuse(8)
+	groupID uint32
+
+	// rootMode specifies the file mode of the filesystem's root.
+	rootMode linux.FileMode
+
+	// maxActiveRequests specifies the maximum number of active requests that can
+	// exist at any time. Any further requests will block when trying to
+	// Call the server.
+	maxActiveRequests uint64
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+	devMinor uint32 // anonymous block device minor; handed back to the VFS in Release
+
+	// conn is used for communication between the FUSE server
+	// daemon and the sentry fusefs.
+	conn *connection
+
+	// opts is the options the fusefs is initialized with.
+	opts *filesystemOptions
+}
+
+// Name implements vfs.FilesystemType.Name; it returns the package-level Name constant.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem. The mount data must
+// name an open /dev/fuse FD as "fd=N"; "user_id", "group_id" and "rootmode" are
+// optional, and any other option is rejected with EINVAL.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	var fsopts filesystemOptions
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	deviceDescriptorStr, ok := mopts["fd"]
+	if !ok {
+		log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name())
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "fd")
+
+	deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */)
+	if err != nil {
+		log.Warningf("%s.GetFilesystem: invalid fd: fd=%s", fsType.Name(), deviceDescriptorStr)
+		return nil, nil, syserror.EINVAL
+	}
+
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name())
+		return nil, nil, syserror.EINVAL
+	}
+	fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor))
+	if fuseFd == nil {
+		log.Warningf("%s.GetFilesystem: fd=%d is not an open file descriptor", fsType.Name(), deviceDescriptor)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Parse and set all the other supported FUSE mount options.
+	// TODO(gVisor.dev/issue/3229): Expand the supported mount options.
+	if userIDStr, ok := mopts["user_id"]; ok {
+		delete(mopts, "user_id")
+		userID, err := strconv.ParseUint(userIDStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.userID = uint32(userID)
+	}
+
+	if groupIDStr, ok := mopts["group_id"]; ok {
+		delete(mopts, "group_id")
+		groupID, err := strconv.ParseUint(groupIDStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.groupID = uint32(groupID)
+	}
+
+	fsopts.rootMode = linux.FileMode(0777)
+	if modeStr, ok := mopts["rootmode"]; ok {
+		delete(mopts, "rootmode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.rootMode = linux.FileMode(mode)
+	}
+	fsopts.maxActiveRequests = maxActiveRequestsDefault
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Allocate the minor last so that a mount rejected above cannot leak it.
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+	if err != nil {
+		log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
+		vfsObj.PutAnonBlockDevMinor(devMinor)
+		return nil, nil, err
+	}
+	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+
+	// Queue a FUSE_INIT request for the FUSE daemon before returning; this does not block.
+	if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil {
+		log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err)
+		fs.VFSFilesystem().DecRef() // drop the initial reference so the filesystem (and devMinor) is released
+		return nil, nil, err
+	}
+
+	root := fs.newInode(creds, fsopts.rootMode)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// NewFUSEFilesystem creates a new FUSE filesystem; device must be an open /dev/fuse FD, else EINVAL.
+func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+	if device == nil {
+		return nil, syserror.EINVAL
+	}
+	fuseFD, ok := device.Impl().(*DeviceFD)
+	if !ok {
+		log.Warningf("fuse.NewFUSEFilesystem: device is not a /dev/fuse file description")
+		return nil, syserror.EINVAL
+	}
+
+	conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+	if err != nil {
+		log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
+		return nil, syserror.EINVAL
+	}
+	fs := &filesystem{devMinor: devMinor, opts: opts, conn: conn}
+	fuseFD.fs = fs
+	return fs, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release; it returns devMinor to the VFS, then releases the kernfs filesystem.
+func (fs *filesystem) Release() {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks // passed to the directory FDs created in Open
+
+	dentry kernfs.Dentry // initialized with this inode by newInode, which returns it
+}
+
+// newInode creates a 0755 directory inode and returns its dentry; the mode argument is currently unused.
+func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+	ino := new(inode)
+	ino.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+	ino.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	ino.dentry.Init(ino)
+	return &ino.dentry
+}
+
+// Open implements kernfs.Inode.Open by creating a generic directory FD over the inode's children.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	dirFD, openErr := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if openErr != nil {
+		return nil, openErr
+	}
+	return dirFD.VFSFileDescription(), nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go
new file mode 100644
index 000000000..779c2bd3f
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/init.go
@@ -0,0 +1,166 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// consts used by FUSE_INIT negotiation.
+const (
+	// fuseMaxMaxPages is the maximum value for MaxPages received in InitOut.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMaxMaxPages = 256
+
+	// Maximum value for the time granularity for file time stamps, 1s.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMaxTimeGranNs = 1000000000
+
+	// Minimum value for MaxWrite.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMinMaxWrite = 4096
+
+	// Temporary default value for max readahead, 128kb.
+	fuseDefaultMaxReadahead = 131072
+
+	// The FUSE_INIT_IN flags sent to the daemon.
+	// TODO(gvisor.dev/issue/3199): complete the flags.
+	fuseDefaultInitFlags = linux.FUSE_MAX_PAGES
+)
+
+// Adjustable maximums for Connection's congestion control parameters.
+// Used as the upper bound of the config values.
+// Currently we do not support adjustment to them.
+var (
+	MaxUserBackgroundRequest   uint16 = fuseDefaultMaxBackground
+	MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
+)
+
+// InitSend sends a FUSE_INIT request.
+func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
+	in := linux.FUSEInitIn{
+		Major: linux.FUSE_KERNEL_VERSION,
+		Minor: linux.FUSE_KERNEL_MINOR_VERSION,
+		// TODO(gvisor.dev/issue/3196): find appropriate way to calculate this
+		MaxReadahead: fuseDefaultMaxReadahead,
+		Flags:        fuseDefaultInitFlags,
+	}
+
+	req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in)
+	if err != nil {
+		return err
+	}
+
+	// Since there is no task to block on and FUSE_INIT is the request
+	// to unblock other requests, use nil.
+	return conn.CallAsync(nil, req)
+}
+
+// InitRecv checks a FUSE_INIT reply for an error, unmarshals it, and processes it.
+func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
+	if err := res.Error(); err != nil {
+		return err
+	}
+
+	var out linux.FUSEInitOut
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	return conn.initProcessReply(&out, hasSysAdminCap)
+}
+
+// initProcessReply processes the FUSE_INIT reply from the FUSE server.
+func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+	// No support for old major fuse versions.
+	if out.Major != linux.FUSE_KERNEL_VERSION {
+		conn.connInitError = true
+
+		// Set the connection as initialized and unblock the blocked requests
+		// (i.e. return error for them).
+		conn.SetInitialized()
+
+		return nil
+	}
+
+	// Start processing the reply.
+	conn.connInitSuccess = true
+	conn.minor = out.Minor
+
+	// No support for limits before minor version 13.
+	if out.Minor >= 13 {
+		conn.bgLock.Lock()
+
+		if out.MaxBackground > 0 {
+			conn.maxBackground = out.MaxBackground
+
+			if !hasSysAdminCap &&
+				conn.maxBackground > MaxUserBackgroundRequest {
+				conn.maxBackground = MaxUserBackgroundRequest
+			}
+		}
+
+		if out.CongestionThreshold > 0 {
+			conn.congestionThreshold = out.CongestionThreshold
+
+			if !hasSysAdminCap &&
+				conn.congestionThreshold > MaxUserCongestionThreshold {
+				conn.congestionThreshold = MaxUserCongestionThreshold
+			}
+		}
+
+		conn.bgLock.Unlock()
+	}
+
+	// No support for the following flags before minor version 6.
+	if out.Minor >= 6 {
+		conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0
+		conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
+		conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
+		conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
+		conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
+		conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
+
+		// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
+
+		if out.Flags&linux.FUSE_MAX_PAGES != 0 {
+			maxPages := out.MaxPages
+			if maxPages < 1 {
+				maxPages = 1
+			}
+			if maxPages > fuseMaxMaxPages {
+				maxPages = fuseMaxMaxPages
+			}
+			conn.maxPages = maxPages
+		}
+	}
+
+	// No support for negotiating MaxWrite before minor version 5.
+	if out.Minor >= 5 {
+		conn.maxWrite = out.MaxWrite
+	} else {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+	if conn.maxWrite < fuseMinMaxWrite {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+
+	// Set connection as initialized and unblock the requests
+	// issued before init.
+	conn.SetInitialized()
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go
new file mode 100644
index 000000000..b5b581152
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/register.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register makes the FUSE character device known to vfsObj under the "misc"
+// group, using the misc major number and the FUSE minor number.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	regOpts := &vfs.RegisterDeviceOptions{
+		GroupName: "misc",
+	}
+
+	// RegisterDevice's error (nil on success) is returned as-is.
+	return vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, regOpts)
+}
+
+// CreateDevtmpfsFile creates the "fuse" character device special file in
+// devtmpfs through dev, with the misc major number and the FUSE minor number.
+func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error {
+	const fuseDevName = "fuse"
+
+	// Any failure from CreateDeviceFile is handed back unchanged.
+	return dev.CreateDeviceFile(ctx, fuseDevName, vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */)
+}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 67e916525..4a800dcf9 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -35,6 +35,7 @@ go_library(
         "fstree.go",
         "gofer.go",
         "handle.go",
+        "host_named_pipe.go",
         "p9file.go",
         "regular_file.go",
         "socket.go",
@@ -47,11 +48,13 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fd",
+        "//pkg/fdnotifier",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/host",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
@@ -71,6 +74,7 @@ go_library(
         "//pkg/unet",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index b98218753..8c7c8e1b3 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -85,6 +85,7 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 	d2 := &dentry{
 		refs:      1, // held by d
 		fs:        d.fs,
+		ino:       d.fs.nextSyntheticIno(),
 		mode:      uint32(opts.mode),
 		uid:       uint32(opts.kuid),
 		gid:       uint32(opts.kgid),
@@ -138,6 +139,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		fd.dirents = ds
 	}
 
+	d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
 	if d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
@@ -183,13 +185,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 		{
 			Name:    ".",
 			Type:    linux.DT_DIR,
-			Ino:     d.ino,
+			Ino:     uint64(d.ino),
 			NextOff: 1,
 		},
 		{
 			Name:    "..",
 			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
-			Ino:     parent.ino,
+			Ino:     uint64(parent.ino),
 			NextOff: 2,
 		},
 	}
@@ -225,7 +227,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 				}
 				dirent := vfs.Dirent{
 					Name:    p9d.Name,
-					Ino:     p9d.QID.Path,
+					Ino:     uint64(inoFromPath(p9d.QID.Path)),
 					NextOff: int64(len(dirents) + 1),
 				}
 				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
@@ -258,7 +260,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 			dirents = append(dirents, vfs.Dirent{
 				Name:    child.name,
 				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
-				Ino:     child.ino,
+				Ino:     uint64(child.ino),
 				NextOff: int64(len(dirents) + 1),
 			})
 		}
@@ -299,3 +301,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 		return 0, syserror.EINVAL
 	}
 }
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+	return fd.dentry().handle.sync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 7f2181216..00e3c99cd 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -16,6 +16,7 @@ package gofer
 
 import (
 	"sync"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -118,7 +119,7 @@ func putDentrySlice(ds *[]*dentry) {
 // must be up to date.
 //
 // Postconditions: The returned dentry's cached metadata is up to date.
-func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
@@ -149,11 +150,9 @@ afterSymlink:
 			return nil, err
 		}
 		if d != d.parent && !d.cachedMetadataAuthoritative() {
-			_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
-			if err != nil {
+			if err := d.parent.updateFromGetattr(ctx); err != nil {
 				return nil, err
 			}
-			d.parent.updateFromP9Attrs(attrMask, &attr)
 		}
 		rp.Advance()
 		return d.parent, nil
@@ -168,7 +167,7 @@ afterSymlink:
 	if err := rp.CheckMount(&child.vfsd); err != nil {
 		return nil, err
 	}
-	if child.isSymlink() && rp.ShouldFollowSymlink() {
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
@@ -208,18 +207,28 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 
 // Preconditions: As for getChildLocked. !parent.isSynthetic().
 func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
+	if child != nil {
+		// Need to lock child.metadataMu because we might be updating child
+		// metadata. We need to hold the lock *before* getting metadata from the
+		// server and release it after updating local metadata.
+		child.metadataMu.Lock()
+	}
 	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
 	if err != nil && err != syserror.ENOENT {
+		if child != nil {
+			child.metadataMu.Unlock()
+		}
 		return nil, err
 	}
 	if child != nil {
-		if !file.isNil() && qid.Path == child.ino {
-			// The file at this path hasn't changed. Just update cached
-			// metadata.
+		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+			// The file at this path hasn't changed. Just update cached metadata.
 			file.close(ctx)
-			child.updateFromP9Attrs(attrMask, &attr)
+			child.updateFromP9AttrsLocked(attrMask, &attr)
+			child.metadataMu.Unlock()
 			return child, nil
 		}
+		child.metadataMu.Unlock()
 		if file.isNil() && child.isSynthetic() {
 			// We have a synthetic file, and no remote file has arisen to
 			// replace it.
@@ -275,7 +284,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
-		next, err := fs.stepLocked(ctx, rp, d, ds)
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
 		d.dirMu.Unlock()
 		if err != nil {
 			return nil, err
@@ -301,7 +310,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 	}
 	for !rp.Done() {
 		d.dirMu.Lock()
-		next, err := fs.stepLocked(ctx, rp, d, ds)
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
 		d.dirMu.Unlock()
 		if err != nil {
 			return nil, err
@@ -371,17 +380,33 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		}
 		parent.touchCMtime()
 		parent.dirents = nil
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 		return nil
 	}
 	if fs.opts.interop == InteropModeShared {
-		// The existence of a dentry at name would be inconclusive because the
-		// file it represents may have been deleted from the remote filesystem,
-		// so we would need to make an RPC to revalidate the dentry. Just
-		// attempt the file creation RPC instead. If a file does exist, the RPC
-		// will fail with EEXIST like we would have. If the RPC succeeds, and a
-		// stale dentry exists, the dentry will fail revalidation next time
-		// it's used.
-		return createInRemoteDir(parent, name)
+		if child := parent.children[name]; child != nil && child.isSynthetic() {
+			return syserror.EEXIST
+		}
+		// The existence of a non-synthetic dentry at name would be inconclusive
+		// because the file it represents may have been deleted from the remote
+		// filesystem, so we would need to make an RPC to revalidate the dentry.
+		// Just attempt the file creation RPC instead. If a file does exist, the
+		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
+		// stale dentry exists, the dentry will fail revalidation next time it's
+		// used.
+		if err := createInRemoteDir(parent, name); err != nil {
+			return err
+		}
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+		return nil
 	}
 	if child := parent.children[name]; child != nil {
 		return syserror.EEXIST
@@ -397,6 +422,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	}
 	parent.touchCMtime()
 	parent.dirents = nil
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 	return nil
 }
 
@@ -443,21 +473,61 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	defer mntns.DecRef()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
+
 	child, ok := parent.children[name]
 	if ok && child == nil {
 		return syserror.ENOENT
 	}
-	// We only need a dentry representing the file at name if it can be a mount
-	// point. If child is nil, then it can't be a mount point. If child is
-	// non-nil but stale, the actual file can't be a mount point either; we
-	// detect this case by just speculatively calling PrepareDeleteDentry and
-	// only revalidating the dentry if that fails (indicating that the existing
-	// dentry is a mount point).
+
+	sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
+	if sticky {
+		if !ok {
+			// If the sticky bit is set, we need to retrieve the child to determine
+			// whether removing it is allowed.
+			child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+			if err != nil {
+				return err
+			}
+		} else if child != nil && !child.cachedMetadataAuthoritative() {
+			// Make sure the dentry representing the file at name is up to date
+			// before examining its metadata.
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+			if err != nil {
+				return err
+			}
+		}
+		if err := parent.mayDelete(rp.Credentials(), child); err != nil {
+			return err
+		}
+	}
+
+	// If a child dentry exists, prepare to delete it. This should fail if it is
+	// a mount point. We detect mount points by speculatively calling
+	// PrepareDeleteDentry, which fails if child is a mount point. However, we
+	// may need to revalidate the file in this case to make sure that it has not
+	// been deleted or replaced on the remote fs, in which case the mount point
+	// will have disappeared. If calling PrepareDeleteDentry fails again on the
+	// up-to-date dentry, we can be sure that it is a mount point.
+	//
+	// Also note that if child is nil, then it can't be a mount point.
 	if child != nil {
+		// Hold child.dirMu so we can check child.children and
+		// child.syntheticChildren. We don't access these fields until a bit later,
+		// but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
+		// create an inconsistent lock ordering between dentry.dirMu and
+		// vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
+		// FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
+		// PrepareDeleteDentry and CommitDeleteDentry"). To avoid this, lock
+		// child.dirMu before calling PrepareDeleteDentry.
 		child.dirMu.Lock()
 		defer child.dirMu.Unlock()
 		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
-			if parent.cachedMetadataAuthoritative() {
+			// We can skip revalidation in several cases:
+			// - We are not in InteropModeShared
+			// - The parent directory is synthetic, in which case the child must also
+			//   be synthetic
+			// - We already updated the child during the sticky bit check above
+			if parent.cachedMetadataAuthoritative() || sticky {
 				return err
 			}
 			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
@@ -518,7 +588,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 		if child == nil {
 			return syserror.ENOENT
 		}
-	} else {
+	} else if child == nil || !child.isSynthetic() {
 		err = parent.file.unlinkAt(ctx, name, flags)
 		if err != nil {
 			if child != nil {
@@ -527,6 +597,18 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 			return err
 		}
 	}
+
+	// Generate inotify events for rmdir or unlink.
+	if dir {
+		parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+	} else {
+		var cw *vfs.Watches
+		if child != nil {
+			cw = &child.watches
+		}
+		vfs.InotifyRemoveChild(cw, &parent.watches, name)
+	}
+
 	if child != nil {
 		vfsObj.CommitDeleteDentry(&child.vfsd)
 		child.setDeleted()
@@ -754,25 +836,27 @@ afterTrailingSymlink:
 	}
 	// Determine whether or not we need to create a file.
 	parent.dirMu.Lock()
-	child, err := fs.stepLocked(ctx, rp, parent, &ds)
+	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
 	if err == syserror.ENOENT && mayCreate {
 		if parent.isSynthetic() {
 			parent.dirMu.Unlock()
 			return nil, syserror.EPERM
 		}
-		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
 		parent.dirMu.Unlock()
 		return fd, err
 	}
+	parent.dirMu.Unlock()
 	if err != nil {
-		parent.dirMu.Unlock()
 		return nil, err
 	}
-	// Open existing child or follow symlink.
-	parent.dirMu.Unlock()
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
+	if !child.isDir() && rp.MustBeDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Open existing child or follow symlink.
 	if child.isSymlink() && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
 		if err != nil {
@@ -793,20 +877,32 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
+
+	trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
+	if trunc {
+		// Lock metadataMu *while* we open a regular file with O_TRUNC because
+		// open(2) will change the file size on server.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+	}
+
+	var vfd *vfs.FileDescription
+	var err error
 	mnt := rp.Mount()
 	switch d.fileType() {
 	case linux.S_IFREG:
 		if !d.fs.opts.regularFilesUseSpecialFileFD {
-			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
 				return nil, err
 			}
 			fd := &regularFileFD{}
+			fd.LockFD.Init(&d.locks)
 			if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
 				AllowDirectIO: true,
 			}); err != nil {
 				return nil, err
 			}
-			return &fd.vfsfd, nil
+			vfd = &fd.vfsfd
 		}
 	case linux.S_IFDIR:
 		// Can't open directories with O_CREAT.
@@ -826,6 +922,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 			}
 		}
 		fd := &directoryFD{}
+		fd.LockFD.Init(&d.locks)
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -842,10 +939,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 		}
 	case linux.S_IFIFO:
 		if d.isSynthetic() {
-			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
+			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
+		}
+	}
+
+	if vfd == nil {
+		if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+			return nil, err
+		}
+	}
+
+	if trunc {
+		// If no errors occurred so far then update file size in memory. This
+		// step is required even if !d.cachedMetadataAuthoritative() because
+		// d.mappings has to be updated.
+		// d.metadataMu has already been acquired if trunc == true.
+		d.updateFileSizeLocked(0)
+
+		if d.cachedMetadataAuthoritative() {
+			d.touchCMtimeLocked()
 		}
 	}
-	return d.openSpecialFileLocked(ctx, mnt, opts)
+	return vfd, err
 }
 
 func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
@@ -873,19 +988,37 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
 	if opts.Flags&linux.O_DIRECT != 0 {
 		return nil, syserror.EINVAL
 	}
-	h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
+	// We assume that the server silently inserts O_NONBLOCK in the open flags
+	// for all named pipes (because all existing gofers do this).
+	//
+	// NOTE(b/133875563): This makes named pipe opens racy, because the
+	// mechanisms for translating nonblocking to blocking opens can only detect
+	// the instantaneous presence of a peer holding the other end of the pipe
+	// open, not whether the pipe was *previously* opened by a peer that has
+	// since closed its end.
+	isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+	h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
 	if err != nil {
+		if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+			// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+			// with ENXIO if opening the same named pipe with O_WRONLY would
+			// block because there are no readers of the pipe.
+			if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+				return nil, err
+			}
+			goto retry
+		}
 		return nil, err
 	}
-	seekable := d.fileType() == linux.S_IFREG
-	fd := &specialFileFD{
-		handle:   h,
-		seekable: seekable,
+	if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+		if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
-		DenyPRead:  !seekable,
-		DenyPWrite: !seekable,
-	}); err != nil {
+	fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+	if err != nil {
 		h.close(ctx)
 		return nil, err
 	}
@@ -894,7 +1027,7 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
 // !d.isSynthetic().
-func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
@@ -919,7 +1052,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	// Filter file creation flags and O_LARGEFILE out; the create RPC already
 	// has the semantics of O_CREAT|O_EXCL, while some servers will choke on
 	// O_LARGEFILE.
-	createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE))
+	createFlags := p9.OpenFlags(opts.Flags &^ (vfs.FileCreationFlags | linux.O_LARGEFILE))
 	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 	if err != nil {
 		dirfile.close(ctx)
@@ -947,6 +1080,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		return nil, err
 	}
+	*ds = appendDentry(*ds, child)
 	// Incorporate the fid that was opened by lcreate.
 	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
 	if useRegularFileFD {
@@ -959,10 +1093,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
 		child.handleMu.Unlock()
 	}
-	// Take a reference on the new dentry to be held by the new file
-	// description. (This reference also means that the new dentry is not
-	// eligible for caching yet, so we don't need to append to a dentry slice.)
-	child.refs = 1
 	// Insert the dentry into the tree.
 	d.cacheNewChildLocked(child, name)
 	if d.cachedMetadataAuthoritative() {
@@ -974,6 +1104,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	var childVFSFD *vfs.FileDescription
 	if useRegularFileFD {
 		fd := &regularFileFD{}
+		fd.LockFD.Init(&child.locks)
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
 			AllowDirectIO: true,
 		}); err != nil {
@@ -981,26 +1112,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		childVFSFD = &fd.vfsfd
 	} else {
-		seekable := child.fileType() == linux.S_IFREG
-		fd := &specialFileFD{
-			handle: handle{
-				file: openFile,
-				fd:   -1,
-			},
-			seekable: seekable,
+		h := handle{
+			file: openFile,
+			fd:   -1,
 		}
 		if fdobj != nil {
-			fd.handle.fd = int32(fdobj.Release())
+			h.fd = int32(fdobj.Release())
 		}
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
-			DenyPRead:  !seekable,
-			DenyPWrite: !seekable,
-		}); err != nil {
-			fd.handle.close(ctx)
+		fd, err := newSpecialFileFD(h, mnt, child, &child.locks, opts.Flags) // child's own locks, not parent d's
+		if err != nil {
+			h.close(ctx)
 			return nil, err
 		}
 		childVFSFD = &fd.vfsfd
 	}
+	d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
 	return childVFSFD, nil
 }
 
@@ -1052,7 +1178,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			return err
 		}
 	}
-	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	creds := rp.Credentials()
+	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	vfsObj := rp.VirtualFilesystem()
@@ -1067,12 +1194,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if renamed == nil {
 		return syserror.ENOENT
 	}
+	if err := oldParent.mayDelete(creds, renamed); err != nil {
+		return err
+	}
 	if renamed.isDir() {
 		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
 			return syserror.EINVAL
 		}
 		if oldParent != newParent {
-			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -1083,7 +1213,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	if oldParent != newParent {
-		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
 			return err
 		}
 		newParent.dirMu.Lock()
@@ -1181,10 +1311,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if newParent.cachedMetadataAuthoritative() {
 		newParent.dirents = nil
 		newParent.touchCMtime()
-		if renamed.isDir() {
+		if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
+			// Increase the link count if we did not replace another directory.
 			newParent.incLinks()
 		}
 	}
+	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
 	return nil
 }
 
@@ -1197,12 +1329,21 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
 		return err
 	}
-	return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount())
+	if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -1326,24 +1467,38 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
 		return err
 	}
-	return d.setxattr(ctx, rp.Credentials(), &opts)
+	if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
 		return err
 	}
-	return d.removexattr(ctx, rp.Credentials(), name)
+	if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
@@ -1352,3 +1507,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.renameMu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
+
+func (fs *filesystem) nextSyntheticIno() inodeNumber {
+	return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 6295f6b54..e20de84b5 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -45,6 +45,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -84,12 +85,6 @@ type filesystem struct {
 	// devMinor is the filesystem's minor device number. devMinor is immutable.
 	devMinor uint32
 
-	// uid and gid are the effective KUID and KGID of the filesystem's creator,
-	// and are used as the owner and group for files that don't specify one.
-	// uid and gid are immutable.
-	uid auth.KUID
-	gid auth.KGID
-
 	// renameMu serves two purposes:
 	//
 	// - It synchronizes path resolution with renaming initiated by this
@@ -115,6 +110,26 @@ type filesystem struct {
 	syncMu           sync.Mutex
 	syncableDentries map[*dentry]struct{}
 	specialFileFDs   map[*specialFileFD]struct{}
+
+	// syntheticSeq stores a counter used to generate unique inodeNumbers for
+	// synthetic dentries.
+	syntheticSeq uint64
+}
+
+// inodeNumber represents the inode number reported in Dirent.Ino. For regular
+// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
+// have their inodeNumber generated sequentially, with the MSB reserved to
+// prevent conflicts with regular dentries.
+type inodeNumber uint64
+
+// Reserve MSB for synthetic mounts.
+const syntheticInoMask = uint64(1) << 63
+
+func inoFromPath(path uint64) inodeNumber {
+	if path&syntheticInoMask != 0 {
+		log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
+	}
+	return inodeNumber(path &^ syntheticInoMask)
 }
 
 type filesystemOptions struct {
@@ -122,6 +137,8 @@ type filesystemOptions struct {
 	fd      int
 	aname   string
 	interop InteropMode // derived from the "cache" mount option
+	dfltuid auth.KUID
+	dfltgid auth.KGID
 	msize   uint32
 	version string
 
@@ -230,6 +247,15 @@ type InternalFilesystemOptions struct {
 	OpenSocketsByConnecting bool
 }
 
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+	// uint32(-2) doesn't work in Go.
+	_V9FS_DEFUID = auth.KUID(4294967294)
+	_V9FS_DEFGID = auth.KGID(4294967294)
+)
+
 // Name implements vfs.FilesystemType.Name.
 func (FilesystemType) Name() string {
 	return Name
@@ -315,6 +341,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	// Parse the default UID and GID.
+	fsopts.dfltuid = _V9FS_DEFUID
+	if dfltuidstr, ok := mopts["dfltuid"]; ok {
+		delete(mopts, "dfltuid")
+		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+		// in the caller's user namespace, but goferfs isn't
+		// application-mountable.
+		fsopts.dfltuid = auth.KUID(dfltuid)
+	}
+	fsopts.dfltgid = _V9FS_DEFGID
+	if dfltgidstr, ok := mopts["dfltgid"]; ok {
+		delete(mopts, "dfltgid")
+		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default GID: dfltgid=%s", dfltgidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.dfltgid = auth.KGID(dfltgid)
+	}
+
 	// Parse the 9P message size.
 	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
 	if msizestr, ok := mopts["msize"]; ok {
@@ -422,8 +473,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		client:           client,
 		clock:            ktime.RealtimeClockFromContext(ctx),
 		devMinor:         devMinor,
-		uid:              creds.EffectiveKUID,
-		gid:              creds.EffectiveKGID,
 		syncableDentries: make(map[*dentry]struct{}),
 		specialFileFDs:   make(map[*specialFileFD]struct{}),
 	}
@@ -553,21 +602,27 @@ type dentry struct {
 	// returned by the server. dirents is protected by dirMu.
 	dirents []vfs.Dirent
 
-	// Cached metadata; protected by metadataMu and accessed using atomic
-	// memory operations unless otherwise specified.
+	// Cached metadata; protected by metadataMu.
+	// To access:
+	//   - In situations where consistency is not required (like stat), these
+	//     can be accessed using atomic operations only (without locking).
+	//   - Or lock metadataMu, then access without atomic operations.
+	// To mutate:
+	//   - Lock metadataMu and use atomic operations to update because we might
+	//     have atomic readers that don't hold the lock.
 	metadataMu sync.Mutex
-	ino        uint64 // immutable
-	mode       uint32 // type is immutable, perms are mutable
-	uid        uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid        uint32 // auth.KGID, but ...
-	blockSize  uint32 // 0 if unknown
+	ino        inodeNumber // immutable
+	mode       uint32      // type is immutable, perms are mutable
+	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32      // auth.KGID, but ...
+	blockSize  uint32      // 0 if unknown
 	// Timestamps, all nsecs from the Unix epoch.
 	atime int64
 	mtime int64
 	ctime int64
 	btime int64
 	// File size, protected by both metadataMu and dataMu (i.e. both must be
-	// locked to mutate it).
+	// locked to mutate it; locking either is sufficient to access it).
 	size uint64
 
 	// nlink counts the number of hard links to this dentry. It's updated and
@@ -634,6 +689,11 @@ type dentry struct {
 	// If this dentry represents a synthetic named pipe, pipe is the pipe
 	// endpoint bound to this file.
 	pipe *pipe.VFSPipe
+
+	locks vfs.FileLocks
+
+	// Inotify watches for this dentry.
+	watches vfs.Watches
 }
 
 // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
@@ -670,10 +730,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	d := &dentry{
 		fs:        fs,
 		file:      file,
-		ino:       qid.Path,
+		ino:       inoFromPath(qid.Path),
 		mode:      uint32(attr.Mode),
-		uid:       uint32(fs.uid),
-		gid:       uint32(fs.gid),
+		uid:       uint32(fs.opts.dfltuid),
+		gid:       uint32(fs.opts.dfltgid),
 		blockSize: usermem.PageSize,
 		handle: handle{
 			fd: -1,
@@ -725,8 +785,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool {
 
 // updateFromP9Attrs is called to update d's metadata after an update from the
 // remote filesystem.
-func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
-	d.metadataMu.Lock()
+// Precondition: d.metadataMu must be locked.
+func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
 	if mask.Mode {
 		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
 			d.metadataMu.Unlock()
@@ -760,11 +820,8 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
 	}
 	if mask.Size {
-		d.dataMu.Lock()
-		atomic.StoreUint64(&d.size, attr.Size)
-		d.dataMu.Unlock()
+		d.updateFileSizeLocked(attr.Size)
 	}
-	d.metadataMu.Unlock()
 }
 
 // Preconditions: !d.isSynthetic()
@@ -776,6 +833,10 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
 		file            p9file
 		handleMuRLocked bool
 	)
+	// d.metadataMu must be locked *before* we getAttr so that we do not end up
+	// updating stale attributes in d.updateFromP9AttrsLocked().
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
 	d.handleMu.RLock()
 	if !d.handle.file.isNil() {
 		file = d.handle.file
@@ -791,7 +852,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
 	if err != nil {
 		return err
 	}
-	d.updateFromP9Attrs(attrMask, &attr)
+	d.updateFromP9AttrsLocked(attrMask, &attr)
 	return nil
 }
 
@@ -803,10 +864,18 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
 	stat.Blksize = atomic.LoadUint32(&d.blockSize)
 	stat.Nlink = atomic.LoadUint32(&d.nlink)
+	if stat.Nlink == 0 {
+		// The remote filesystem doesn't support link count; just make
+		// something up. This is consistent with Linux, where
+		// fs/inode.c:inode_init_always() initializes link count to 1, and
+		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
+		// it's not provided by the remote filesystem.
+		stat.Nlink = 1
+	}
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
-	stat.Ino = d.ino
+	stat.Ino = uint64(d.ino)
 	stat.Size = atomic.LoadUint64(&d.size)
 	// This is consistent with regularFileFD.Seek(), which treats regular files
 	// as having no holes.
@@ -819,7 +888,8 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.DevMinor = d.fs.devMinor
 }
 
-func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
+	stat := &opts.Stat
 	if stat.Mask == 0 {
 		return nil
 	}
@@ -827,7 +897,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		return syserror.EPERM
 	}
 	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
-	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
@@ -844,14 +914,14 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 
 		// Prepare for truncate.
 		if stat.Mask&linux.STATX_SIZE != 0 {
-			switch d.mode & linux.S_IFMT {
-			case linux.S_IFREG:
+			switch mode.FileType() {
+			case linux.ModeRegular:
 				if !setLocalMtime {
 					// Truncate updates mtime.
 					setLocalMtime = true
 					stat.Mtime.Nsec = linux.UTIME_NOW
 				}
-			case linux.S_IFDIR:
+			case linux.ModeDirectory:
 				return syserror.EISDIR
 			default:
 				return syserror.EINVAL
@@ -860,8 +930,25 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	}
 	d.metadataMu.Lock()
 	defer d.metadataMu.Unlock()
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		// The size needs to be changed even when
+		// !d.cachedMetadataAuthoritative() because d.mappings has to be
+		// updated.
+		d.updateFileSizeLocked(stat.Size)
+	}
 	if !d.isSynthetic() {
 		if stat.Mask != 0 {
+			if stat.Mask&linux.STATX_SIZE != 0 {
+				// Check whether to allow a truncate request to be made.
+				switch d.mode & linux.S_IFMT {
+				case linux.S_IFREG:
+					// Allow.
+				case linux.S_IFDIR:
+					return syserror.EISDIR
+				default:
+					return syserror.EINVAL
+				}
+			}
 			if err := d.file.setAttr(ctx, p9.SetAttrMask{
 				Permissions:        stat.Mask&linux.STATX_MODE != 0,
 				UID:                stat.Mask&linux.STATX_UID != 0,
@@ -908,6 +995,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		} else {
 			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
 		}
+		// Restore mask bits that we cleared earlier.
+		stat.Mask |= linux.STATX_ATIME
 	}
 	if setLocalMtime {
 		if stat.Mtime.Nsec == linux.UTIME_NOW {
@@ -915,48 +1004,56 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		} else {
 			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
 		}
+		// Restore mask bits that we cleared earlier.
+		stat.Mask |= linux.STATX_MTIME
 	}
 	atomic.StoreInt64(&d.ctime, now)
-	if stat.Mask&linux.STATX_SIZE != 0 {
+	return nil
+}
+
+// Preconditions: d.metadataMu must be locked.
+func (d *dentry) updateFileSizeLocked(newSize uint64) {
+	d.dataMu.Lock()
+	oldSize := d.size
+	atomic.StoreUint64(&d.size, newSize)
+	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+	// below. This allows concurrent calls to Read/Translate/etc. These
+	// functions synchronize with truncation by refusing to use cache
+	// contents beyond the new d.size. (We are still holding d.metadataMu,
+	// so we can't race with Write or another truncate.)
+	d.dataMu.Unlock()
+	if d.size < oldSize {
+		oldpgend, _ := usermem.PageRoundUp(oldSize)
+		newpgend, _ := usermem.PageRoundUp(d.size)
+		if oldpgend != newpgend {
+			d.mapsMu.Lock()
+			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+				// Compare Linux's mm/truncate.c:truncate_setsize() =>
+				// truncate_pagecache() =>
+				// mm/memory.c:unmap_mapping_range(evencows=1).
+				InvalidatePrivate: true,
+			})
+			d.mapsMu.Unlock()
+		}
+		// We are now guaranteed that there are no translations of
+		// truncated pages, and can remove them from the cache. Since
+		// truncated pages have been removed from the remote file, they
+		// should be dropped without being written back.
 		d.dataMu.Lock()
-		oldSize := d.size
-		d.size = stat.Size
-		// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
-		// below. This allows concurrent calls to Read/Translate/etc. These
-		// functions synchronize with truncation by refusing to use cache
-		// contents beyond the new d.size. (We are still holding d.metadataMu,
-		// so we can't race with Write or another truncate.)
+		d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+		d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
 		d.dataMu.Unlock()
-		if d.size < oldSize {
-			oldpgend, _ := usermem.PageRoundUp(oldSize)
-			newpgend, _ := usermem.PageRoundUp(d.size)
-			if oldpgend != newpgend {
-				d.mapsMu.Lock()
-				d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
-					// Compare Linux's mm/truncate.c:truncate_setsize() =>
-					// truncate_pagecache() =>
-					// mm/memory.c:unmap_mapping_range(evencows=1).
-					InvalidatePrivate: true,
-				})
-				d.mapsMu.Unlock()
-			}
-			// We are now guaranteed that there are no translations of
-			// truncated pages, and can remove them from the cache. Since
-			// truncated pages have been removed from the remote file, they
-			// should be dropped without being written back.
-			d.dataMu.Lock()
-			d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
-			d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
-			d.dataMu.Unlock()
-		}
 	}
-	return nil
 }
 
 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
+	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
+}
+
 func dentryUIDFromP9UID(uid p9.UID) uint32 {
 	if !uid.Ok() {
 		return uint32(auth.OverflowUID)
@@ -1011,6 +1108,37 @@ func (d *dentry) decRefLocked() {
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {
+	if d.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	d.fs.renameMu.RLock()
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted())
+	}
+	d.watches.Notify("", events, cookie, et, d.isDeleted())
+	d.fs.renameMu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// If no watches are left on this dentry and it has no references, cache it.
+func (d *dentry) OnZeroWatches() {
+	if atomic.LoadInt64(&d.refs) == 0 {
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked()
+		d.fs.renameMu.Unlock()
+	}
+}
+
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
@@ -1042,6 +1170,9 @@ func (d *dentry) checkCachingLocked() {
 	// Deleted and invalidated dentries with zero references are no longer
 	// reachable by path resolution and should be dropped immediately.
 	if d.vfsd.IsDead() {
+		if d.isDeleted() {
+			d.watches.HandleDeletion()
+		}
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -1050,6 +1181,14 @@ func (d *dentry) checkCachingLocked() {
 		d.destroyLocked()
 		return
 	}
+	// If d still has inotify watches and it is not deleted or invalidated, we
+	// cannot cache it and allow it to be evicted. Otherwise, we will lose its
+	// watches, even if a new dentry is created for the same file in the future.
+	// Note that the size of d.watches cannot concurrently transition from zero
+	// to non-zero, because adding a watch requires holding a reference on d.
+	if d.watches.Size() > 0 {
+		return
+	}
 	// If d is already cached, just move it to the front of the LRU.
 	if d.cached {
 		d.fs.cachedDentries.Remove(d)
@@ -1155,7 +1294,7 @@ func (d *dentry) setDeleted() {
 // We only support xattrs prefixed with "user." (see b/148380782). Currently,
 // there is no need to expose any other xattrs through a gofer.
 func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
-	if d.file.isNil() {
+	if d.file.isNil() || !d.userXattrSupported() {
 		return nil, nil
 	}
 	xattrMap, err := d.file.listXattr(ctx, size)
@@ -1181,6 +1320,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
 		return "", syserror.EOPNOTSUPP
 	}
+	if !d.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
 	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
@@ -1194,6 +1336,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
 		return syserror.EOPNOTSUPP
 	}
+	if !d.userXattrSupported() {
+		return syserror.EPERM
+	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
@@ -1207,10 +1352,20 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
 		return syserror.EOPNOTSUPP
 	}
+	if !d.userXattrSupported() {
+		return syserror.EPERM
+	}
 	return d.file.removeXattr(ctx, name)
 }
 
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory().
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (d *dentry) userXattrSupported() bool {
+	filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
+	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
+}
+
+// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
@@ -1302,23 +1457,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 }
 
 // incLinks increments link count.
-//
-// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
 func (d *dentry) incLinks() {
-	v := atomic.AddUint32(&d.nlink, 1)
-	if v < 2 {
-		panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+	if atomic.LoadUint32(&d.nlink) == 0 {
+		// The remote filesystem doesn't support link count.
+		return
 	}
+	atomic.AddUint32(&d.nlink, 1)
 }
 
 // decLinks decrements link count.
-//
-// Preconditions: d.nlink > 1.
 func (d *dentry) decLinks() {
-	v := atomic.AddUint32(&d.nlink, ^uint32(0))
-	if v == 0 {
-		panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+	if atomic.LoadUint32(&d.nlink) == 0 {
+		// The remote filesystem doesn't support link count.
+		return
 	}
+	atomic.AddUint32(&d.nlink, ^uint32(0))
 }
 
 // fileDescription is embedded by gofer implementations of
@@ -1326,6 +1479,9 @@ func (d *dentry) decLinks() {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	lockLogging sync.Once
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -1354,7 +1510,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount())
+	if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
+		return err
+	}
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -1369,10 +1531,41 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+	d := fd.dentry()
+	if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
+	d := fd.dentry()
+	if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("File lock using gofer file handled internally.")
+	})
+	return fd.LockFD.LockBSD(ctx, uid, t, block)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("Range lock using gofer file handled internally.")
+	})
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 724a3f1f7..8792ca4f2 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -126,11 +126,16 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
 }
 
 func (h *handle) sync(ctx context.Context) error {
+	// Handle most common case first.
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
 		err := syscall.Fsync(int(h.fd))
 		ctx.UninterruptibleSleepFinish(false)
 		return err
 	}
+	if h.file.isNil() {
+		// File hasn't been touched, there is nothing to sync.
+		return nil
+	}
 	return h.file.fsync(ctx)
 }
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+	tempPipeMu      sync.Mutex
+	tempPipeReadFD  int
+	tempPipeWriteFD int
+	tempPipeBuf     [1]byte
+)
+
+func init() {
+	var pipeFDs [2]int
+	if err := unix.Pipe(pipeFDs[:]); err != nil {
+		panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+	}
+	tempPipeReadFD = pipeFDs[0]
+	tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+	for {
+		ok, err := nonblockingPipeHasWriter(fd)
+		if err != nil {
+			return err
+		}
+		if ok {
+			return nil
+		}
+		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+			return err
+		}
+	}
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+	tempPipeMu.Lock()
+	defer tempPipeMu.Unlock()
+	// Copy 1 byte from fd into the temporary pipe.
+	n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+	if err == syserror.EAGAIN {
+		// The pipe represented by fd is empty, but has a writer.
+		return true, nil
+	}
+	if err != nil {
+		return false, err
+	}
+	if n == 0 {
+		// The pipe represented by fd is empty and has no writer.
+		return false, nil
+	}
+	// The pipe represented by fd is non-empty, so it either has, or has
+	// previously had, a writer. Remove the byte copied to the temporary pipe
+	// before returning.
+	if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+		panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+	}
+	return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+	t := time.NewTimer(100 * time.Millisecond)
+	defer t.Stop()
+	cancel := ctx.SleepStart()
+	select {
+	case <-t.C:
+		ctx.SleepFinish(true)
+		return nil
+	case <-cancel:
+		ctx.SleepFinish(false)
+		return syserror.ErrInterrupted
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 0d10cf7ac..09f142cfc 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -24,11 +24,11 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -67,12 +67,46 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 	return d.handle.file.flush(ctx)
 }
 
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+
+	d := fd.dentry()
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+
+	size := offset + length
+
+	// Allocating a smaller size is a noop.
+	if size <= d.size {
+		return nil
+	}
+
+	d.handleMu.Lock()
+	defer d.handleMu.Unlock()
+
+	err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+	if err != nil {
+		return err
+	}
+	d.dataMu.Lock()
+	atomic.StoreUint64(&d.size, size)
+	d.dataMu.Unlock()
+	if !d.cachedMetadataAuthoritative() {
+		d.touchCMtimeLocked()
+	}
+	return nil
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
@@ -120,21 +154,53 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
 	if offset < 0 {
-		return 0, syserror.EINVAL
+		return 0, offset, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
-		return 0, syserror.EOPNOTSUPP
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	// If the fd was opened with O_APPEND, make sure the file size is updated.
+	// There is a possible race here if size is modified externally after
+	// metadata cache is updated.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return 0, offset, err
+		}
+	}
+
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	// Set offset to file size if the fd was opened with O_APPEND.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Holding d.metadataMu is sufficient for reading d.size.
+		offset = int64(d.size)
 	}
 	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
 	if err != nil {
-		return 0, err
+		return 0, offset, err
 	}
 	src = src.TakeFirst64(limit)
+	n, err := fd.pwriteLocked(ctx, src, offset, opts)
+	return n, offset + n, err
+}
 
+// Preconditions: fd.dentry().metadataMu must be locked.
+func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	d := fd.dentry()
-	d.metadataMu.Lock()
-	defer d.metadataMu.Unlock()
 	if d.fs.opts.interop != InteropModeShared {
 		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
 		// file_update_time(). This is d.touchCMtime(), but without locking
@@ -154,12 +220,12 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 			return 0, syserror.EINVAL
 		}
 		mr := memmap.MappableRange{pgstart, pgend}
-		var freed []platform.FileRange
+		var freed []memmap.FileRange
 		d.dataMu.Lock()
 		cseg := d.cache.LowerBoundSegment(mr.Start)
 		for cseg.Ok() && cseg.Start() < mr.End {
 			cseg = d.cache.Isolate(cseg, mr)
-			freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+			freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
 			cseg = d.cache.Remove(cseg).NextSegment()
 		}
 		d.dataMu.Unlock()
@@ -197,8 +263,8 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	fd.mu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
 	fd.mu.Unlock()
 	return n, err
 }
@@ -489,15 +555,24 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
 func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
+	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+	if err != nil {
+		return 0, err
+	}
+	fd.off = newOffset
+	return newOffset, nil
+}
+
+// regularFileSeekLocked calculates the new offset for a seek operation on a regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
 	switch whence {
 	case linux.SEEK_SET:
 		// Use offset as specified.
 	case linux.SEEK_CUR:
-		offset += fd.off
+		offset += fdOffset
 	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
 		// Ensure file size is up to date.
-		d := fd.dentry()
-		if fd.filesystem().opts.interop == InteropModeShared {
+		if !d.cachedMetadataAuthoritative() {
 			if err := d.updateFromGetattr(ctx); err != nil {
 				return 0, err
 			}
@@ -525,7 +600,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
-	fd.off = offset
 	return offset, nil
 }
 
@@ -536,20 +610,19 @@ func (fd *regularFileFD) Sync(ctx context.Context) error {
 
 func (d *dentry) syncSharedHandle(ctx context.Context) error {
 	d.handleMu.RLock()
-	if !d.handleWritable {
-		d.handleMu.RUnlock()
-		return nil
-	}
-	d.dataMu.Lock()
-	// Write dirty cached data to the remote file.
-	err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
-	d.dataMu.Unlock()
-	if err == nil {
-		// Sync the remote file.
-		err = d.handle.sync(ctx)
+	defer d.handleMu.RUnlock()
+
+	if d.handleWritable {
+		d.dataMu.Lock()
+		// Write dirty cached data to the remote file.
+		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+		d.dataMu.Unlock()
+		if err != nil {
+			return err
+		}
 	}
-	d.handleMu.RUnlock()
-	return err
+	// Sync the remote file.
+	return d.handle.sync(ctx)
 }
 
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -747,7 +820,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
 
 // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
 func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
-	// Whether we have a host fd (and consequently what platform.File is
+	// Whether we have a host fd (and consequently what memmap.File is
 	// mapped) can change across save/restore, so invalidate all translations
 	// unconditionally.
 	d.mapsMu.Lock()
@@ -795,8 +868,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 	}
 }
 
-// dentryPlatformFile implements platform.File. It exists solely because dentry
-// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+// dentryPlatformFile implements memmap.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
 //
 // dentryPlatformFile is only used when a host FD representing the remote file
 // is available (i.e. dentry.handle.fd >= 0), and that FD is used for
@@ -804,7 +877,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 type dentryPlatformFile struct {
 	*dentry
 
-	// fdRefs counts references on platform.File offsets. fdRefs is protected
+	// fdRefs counts references on memmap.File offsets. fdRefs is protected
 	// by dentry.dataMu.
 	fdRefs fsutil.FrameRefSet
 
@@ -816,29 +889,29 @@ type dentryPlatformFile struct {
 	hostFileMapperInitOnce sync.Once
 }
 
-// IncRef implements platform.File.IncRef.
-func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
 	d.dataMu.Lock()
 	d.fdRefs.IncRefAndAccount(fr)
 	d.dataMu.Unlock()
 }
 
-// DecRef implements platform.File.DecRef.
-func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
 	d.dataMu.Lock()
 	d.fdRefs.DecRefAndAccount(fr)
 	d.dataMu.Unlock()
 }
 
-// MapInternal implements platform.File.MapInternal.
-func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	d.handleMu.RLock()
 	bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
 	d.handleMu.RUnlock()
 	return bs, err
 }
 
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
 func (d *dentryPlatformFile) FD() int {
 	d.handleMu.RLock()
 	fd := d.handle.fd
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index a464e6a94..811528982 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -16,20 +16,22 @@ package gofer
 
 import (
 	"sync"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// specialFileFD implements vfs.FileDescriptionImpl for files other than
-// regular files, directories, and symlinks: pipes, sockets, etc. It is also
-// used for regular files when filesystemOptions.specialRegularFiles is in
-// effect. specialFileFD differs from regularFileFD by using per-FD handles
-// instead of shared per-dentry handles, and never buffering I/O.
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
+// in effect) regular files. specialFileFD differs from regularFileFD by using
+// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
 type specialFileFD struct {
 	fileDescription
 
@@ -40,13 +42,48 @@ type specialFileFD struct {
 	// file offset is significant, i.e. a regular file. seekable is immutable.
 	seekable bool
 
+	// haveQueue is true if this file description represents a file for which
+	// queue may send I/O readiness events. haveQueue is immutable.
+	haveQueue bool
+	queue     waiter.Queue
+
 	// If seekable is true, off is the file offset. off is protected by mu.
 	mu  sync.Mutex
 	off int64
 }
 
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
+	ftype := d.fileType()
+	seekable := ftype == linux.S_IFREG
+	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
+	fd := &specialFileFD{
+		handle:    h,
+		seekable:  seekable,
+		haveQueue: haveQueue,
+	}
+	fd.LockFD.Init(locks)
+	if haveQueue {
+		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+			return nil, err
+		}
+	}
+	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:  !seekable,
+		DenyPWrite: !seekable,
+	}); err != nil {
+		if haveQueue {
+			fdnotifier.RemoveFD(h.fd)
+		}
+		return nil, err
+	}
+	return fd, nil
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *specialFileFD) Release() {
+	if fd.haveQueue {
+		fdnotifier.RemoveFD(fd.handle.fd)
+	}
 	fd.handle.close(context.Background())
 	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 	fs.syncMu.Lock()
@@ -62,12 +99,44 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
 	return fd.handle.file.flush(ctx)
 }
 
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if fd.haveQueue {
+		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+	}
+	return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	if fd.haveQueue {
+		fd.queue.EventRegister(e, mask)
+		fdnotifier.UpdateFD(fd.handle.fd)
+		return
+	}
+	fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+	if fd.haveQueue {
+		fd.queue.EventUnregister(e)
+		fdnotifier.UpdateFD(fd.handle.fd)
+		return
+	}
+	fd.fileDescription.EventUnregister(e)
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if fd.seekable && offset < 0 {
 		return 0, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
@@ -76,11 +145,14 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
 	// hold here since specialFileFD doesn't client-cache data. Just buffer the
 	// read instead.
-	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+	if d := fd.dentry(); d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
 	if n == 0 {
 		return 0, err
 	}
@@ -105,32 +177,76 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
 	if fd.seekable && offset < 0 {
-		return 0, syserror.EINVAL
+		return 0, offset, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
-		return 0, syserror.EOPNOTSUPP
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	// If the regular file fd was opened with O_APPEND, make sure the file size
+	// is updated. There is a possible race here if size is modified externally
+	// after metadata cache is updated.
+	if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return 0, offset, err
+		}
 	}
 
 	if fd.seekable {
+		// We need to hold the metadataMu *while* writing to a regular file.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+
+		// Set offset to file size if the regular file was opened with O_APPEND.
+		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+			// Holding d.metadataMu is sufficient for reading d.size.
+			offset = int64(d.size)
+		}
 		limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
 		if err != nil {
-			return 0, err
+			return 0, offset, err
 		}
 		src = src.TakeFirst64(limit)
 	}
 
 	// Do a buffered write. See rationale in PRead.
-	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+	if d.cachedMetadataAuthoritative() {
 		d.touchCMtime()
 	}
 	buf := make([]byte, src.NumBytes())
 	// Don't do partial writes if we get a partial read from src.
 	if _, err := src.CopyIn(ctx, buf); err != nil {
-		return 0, err
+		return 0, offset, err
 	}
 	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
-	return int64(n), err
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
+	finalOff = offset
+	// Update file size for regular files.
+	if fd.seekable {
+		finalOff += int64(n)
+		// d.metadataMu is already locked at this point.
+		if uint64(finalOff) > d.size {
+			d.dataMu.Lock()
+			defer d.dataMu.Unlock()
+			atomic.StoreUint64(&d.size, uint64(finalOff))
+		}
+	}
+	return int64(n), finalOff, err
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
@@ -140,8 +256,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 	}
 
 	fd.mu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
 	fd.mu.Unlock()
 	return n, err
 }
@@ -153,27 +269,15 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	}
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
-	switch whence {
-	case linux.SEEK_SET:
-		// Use offset as given.
-	case linux.SEEK_CUR:
-		offset += fd.off
-	default:
-		// SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
-		// clear that file size is even meaningful for these files.
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
+	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+	if err != nil {
+		return 0, err
 	}
-	fd.off = offset
-	return offset, nil
+	fd.off = newOffset
+	return newOffset, nil
 }
 
 // Sync implements vfs.FileDescriptionImpl.Sync.
 func (fd *specialFileFD) Sync(ctx context.Context) error {
-	if !fd.vfsfd.IsWritable() {
-		return nil
-	}
-	return fd.handle.sync(ctx)
+	return fd.dentry().syncSharedHandle(ctx)
 }
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2608e7e1d..0eef4e16e 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -36,8 +36,11 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
 	}
 }
 
-// Preconditions: fs.interop != InteropModeShared.
+// Preconditions: d.cachedMetadataAuthoritative() == true.
 func (d *dentry) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
@@ -48,8 +51,8 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	mnt.EndWrite()
 }
 
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -57,8 +60,8 @@ func (d *dentry) touchCtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCMtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -67,6 +70,8 @@ func (d *dentry) touchCMtime() {
 	d.metadataMu.Unlock()
 }
 
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// locked d.metadataMu.
 func (d *dentry) touchCMtimeLocked() {
 	now := d.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&d.mtime, now)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index ca0fe6d2b..bd701bbc7 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -22,17 +22,18 @@ go_library(
         "//pkg/context",
         "//pkg/fdnotifier",
         "//pkg/fspath",
+        "//pkg/iovec",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/platform",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 18b127521..c894f2ca0 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostfd"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -90,7 +91,9 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 		isTTY:      opts.IsTTY,
 		wouldBlock: wouldBlock(uint32(fileType)),
 		seekable:   seekable,
-		canMap:     canMap(uint32(fileType)),
+		// NOTE(b/38213152): Technically, some obscure char devices can be memory
+		// mapped, but we only allow regular files.
+		canMap: fileType == linux.S_IFREG,
 	}
 	i.pf.inode = i
 
@@ -182,6 +185,8 @@ type inode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks vfs.FileLocks
+
 	// When the reference count reaches zero, the host fd is closed.
 	refs.AtomicRefCount
 
@@ -254,7 +259,7 @@ func (i *inode) Mode() linux.FileMode {
 }
 
 // Stat implements kernfs.Inode.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	if opts.Mask&linux.STATX__RESERVED != 0 {
 		return linux.Statx{}, syserror.EINVAL
 	}
@@ -368,7 +373,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
 
 // SetStat implements kernfs.Inode.
 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
-	s := opts.Stat
+	s := &opts.Stat
 
 	m := s.Mask
 	if m == 0 {
@@ -381,7 +386,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
 		return err
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
 		return err
 	}
 
@@ -391,6 +396,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		}
 	}
 	if m&linux.STATX_SIZE != 0 {
+		if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
+			return syserror.EINVAL
+		}
 		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
 			return err
 		}
@@ -454,10 +462,12 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 	fileType := s.Mode & linux.FileTypeMask
 
 	// Constrain flags to a subset we can handle.
-	// TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
-	flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+	//
+	// TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
+	flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
 
-	if fileType == syscall.S_IFSOCK {
+	switch fileType {
+	case syscall.S_IFSOCK:
 		if i.isTTY {
 			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
 			return nil, syserror.ENOTTY
@@ -468,35 +478,41 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 			return nil, err
 		}
 		// Currently, we only allow Unix sockets to be imported.
-		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d)
-	}
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
 
-	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
-	// we don't allow importing arbitrary file types without proper support.
-	if i.isTTY {
-		fd := &TTYFileDescription{
-			fileDescription: fileDescription{inode: i},
-			termios:         linux.DefaultSlaveTermios,
+	case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
+		if i.isTTY {
+			fd := &TTYFileDescription{
+				fileDescription: fileDescription{inode: i},
+				termios:         linux.DefaultSlaveTermios,
+			}
+			fd.LockFD.Init(&i.locks)
+			vfsfd := &fd.vfsfd
+			if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+				return nil, err
+			}
+			return vfsfd, nil
 		}
+
+		fd := &fileDescription{inode: i}
+		fd.LockFD.Init(&i.locks)
 		vfsfd := &fd.vfsfd
 		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return vfsfd, nil
-	}
 
-	fd := &fileDescription{inode: i}
-	vfsfd := &fd.vfsfd
-	if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
-		return nil, err
+	default:
+		log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
+		return nil, syserror.EPERM
 	}
-	return vfsfd, nil
 }
 
 // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
 	// cached to reduce indirections and casting. fileDescription does not hold
@@ -521,8 +537,8 @@ func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 }
 
 // Stat implements vfs.FileDescriptionImpl.
-func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts)
+func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
 }
 
 // Release implements vfs.FileDescriptionImpl.
@@ -530,6 +546,16 @@ func (f *fileDescription) Release() {
 	// noop
 }
 
+// Allocate implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	if !f.inode.seekable {
+		return syserror.ESPIPE
+	}
+
+	// TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds.
+	return syserror.EOPNOTSUPP
+}
+
 // PRead implements FileDescriptionImpl.
 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	i := f.inode
@@ -556,7 +582,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 		}
 		return n, err
 	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+
 	f.offsetMu.Lock()
 	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
 	f.offset += n
@@ -565,8 +591,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 }
 
 func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
-	if flags != 0 {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if flags&^linux.RWF_HIPRI != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
@@ -577,41 +605,58 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
 
 // PWrite implements FileDescriptionImpl.
 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	i := f.inode
-	if !i.seekable {
+	if !f.inode.seekable {
 		return 0, syserror.ESPIPE
 	}
 
-	return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+	return f.writeToHostFD(ctx, src, offset, opts.Flags)
 }
 
 // Write implements FileDescriptionImpl.
 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	i := f.inode
 	if !i.seekable {
-		n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+		n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
 		if isBlockError(err) {
 			err = syserror.ErrWouldBlock
 		}
 		return n, err
 	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+
 	f.offsetMu.Lock()
-	n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags)
+	// NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if
+	// another process modifies the host file between retrieving the file size
+	// and writing to the host fd. This is an unavoidable race condition because
+	// we cannot enforce synchronization on the host.
+	if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			f.offsetMu.Unlock()
+			return 0, err
+		}
+		f.offset = s.Size
+	}
+	n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
 	f.offset += n
 	f.offsetMu.Unlock()
 	return n, err
 }
 
-func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	hostFD := f.inode.hostFD
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
 	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
 	n, err := src.CopyInTo(ctx, writer)
 	hostfd.PutReadWriterAt(writer)
+	// NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
+	if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		if syncErr := unix.Fsync(hostFD); syncErr != nil {
+			return int64(n), syncErr
+		}
+	}
 	return int64(n), err
 }
 
@@ -682,7 +727,7 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
 
 // Sync implements FileDescriptionImpl.
 func (f *fileDescription) Sync(context.Context) error {
-	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
 	return unix.Fsync(f.inode.hostFD)
 }
 
@@ -712,3 +757,13 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) {
 func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 8545a82f0..65d3af38c 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -19,13 +19,12 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// inodePlatformFile implements platform.File. It exists solely because inode
-// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
+// inodePlatformFile implements memmap.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
 //
 // inodePlatformFile should only be used if inode.canMap is true.
 type inodePlatformFile struct {
@@ -34,7 +33,7 @@ type inodePlatformFile struct {
 	// fdRefsMu protects fdRefs.
 	fdRefsMu sync.Mutex
 
-	// fdRefs counts references on platform.File offsets. It is used solely for
+	// fdRefs counts references on memmap.File offsets. It is used solely for
 	// memory accounting.
 	fdRefs fsutil.FrameRefSet
 
@@ -45,32 +44,32 @@ type inodePlatformFile struct {
 	fileMapperInitOnce sync.Once
 }
 
-// IncRef implements platform.File.IncRef.
+// IncRef implements memmap.File.IncRef.
 //
 // Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
+func (i *inodePlatformFile) IncRef(fr memmap.FileRange) {
 	i.fdRefsMu.Lock()
 	i.fdRefs.IncRefAndAccount(fr)
 	i.fdRefsMu.Unlock()
 }
 
-// DecRef implements platform.File.DecRef.
+// DecRef implements memmap.File.DecRef.
 //
 // Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
+func (i *inodePlatformFile) DecRef(fr memmap.FileRange) {
 	i.fdRefsMu.Lock()
 	i.fdRefs.DecRefAndAccount(fr)
 	i.fdRefsMu.Unlock()
 }
 
-// MapInternal implements platform.File.MapInternal.
+// MapInternal implements memmap.File.MapInternal.
 //
 // Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
 }
 
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
 func (i *inodePlatformFile) FD() int {
 	return i.hostFD
 }
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 38f1fbfba..fd16bd92d 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -47,11 +47,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
 	return ep, nil
 }
 
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
 // ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
 // transport.Receiver. It is backed by a host fd that was imported at sentry
 // startup. This fd is shared with a hostfs inode, which retains ownership of
@@ -114,10 +109,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
 	if err != nil {
 		return syserr.FromError(err)
 	}
-	if sndbuf > maxSendBufferSize {
-		log.Warningf("Socket send buffer too large: %d", sndbuf)
-		return syserr.ErrInvalidEndpointState
-	}
 
 	c.stype = linux.SockType(stype)
 	c.sndbuf = int64(sndbuf)
diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go
index 584c247d2..fc0d5fd38 100644
--- a/pkg/sentry/fsimpl/host/socket_iovec.go
+++ b/pkg/sentry/fsimpl/host/socket_iovec.go
@@ -17,13 +17,10 @@ package host
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/iovec"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
 // copyToMulti copies as many bytes from src to dst as possible.
 func copyToMulti(dst [][]byte, src []byte) {
 	for _, d := range dst {
@@ -74,7 +71,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
 		}
 	}
 
-	if iovsRequired > maxIovs {
+	if iovsRequired > iovec.MaxIovs {
 		// The kernel will reject our call if we pass this many iovs.
 		// Use a single intermediate buffer instead.
 		b := make([]byte, stopLen)
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 68af6e5af..4ee9270cc 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -325,9 +326,9 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		// No task? Linux does not have an analog for this case, but
-		// tty_check_change is more of a blacklist of cases than a
-		// whitelist, and is surprisingly permissive. Allowing the
-		// change seems most appropriate.
+		// tty_check_change only blocks specific cases and is
+		// surprisingly permissive. Allowing the change seems
+		// appropriate.
 		return nil
 	}
 
@@ -377,3 +378,13 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
 	return kernel.ERESTARTSYS
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 2bc757b1a..412bdb2eb 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -49,16 +49,6 @@ func wouldBlock(fileType uint32) bool {
 	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
 }
 
-// canMap returns true if a file with fileType is allowed to be memory mapped.
-// This is ported over from VFS1, but it's probably not the best way for us
-// to check if a file can be memory mapped.
-func canMap(fileType uint32) bool {
-	// TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
-	//
-	// TODO(b/38213152): Some obscure character devices can be mapped.
-	return fileType == syscall.S_IFREG
-}
-
 // isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
 // If so, they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index ef34cb28a..3835557fe 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -45,6 +45,7 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
@@ -69,6 +70,6 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/usermem",
-        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1568a9d49..c6c4472e7 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -38,7 +39,8 @@ type DynamicBytesFile struct {
 	InodeNotDirectory
 	InodeNotSymlink
 
-	data vfs.DynamicBytesSource
+	locks vfs.FileLocks
+	data  vfs.DynamicBytesSource
 }
 
 var _ Inode = (*DynamicBytesFile)(nil)
@@ -55,7 +57,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
 // Open implements Inode.Open.
 func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -77,13 +79,15 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent
 type DynamicBytesFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DynamicBytesFileDescriptionImpl
+	vfs.LockFD
 
 	vfsfd vfs.FileDescription
 	inode Inode
 }
 
 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+	fd.LockFD.Init(locks)
 	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
@@ -97,12 +101,12 @@ func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32)
 	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
 }
 
-// Read implmenets vfs.FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
 }
 
-// PRead implmenets vfs.FileDescriptionImpl.PRead.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
 }
@@ -123,7 +127,7 @@ func (fd *DynamicBytesFD) Release() {}
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(fs, opts)
+	return fd.inode.Stat(ctx, fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -131,3 +135,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
 	// DynamicBytesFiles are immutable.
 	return syserror.EPERM
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 8284e76a7..1d37ccb98 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -42,6 +43,7 @@ import (
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.LockFD
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
@@ -55,9 +57,9 @@ type GenericDirectoryFD struct {
 
 // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
 // dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
 	fd := &GenericDirectoryFD{}
-	if err := fd.Init(children, opts); err != nil {
+	if err := fd.Init(children, locks, opts); err != nil {
 		return nil, err
 	}
 	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
@@ -69,11 +71,12 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
 // Init initializes a GenericDirectoryFD. Use it when overriding
 // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
 // correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
+	fd.LockFD.Init(locks)
 	fd.children = children
 	return nil
 }
@@ -109,7 +112,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence
 	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
 }
 
-// Release implements vfs.FileDecriptionImpl.Release.
+// Release implements vfs.FileDescriptionImpl.Release.
 func (fd *GenericDirectoryFD) Release() {}
 
 func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
@@ -120,7 +123,7 @@ func (fd *GenericDirectoryFD) inode() Inode {
 	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
 }
 
-// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
 // o.mu when calling cb.
 func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
 	fd.mu.Lock()
@@ -129,7 +132,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	opts := vfs.StatOptions{Mask: linux.STATX_INO}
 	// Handle ".".
 	if fd.off == 0 {
-		stat, err := fd.inode().Stat(fd.filesystem(), opts)
+		stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
@@ -149,7 +152,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	if fd.off == 1 {
 		vfsd := fd.vfsfd.VirtualDentry().Dentry()
 		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
-		stat, err := parentInode.Stat(fd.filesystem(), opts)
+		stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
@@ -173,7 +176,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
 		inode := it.Dentry.Impl().(*Dentry).inode
-		stat, err := inode.Stat(fd.filesystem(), opts)
+		stat, err := inode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
@@ -195,7 +198,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	return err
 }
 
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
@@ -223,7 +226,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
 	inode := fd.inode()
-	return inode.Stat(fs, opts)
+	return inode.Stat(ctx, fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -232,3 +235,18 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
 	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
 }
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4a12ae245..61a36cff9 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -35,7 +35,7 @@ import (
 // Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
 //
 // Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
 	d := vfsd.Impl().(*Dentry)
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -81,7 +81,7 @@ afterSymlink:
 		return nil, err
 	}
 	// Resolve any symlink at current path component.
-	if rp.ShouldFollowSymlink() && next.isSymlink() {
+	if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
 		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
@@ -152,7 +152,7 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 	vfsd := rp.Start()
 	for !rp.Done() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -178,7 +178,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 	vfsd := rp.Start()
 	for !rp.Final() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -449,7 +449,7 @@ afterTrailingSymlink:
 		return nil, syserror.ENAMETOOLONG
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD)
+	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
 	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
 		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
@@ -684,7 +684,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(fs.VFSFilesystem(), opts)
+	return inode.Stat(ctx, fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 982daa2e6..579e627f0 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -243,7 +243,7 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
+func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
 	stat.DevMajor = a.devMajor
@@ -267,7 +267,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
 
@@ -293,6 +293,8 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	// inode numbers are immutable after node creation.
 
 	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+	// Also, STATX_SIZE will need some special handling, because read-only static
+	// files should return EIO for truncate operations.
 
 	return nil
 }
@@ -469,6 +471,8 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
 	if err := o.checkExistingLocked(name, child); err != nil {
 		return err
 	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
 	o.removeLocked(name)
 	return nil
 }
@@ -516,6 +520,8 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
 	if err := o.checkExistingLocked(oldname, child); err != nil {
 		return nil, err
 	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
 	replaced := dst.replaceChildLocked(newname, child)
 	return replaced, nil
 }
@@ -555,6 +561,8 @@ type StaticDirectory struct {
 	InodeAttrs
 	InodeNoDynamicLookup
 	OrderedChildren
+
+	locks vfs.FileLocks
 }
 
 var _ Inode = (*StaticDirectory)(nil)
@@ -584,7 +592,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
 
 // Open implements kernfs.Inode.
 func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a83151ad3..46f207664 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -225,9 +225,24 @@ func (d *Dentry) destroy() {
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches. Always nil: inotify is unimplemented.
+func (d *Dentry) Watches() *vfs.Watches {
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *Dentry) OnZeroWatches() {}
+
 // InsertChild inserts child into the vfs dentry cache with the given name under
 // this dentry. This does not update the directory inode, so calling this on
-// it's own isn't sufficient to insert a child into a directory. InsertChild
+// its own isn't sufficient to insert a child into a directory. InsertChild
 // updates the link count on d if required.
 //
 // Precondition: d must represent a directory inode.
@@ -331,7 +346,7 @@ type inodeMetadata interface {
 
 	// Stat returns the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.StatAt.
-	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
+	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
 
 	// SetStat updates the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
@@ -413,10 +428,10 @@ type inodeDynamicLookup interface {
 	// IterDirents is used to iterate over dynamically created entries. It invokes
 	// cb on each entry in the directory represented by the FileDescription.
 	// 'offset' is the offset for the entire IterDirents call, which may include
-	// results from the caller. 'relOffset' is the offset inside the entries
-	// returned by this IterDirents invocation. In other words,
-	// 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
-	// while 'relOffset' is the place where iteration should start from.
+	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
+	// inside the entries returned by this IterDirents invocation. In other words,
+	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
+	// the return value, while 'relOffset' is the place to start iteration.
 	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 412cf6ac9..dc407eb1d 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -100,8 +100,10 @@ type readonlyDir struct {
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeDirectoryNoNewChildren
-
 	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
 	dentry kernfs.Dentry
 }
 
@@ -117,7 +119,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 }
 
 func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -128,10 +130,12 @@ type dir struct {
 	attrs
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
 
 	fs     *filesystem
 	dentry kernfs.Dentry
-	kernfs.OrderedChildren
 }
 
 func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
@@ -147,7 +151,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
new file mode 100644
index 000000000..8cf5b35d3
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "overlay",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
+go_library(
+    name = "overlay",
+    srcs = [
+        "copy_up.go",
+        "directory.go",
+        "filesystem.go",
+        "fstree.go",
+        "non_directory.go",
+        "overlay.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
new file mode 100644
index 000000000..8f8dcfafe
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -0,0 +1,262 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isCopiedUp() bool {
+	return atomic.LoadUint32(&d.copiedUp) != 0
+}
+
+// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok().
+//
+// Preconditions: filesystem.renameMu must be locked.
+func (d *dentry) copyUpLocked(ctx context.Context) error {
+	// Fast path.
+	if d.isCopiedUp() {
+		return nil
+	}
+
+	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+	switch ftype {
+	case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
+		// Can be copied-up.
+	default:
+		// Can't be copied-up.
+		return syserror.EPERM
+	}
+
+	// Ensure that our parent directory is copied-up.
+	if d.parent == nil {
+		// d is a filesystem root with no upper layer.
+		return syserror.EROFS
+	}
+	if err := d.parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	if d.upperVD.Ok() {
+		// Raced with another call to d.copyUpLocked().
+		return nil
+	}
+	if d.vfsd.IsDead() {
+		// Raced with deletion of d.
+		return syserror.ENOENT
+	}
+
+	// Perform copy-up.
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	newpop := vfs.PathOperation{
+		Root:  d.parent.upperVD,
+		Start: d.parent.upperVD,
+		Path:  fspath.Parse(d.name),
+	}
+	cleanupUndoCopyUp := func() {
+		var err error
+		if ftype == linux.S_IFDIR {
+			err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop)
+		} else {
+			err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
+		}
+		if err != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+		}
+	}
+	switch ftype {
+	case linux.S_IFREG:
+		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY,
+		})
+		if err != nil {
+			return err
+		}
+		defer oldFD.DecRef()
+		newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{
+			Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL,
+			Mode:  linux.FileMode(d.mode &^ linux.S_IFMT),
+		})
+		if err != nil {
+			return err
+		}
+		defer newFD.DecRef()
+		bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
+		for {
+			readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{})
+			if readErr != nil && readErr != io.EOF {
+				cleanupUndoCopyUp()
+				return readErr
+			}
+			src := bufIOSeq.TakeFirst64(readN) // only the bytes just read, not the whole buffer
+			for src.NumBytes() != 0 {
+				writeN, writeErr := newFD.Write(ctx, src, vfs.WriteOptions{})
+				src = src.DropFirst64(writeN)
+				if writeErr != nil {
+					cleanupUndoCopyUp()
+					return writeErr
+				}
+			}
+			if readErr == io.EOF {
+				break
+			}
+		}
+		if err := newFD.SetStat(ctx, vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = newFD.VirtualDentry()
+		d.upperVD.IncRef()
+
+	case linux.S_IFDIR:
+		if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{
+			Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+		}); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	case linux.S_IFLNK:
+		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		})
+		if err != nil {
+			return err
+		}
+		if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
+				Mode: uint16(d.mode),
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	case linux.S_IFBLK, linux.S_IFCHR:
+		lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		}, &vfs.StatOptions{})
+		if err != nil {
+			return err
+		}
+		if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
+			Mode:     linux.FileMode(d.mode),
+			DevMajor: lowerStat.RdevMajor,
+			DevMinor: lowerStat.RdevMinor,
+		}); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	default:
+		// Should have rejected this at the beginning of this function?
+		panic(fmt.Sprintf("unexpected file type %o", ftype))
+	}
+
+	// TODO(gvisor.dev/issue/1199): copy up xattrs
+
+	// Update the dentry's device and inode numbers (except for directories,
+	// for which these remain overlay-assigned).
+	if ftype != linux.S_IFDIR {
+		upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.upperVD,
+			Start: d.upperVD,
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_INO,
+		})
+		if err != nil {
+			d.upperVD.DecRef()
+			d.upperVD = vfs.VirtualDentry{}
+			cleanupUndoCopyUp()
+			return err
+		}
+		if upperStat.Mask&linux.STATX_INO == 0 {
+			d.upperVD.DecRef()
+			d.upperVD = vfs.VirtualDentry{}
+			cleanupUndoCopyUp()
+			return syserror.EREMOTE
+		}
+		atomic.StoreUint32(&d.devMajor, upperStat.DevMajor)
+		atomic.StoreUint32(&d.devMinor, upperStat.DevMinor)
+		atomic.StoreUint64(&d.ino, upperStat.Ino)
+	}
+
+	atomic.StoreUint32(&d.copiedUp, 1)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
new file mode 100644
index 000000000..f5c2462a5
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -0,0 +1,287 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR // compare only the file-type bits of mode
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	var readdirErr error
+	whiteouts := make(map[string]bool) // whiteout name -> isUpper of the layer it was found on
+	var maybeWhiteouts []string
+	d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+		layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+		})
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+		defer layerFD.DecRef()
+
+		// Reuse slice allocated for maybeWhiteouts from a previous layer to
+		// reduce allocations.
+		maybeWhiteouts = maybeWhiteouts[:0]
+		if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name == "." || dirent.Name == ".." {
+				return nil
+			}
+			if _, ok := whiteouts[dirent.Name]; ok {
+				// This file has been whited-out in a previous layer.
+				return nil
+			}
+			if dirent.Type == linux.DT_CHR {
+				// We have to determine if this is a whiteout, which doesn't
+				// count against the directory's emptiness. However, we can't
+				// do so while holding locks held by layerFD.IterDirents().
+				maybeWhiteouts = append(maybeWhiteouts, dirent.Name)
+				return nil
+			}
+			// Non-whiteout file in the directory prevents rmdir.
+			return syserror.ENOTEMPTY
+		})); err != nil {
+			readdirErr = err
+			return false
+		}
+
+		for _, maybeWhiteoutName := range maybeWhiteouts {
+			stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  layerVD,
+				Start: layerVD,
+				Path:  fspath.Parse(maybeWhiteoutName),
+			}, &vfs.StatOptions{})
+			if err != nil {
+				readdirErr = err
+				return false
+			}
+			if stat.RdevMajor != 0 || stat.RdevMinor != 0 {
+				// This file is a real character device, not a whiteout.
+				readdirErr = syserror.ENOTEMPTY
+				return false
+			}
+			whiteouts[maybeWhiteoutName] = isUpper
+		}
+		// Continue iteration since we haven't found any non-whiteout files in
+		// this directory yet.
+		return true
+	})
+	return whiteouts, readdirErr // readdirErr is ENOTEMPTY if a non-whiteout entry was seen, or the error that stopped the scan
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	mu      sync.Mutex   // held by IterDirents and Seek while they touch off and dirents
+	off     int64        // index into dirents of the next entry to hand to the callback
+	dirents []vfs.Dirent // listing from dentry.getDirents; nil until first IterDirents or after Seek(0, SEEK_SET)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It does nothing.
+func (fd *directoryFD) Release() {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	d := fd.dentry()
+	if fd.dirents == nil { // first call, or first call after Seek(0, SEEK_SET): fetch the listing
+		ds, err := d.getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = ds
+	}
+
+	for fd.off < int64(len(fd.dirents)) {
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
+		}
+		fd.off++ // advance only once the callback has accepted the entry
+	}
+	return nil
+}
+
+// Preconditions: d.isDir().
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+
+	if d.dirents != nil { // reuse the listing cached by an earlier call
+		return d.dirents, nil
+	}
+
+	parent := genericParentOrSelf(d)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     d.ino,
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     parent.ino,
+			NextOff: 2,
+		},
+	}
+
+	// Merge dirents from all layers comprising this directory.
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	var readdirErr error
+	prevDirents := make(map[string]struct{}) // names already seen on an earlier layer, whiteout candidates included
+	var maybeWhiteouts []vfs.Dirent
+	d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+		layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+		})
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+		defer layerFD.DecRef()
+
+		// Reuse slice allocated for maybeWhiteouts from a previous layer to
+		// reduce allocations.
+		maybeWhiteouts = maybeWhiteouts[:0]
+		if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name == "." || dirent.Name == ".." {
+				return nil
+			}
+			if _, ok := prevDirents[dirent.Name]; ok {
+				// This file is hidden by, or merged with, another file with
+				// the same name in a previous layer.
+				return nil
+			}
+			prevDirents[dirent.Name] = struct{}{}
+			if dirent.Type == linux.DT_CHR {
+				// We can't determine if this file is a whiteout while holding
+				// locks held by layerFD.IterDirents().
+				maybeWhiteouts = append(maybeWhiteouts, dirent)
+				return nil
+			}
+			dirent.NextOff = int64(len(dirents) + 1)
+			dirents = append(dirents, dirent)
+			return nil
+		})); err != nil {
+			readdirErr = err
+			return false
+		}
+
+		for _, dirent := range maybeWhiteouts {
+			stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  layerVD,
+				Start: layerVD,
+				Path:  fspath.Parse(dirent.Name),
+			}, &vfs.StatOptions{})
+			if err != nil {
+				readdirErr = err
+				return false
+			}
+			if stat.RdevMajor == 0 && stat.RdevMinor == 0 {
+				// This file is a whiteout; don't emit a dirent for it.
+				continue
+			}
+			dirent.NextOff = int64(len(dirents) + 1)
+			dirents = append(dirents, dirent)
+		}
+		return true
+	})
+	if readdirErr != nil {
+		return nil, readdirErr
+	}
+
+	// Cache dirents for future directoryFDs.
+	d.dirents = dirents
+	return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	// Translate (offset, whence) into an absolute position; only SEEK_SET
+	// and SEEK_CUR are supported.
+	newOff := offset
+	switch whence {
+	case linux.SEEK_SET:
+		// offset is already absolute.
+	case linux.SEEK_CUR:
+		newOff += fd.off
+	default:
+		return 0, syserror.EINVAL
+	}
+	if newOff < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Rewinding with SEEK_SET discards the cached dirents so that the next
+	// call to fd.IterDirents() calls fd.dentry().getDirents() again. A
+	// SEEK_CUR that happens to land on 0 deliberately keeps them.
+	if whence == linux.SEEK_SET && newOff == 0 {
+		fd.dirents = nil
+	}
+	fd.off = newOff
+	return newOff, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper
+// layer, if there is one. The lower layer doesn't need to sync because it
+// never changes.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+	d := fd.dentry()
+	if !d.isCopiedUp() { // nothing to sync when d has no upper-layer directory
+		return nil
+	}
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	pop := vfs.PathOperation{
+		Root:  d.upperVD,
+		Start: d.upperVD,
+	}
+	upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
+	if err != nil {
+		return err
+	}
+	err = upperFD.Sync(ctx)
+	upperFD.DecRef() // release the temporary FD whether or not Sync failed
+	return err
+}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
new file mode 100644
index 000000000..6b705e955
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -0,0 +1,1364 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
+// opaque directories, i.e. directories that are not merged with lower layers.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
+const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+
+func isWhiteout(stat *linux.Statx) bool {
+	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 // whiteout == character device numbered 0:0
+}
+
+// Sync implements vfs.FilesystemImpl.Sync by syncing the upper layer's filesystem, if any.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	if fs.opts.UpperRoot.Ok() {
+		return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx)
+	}
+	return nil // no upper layer configured: nothing to sync
+}
+
+var dentrySlicePool = sync.Pool{ // recycles the scratch slices handed out by appendDentry and returned by putDentrySlice
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil { // lazily take a slice from the pool on the first append
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds // callers must keep the returned pointer; it differs from the argument when ds was nil
+}
+
+// putDentrySlice empties *ds and returns it to the pool. Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	s := *ds
+	for i := range s {
+		s[i] = nil // allow dentries to be GC'd
+	}
+	*ds = s[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil { // no slice was ever allocated, so there is nothing to check or recycle
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock() // re-acquired for writing; the read lock was fully released above
+		for _, d := range **ds {
+			d.checkDropLocked()
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+// renameMuUnlockAndCheckDrop is renameMuRUnlockAndCheckDrop for callers holding fs.renameMu for writing.
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) {
+	pending := *ds
+	if pending != nil {
+		for _, d := range *pending {
+			d.checkDropLocked()
+		}
+		defer putDentrySlice(pending) // runs after the Unlock below
+	}
+	fs.renameMu.Unlock()
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil { // ".." at a root resolves to d itself
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(&child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if child, ok := parent.children[name]; ok { // already cached under parent
+		return child, nil
+	}
+	child, err := fs.lookupLocked(ctx, parent, name)
+	if err != nil {
+		return nil, err
+	}
+	if parent.children == nil { // the children map is allocated on first use
+		parent.children = make(map[string]*dentry)
+	}
+	parent.children[name] = child
+	// child's refcount is initially 0, so it may be dropped after traversal.
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// lookupLocked returns a new dentry for name in parent, merged from every layer
+// of parent that contributes to it. It returns ENOENT if no layer does.
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+	childPath := fspath.Parse(name)
+	child := fs.newDentry()
+	existsOnAnyLayer := false
+	var lookupErr error
+
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+		childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parentVD,
+			Start: parentVD,
+			Path:  childPath,
+		}, &vfs.GetDentryOptions{})
+		if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+			// The file doesn't exist on this layer. Proceed to the next one.
+			return true
+		}
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		defer childVD.DecRef() // drop GetDentryAt's reference on every path out of this layer
+
+		mask := uint32(linux.STATX_TYPE)
+		if !existsOnAnyLayer {
+			// Mode, UID, GID, and (for non-directories) inode number come from
+			// the topmost layer on which the file exists.
+			mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+		}
+		stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  childVD,
+			Start: childVD,
+		}, &vfs.StatOptions{
+			Mask: mask,
+		})
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		if stat.Mask&mask != mask {
+			lookupErr = syserror.EREMOTE
+			return false
+		}
+
+		if isWhiteout(&stat) {
+			// This is a whiteout, so it "doesn't exist" on this layer, and
+			// layers below this one are ignored.
+			return false
+		}
+		isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
+		if existsOnAnyLayer && !isDir {
+			// A non-directory below a directory is never merged: this layer and
+			// all layers below it are ignored. (The file must be a directory on
+			// previous layers, since lookup stops at the first non-directory.)
+			return false
+		}
+
+		// Update child to include this layer.
+		childVD.IncRef() // child keeps its own reference beyond the deferred DecRef
+		if isUpper {
+			child.upperVD = childVD
+			child.copiedUp = 1
+		} else {
+			child.lowerVDs = append(child.lowerVDs, childVD)
+		}
+		if !existsOnAnyLayer {
+			existsOnAnyLayer = true
+			child.mode = uint32(stat.Mode)
+			child.uid = stat.UID
+			child.gid = stat.GID
+			child.devMajor = stat.DevMajor
+			child.devMinor = stat.DevMinor
+			child.ino = stat.Ino
+		}
+
+		// For non-directories only the topmost layer containing the file matters.
+		if !isDir {
+			return false
+		}
+
+		// Directories are merged with directories from lower layers if they
+		// are not explicitly opaque.
+		opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  childVD,
+			Start: childVD,
+		}, &vfs.GetxattrOptions{
+			Name: _OVL_XATTR_OPAQUE,
+			Size: 1,
+		})
+		return !(err == nil && opaqueVal == "y")
+	})
+
+	if lookupErr != nil {
+		child.destroyLocked()
+		return nil, lookupErr
+	}
+	if !existsOnAnyLayer {
+		child.destroyLocked()
+		return nil, syserror.ENOENT
+	}
+
+	// Override the device and inode numbers copied from the topmost layer where needed.
+	if child.isDir() {
+		child.devMajor = linux.UNNAMED_MAJOR
+		child.devMinor = fs.dirDevMinor
+		child.ino = fs.newDirIno()
+	} else if !child.upperVD.Ok() {
+		child.devMajor = linux.UNNAMED_MAJOR
+		child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()]
+	}
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+	return child, nil
+}
+
+// lookupLayerLocked is similar to lookupLocked, but only returns information
+// about the file rather than a dentry.
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
+	childPath := fspath.Parse(name)
+	lookupLayer := lookupLayerNone // result if no layer holds the name, or a lower layer whites it out
+	var lookupErr error
+
+	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+		stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parentVD,
+			Start: parentVD,
+			Path:  childPath,
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_TYPE,
+		})
+		if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+			// The file doesn't exist on this layer. Proceed to the next
+			// one.
+			return true
+		}
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		if stat.Mask&linux.STATX_TYPE == 0 {
+			// Linux's overlayfs tends to return EREMOTE in cases where a file
+			// is unusable for reasons that are not better captured by another
+			// errno.
+			lookupErr = syserror.EREMOTE
+			return false
+		}
+		if isWhiteout(&stat) {
+			// This is a whiteout, so it "doesn't exist" on this layer, and
+			// layers below this one are ignored.
+			if isUpper {
+				lookupLayer = lookupLayerUpperWhiteout
+			}
+			return false
+		}
+		// The file exists; we can stop searching.
+		if isUpper {
+			lookupLayer = lookupLayerUpper
+		} else {
+			lookupLayer = lookupLayerLower
+		}
+		return false
+	})
+
+	return lookupLayer, lookupErr
+}
+
+type lookupLayer int // result of lookupLayerLocked: where, if anywhere, a name exists in the overlay
+
+const (
+	// lookupLayerNone indicates that no file exists at the given path on the
+	// upper layer, and is either whited out or does not exist on lower layers.
+	// Therefore, the file does not exist in the overlay filesystem, and file
+	// creation may proceed normally (if an upper layer exists).
+	lookupLayerNone lookupLayer = iota
+
+	// lookupLayerLower indicates that no file exists at the given path on the
+	// upper layer, but exists on a lower layer. Therefore, the file exists in
+	// the overlay filesystem, but must be copied-up before mutation.
+	lookupLayerLower
+
+	// lookupLayerUpper indicates that a non-whiteout file exists at the given
+	// path on the upper layer. Therefore, the file exists in the overlay
+	// filesystem, and is already copied-up.
+	lookupLayerUpper
+
+	// lookupLayerUpperWhiteout indicates that a whiteout exists at the given
+	// path on the upper layer. Therefore, the file does not exist in the
+	// overlay filesystem, and file creation must remove the whiteout before
+	// proceeding.
+	lookupLayerUpperWhiteout
+)
+
+func (ll lookupLayer) existsInOverlay() bool {
+	return ll == lookupLayerLower || ll == lookupLayerUpper // true only for the two "file exists" results
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	for !rp.Final() {
+		d.dirMu.Lock() // stepLocked requires d.dirMu; it is held for one step only
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		d.dirMu.Lock() // stepLocked requires d.dirMu; it is held for one step only
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() { // rp demands a directory but the final file is not one
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() { // a non-directory cannot be created at a path that must name a directory
+		return syserror.ENOENT
+	}
+	if parent.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Determine if a file already exists at name.
+	if _, ok := parent.children[name]; ok {
+		return syserror.EEXIST
+	}
+	childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
+	if err != nil {
+		return err
+	}
+	if childLayer.existsInOverlay() {
+		return syserror.EEXIST
+	}
+
+	// Ensure that the parent directory is copied-up so that we can create the
+	// new file in the upper layer.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Finally create the new file.
+	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
+		return err
+	}
+	parent.dirents = nil // drop the cached listing so the next getDirents rebuilds it with the new entry
+	return nil
+}
+
+// createWhiteout creates a whiteout at pop. Preconditions: pop's parent directory has been copied up.
+func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error {
+	return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{
+		Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0
+		// DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV
+	})
+}
+
+func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
+	if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { // best effort: a failure is only logged, never returned
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+	}
+}
+
+// AccessAt implements vfs.FilesystemImpl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats) // checked against the creds argument, not rp.Credentials()
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	layerVD := d.topLayer() // the request is forwarded to this layer using the overlay's own creds
+	return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	}, &opts)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef() // take a reference for the dentry being returned
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef() // take a reference for the dentry being returned
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		old := vd.Dentry().Impl().(*dentry)
+		if old.isDir() {
+			return syserror.EPERM
+		}
+		if err := old.copyUpLocked(ctx); err != nil { // the link is made in the upper layer, so old must exist there
+			return err
+		}
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		newpop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  old.upperVD,
+			Start: old.upperVD,
+		}, &newpop); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+			}
+			return err
+		}
+		creds := rp.Credentials() // NOTE(review): newpop is a hard link, so the chown below presumably also changes old's owner — confirm this is intended
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout { // the whiteout occupies the name and must be removed first
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials() // the directory was created with fs.creds; hand ownership to the caller
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		if haveUpperWhiteout {
+			// There may be directories on lower layers (previously hidden by
+			// the whiteout) that the new directory should not be merged with.
+			// Mark it opaque to prevent merging.
+			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+				Name:  _OVL_XATTR_OPAQUE,
+				Value: "y",
+			}); err != nil {
+				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+				} else {
+					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+				}
+				return err
+			}
+		}
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		// Disallow attempts to create whiteouts.
+		if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
+			return syserror.EPERM
+		}
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout { // the whiteout occupies the name and must be removed first
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials() // the node was created with fs.creds; hand ownership to the caller
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if rp.Done() {
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.openLocked(ctx, rp, &opts)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+	if err == syserror.ENOENT && mayCreate {
+		fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	parent.dirMu.Unlock() // dirMu is only needed for the lookup/create step above
+	if err != nil {
+		return nil, err
+	}
+	// Open existing child or follow symlink.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	if rp.MustBeDir() && !child.isDir() { // same rule as resolveLocked: the path requires a directory
+		return nil, syserror.ENOTDIR
+	}
+	return child.openLocked(ctx, rp, &opts)
+}
+
+// openLocked opens the existing dentry d and returns an overlay file
+// description for it. Directories get a directoryFD; everything else gets a
+// nonDirectoryFD wrapping an FD opened on d's top layer.
+//
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	accessTypes := vfs.AccessTypesForOpenFlags(opts)
+	if err := d.checkPermissions(rp.Credentials(), accessTypes); err != nil {
+		return nil, err
+	}
+	// Anything opened for writing must live on the upper layer first.
+	if accessTypes.MayWrite() {
+		if err := d.copyUpLocked(ctx); err != nil {
+			return nil, err
+		}
+	}
+	mnt := rp.Mount()
+
+	// Directory FDs open FDs from each layer when directory entries are read,
+	// so they don't require opening an FD from d.topLayer() up front.
+	if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
+		switch {
+		case opts.Flags&linux.O_CREAT != 0:
+			// Can't open directories with O_CREAT.
+			return nil, syserror.EISDIR
+		case accessTypes&vfs.MayWrite != 0:
+			// Can't open directories writably.
+			return nil, syserror.EISDIR
+		case opts.Flags&linux.O_DIRECT != 0:
+			return nil, syserror.EINVAL
+		}
+		dirFD := &directoryFD{}
+		dirFD.LockFD.Init(&d.locks)
+		dirFDOpts := vfs.FileDescriptionOptions{
+			UseDentryMetadata: true,
+		}
+		if err := dirFD.vfsfd.Init(dirFD, opts.Flags, mnt, &d.vfsd, &dirFDOpts); err != nil {
+			return nil, err
+		}
+		return &dirFD.vfsfd, nil
+	}
+
+	// Non-directory: open the file on whichever layer is currently on top.
+	topVD, onUpper := d.topLayerInfo()
+	topPop := vfs.PathOperation{
+		Root:  topVD,
+		Start: topVD,
+	}
+	wrapped, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &topPop, opts)
+	if err != nil {
+		return nil, err
+	}
+	wrappedFlags := wrapped.StatusFlags()
+	fileFD := &nonDirectoryFD{
+		copiedUp:    onUpper,
+		cachedFD:    wrapped,
+		cachedFlags: wrappedFlags,
+	}
+	fileFD.LockFD.Init(&d.locks)
+	wrappedOpts := wrapped.Options()
+	if err := fileFD.vfsfd.Init(fileFD, wrappedFlags, mnt, &d.vfsd, &wrappedOpts); err != nil {
+		// The overlay FD never took ownership of the layer FD; release it.
+		wrapped.DecRef()
+		return nil, err
+	}
+	return &fileFD.vfsfd, nil
+}
+
+// createAndOpenLocked creates a file named rp.Component() in parent's upper
+// layer directory and returns an overlay FD for it.
+//
+// The steps are: check write permission on parent, copy parent up, remove a
+// possible whiteout for the name from the upper layer, create and open the
+// file there with O_CREAT|O_EXCL, change its owner to the caller, look up a
+// dentry for it, and wrap the upper layer FD in a nonDirectoryFD. If a step
+// after the creation fails, the new file is unlinked again and a removed
+// whiteout is recreated on a best-effort basis.
+//
+// Preconditions: parent.dirMu must be locked. parent does not already contain
+// a child named rp.Component().
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+	creds := rp.Credentials()
+	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	if parent.vfsd.IsDead() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return nil, err
+	}
+
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	childName := rp.Component()
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(childName),
+	}
+	// We don't know if a whiteout exists on the upper layer; speculatively
+	// unlink it.
+	//
+	// TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do
+	// know whether a whiteout exists.
+	var haveUpperWhiteout bool
+	switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err {
+	case nil:
+		haveUpperWhiteout = true
+	case syserror.ENOENT:
+		haveUpperWhiteout = false
+	default:
+		return nil, err
+	}
+	// Create the file on the upper layer, and get an FD representing it.
+	upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
+		Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
+		Mode:  opts.Mode,
+	})
+	if err != nil {
+		if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Change the file's owner to the caller. We can't use upperFD.SetStat()
+	// because it will pick up creds from ctx.
+	if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_UID | linux.STATX_GID,
+			UID:  uint32(creds.EffectiveKUID),
+			GID:  uint32(creds.EffectiveKGID),
+		},
+	}); err != nil {
+		// upperFD is not handed to any overlay FD on this path, so drop the
+		// reference obtained from OpenAt instead of leaking it.
+		upperFD.DecRef()
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Re-lookup to get a dentry representing the new file, which is needed for
+	// the returned FD.
+	child, err := fs.getChildLocked(ctx, parent, childName, ds)
+	if err != nil {
+		// As above: nothing else owns upperFD yet, so release it here.
+		upperFD.DecRef()
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// The file now exists in parent, so reset parent.dirents the same way
+	// RmdirAt and UnlinkAt do after changing a directory's contents. No cleanup
+	// is performed below this point, so this applies even if constructing the
+	// FD fails.
+	parent.dirents = nil
+	// Finally construct the overlay FD.
+	upperFlags := upperFD.StatusFlags()
+	fd := &nonDirectoryFD{
+		copiedUp:    true,
+		cachedFD:    upperFD,
+		cachedFlags: upperFlags,
+	}
+	fd.LockFD.Init(&child.locks)
+	upperFDOpts := upperFD.Options()
+	if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
+		upperFD.DecRef()
+		// Don't bother with cleanup; the file was created successfully, we
+		// just can't open it anymore for some reason.
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// The path is resolved with fs.renameMu held for reading, and the link target
+// is read from the resolved dentry's top layer using the filesystem's own
+// credentials.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	target, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	topVD := target.topLayer()
+	topPop := vfs.PathOperation{
+		Root:  topVD,
+		Start: topVD,
+	}
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	return vfsObj.ReadlinkAt(ctx, target.fs.creds, &topPop)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+//
+// Rename is not implemented yet: after validating the flags, the destination
+// name, the mount, and write access to the mount, it always fails with EXDEV.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// No rename flags are supported.
+	if opts.Flags != 0 {
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock()
+	defer fs.renameMuUnlockAndCheckDrop(&ds)
+
+	// The destination's parent is resolved only for its error for now.
+	start := rp.Start().Impl().(*dentry)
+	if _, err := fs.walkParentDirLocked(ctx, rp, start, &ds); err != nil {
+		return err
+	}
+	switch rp.Component() {
+	case ".", "..":
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if oldParentVD.Mount() != mnt {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	// FIXME(gvisor.dev/issue/1199): Actually implement rename.
+	return syserror.EXDEV
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+//
+// The directory is removed from the overlay by deleting the whiteouts inside
+// its upper layer directory and removing that directory (both only if the
+// child has an upper layer directory), and then creating a whiteout for the
+// name in the parent's upper layer directory.
+//
+// A final path component of "." yields EINVAL and ".." yields ENOTEMPTY; a
+// child that is not a directory yields ENOTDIR.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	// Removing an entry requires write and search permission on the parent.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Ensure that parent is copied-up before potentially holding child.copyMu
+	// below.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Unlike UnlinkAt, we need a dentry representing the child directory being
+	// removed in order to verify that it's empty.
+	child, err := fs.getChildLocked(ctx, parent, name, &ds)
+	if err != nil {
+		return err
+	}
+	if !child.isDir() {
+		return syserror.ENOTDIR
+	}
+	child.dirMu.Lock()
+	defer child.dirMu.Unlock()
+	// whiteouts maps a whiteout's name to a bool; only entries whose value is
+	// true are removed from child.upperVD below, so the value presumably marks
+	// whiteouts that live on the upper layer.
+	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
+	if err != nil {
+		return err
+	}
+	child.copyMu.RLock()
+	defer child.copyMu.RUnlock()
+	// From here on, every error return must call AbortDeleteDentry.
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+		return err
+	}
+
+	// pop names the child within the parent's upper layer directory.
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(name),
+	}
+	if child.upperVD.Ok() {
+		// cleanupRecreateWhiteouts is the rollback for a failed removal: it
+		// tries to recreate every whiteout that may have been deleted from the
+		// child's upper layer directory, ignoring EEXIST for those that were
+		// never removed.
+		cleanupRecreateWhiteouts := func() {
+			if !child.upperVD.Ok() {
+				return
+			}
+			for whiteoutName, whiteoutUpper := range whiteouts {
+				if !whiteoutUpper {
+					continue
+				}
+				if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+					Root:  child.upperVD,
+					Start: child.upperVD,
+					Path:  fspath.Parse(whiteoutName),
+				}); err != nil && err != syserror.EEXIST {
+					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+				}
+			}
+		}
+		// Remove existing whiteouts on the upper layer.
+		for whiteoutName, whiteoutUpper := range whiteouts {
+			if !whiteoutUpper {
+				continue
+			}
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  child.upperVD,
+				Start: child.upperVD,
+				Path:  fspath.Parse(whiteoutName),
+			}); err != nil {
+				cleanupRecreateWhiteouts()
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+				return err
+			}
+		}
+		// Remove the existing directory on the upper layer.
+		if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
+			cleanupRecreateWhiteouts()
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+			return err
+		}
+	}
+	// Put a whiteout for the removed name in the parent's upper layer
+	// directory.
+	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+		// Don't attempt to recover from this: the original directory is
+		// already gone, so any dentries representing it are invalid, and
+		// creating a new directory won't undo that.
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
+		vfsObj.AbortDeleteDentry(&child.vfsd)
+		return err
+	}
+
+	// Success: commit the deletion, forget the child dentry, and reset the
+	// parent's dirents.
+	vfsObj.CommitDeleteDentry(&child.vfsd)
+	delete(parent.children, name)
+	ds = appendDentry(ds, child)
+	parent.dirents = nil
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// After the permission check, the resolved dentry is copied up and the
+// attribute change is applied to its upper layer file with the filesystem's
+// own credentials; the dentry is then updated from opts.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+
+	// Check the request against the dentry's current mode and ownership.
+	fileMode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	ownerUID := auth.KUID(atomic.LoadUint32(&d.uid))
+	ownerGID := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, fileMode, ownerUID, ownerGID); err != nil {
+		return err
+	}
+
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	upperPop := vfs.PathOperation{
+		Root:  d.upperVD,
+		Start: d.upperVD,
+	}
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	if err := vfsObj.SetStatAt(ctx, d.fs.creds, &upperPop, &opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+//
+// Fields outside statInternalMask are fetched from the resolved dentry's top
+// layer; statInternalTo is then always called on the result.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	var stat linux.Statx
+	layerMask := opts.Mask &^ statInternalMask
+	if layerMask != 0 {
+		// Something was requested that only the underlying layer can supply.
+		topVD := d.topLayer()
+		topPop := vfs.PathOperation{
+			Root:  topVD,
+			Start: topVD,
+		}
+		layerOpts := vfs.StatOptions{
+			Mask: layerMask,
+			Sync: opts.Sync,
+		}
+		stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &topPop, &layerOpts)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	d.statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+//
+// The path is resolved only to surface resolution errors; the result comes
+// from fs.statFS and does not depend on which dentry was resolved.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return linux.Statfs{}, err
+	}
+	return fs.statFS(ctx)
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// The symlink is created in the parent's upper layer directory (replacing a
+// whiteout if doCreateAt reports one) and then chowned to the caller. If the
+// ownership change fails the symlink is removed again, and a removed whiteout
+// is recreated on a best-effort basis.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	createSymlink := func(parent *dentry, name string, hadWhiteout bool) error {
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		upperPop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(name),
+		}
+		// A whiteout occupying the name has to go before the symlink can be
+		// created.
+		if hadWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &upperPop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.SymlinkAt(ctx, fs.creds, &upperPop, target); err != nil {
+			if hadWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &upperPop)
+			}
+			return err
+		}
+		// Hand ownership of the new symlink to the caller.
+		callerCreds := rp.Credentials()
+		ownerOpts := vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(callerCreds.EffectiveKUID),
+				GID:  uint32(callerCreds.EffectiveKGID),
+			},
+		}
+		err := vfsObj.SetStatAt(ctx, fs.creds, &upperPop, &ownerOpts)
+		if err == nil {
+			return nil
+		}
+		// Roll back: remove the symlink, then restore the whiteout if there
+		// was one and the removal succeeded.
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &upperPop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+		} else if hadWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &upperPop)
+		}
+		return err
+	}
+	return fs.doCreateAt(ctx, rp, false /* dir */, createSymlink)
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+//
+// The file is removed from the overlay by unlinking it from the parent's
+// upper layer directory (if it exists there) and then creating a whiteout for
+// the name in that directory.
+//
+// A final path component of "." or "..", or a child that is a directory,
+// yields EISDIR; ENOTDIR is returned if rp.MustBeDir(); ENOENT is returned if
+// the name does not exist in the overlay.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	// Removing an entry requires write and search permission on the parent.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	if rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Ensure that parent is copied-up before potentially holding child.copyMu
+	// below.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// child is nil when no dentry is present in parent.children for name; the
+	// file may still exist on a layer, which is checked below.
+	child := parent.children[name]
+	var childLayer lookupLayer
+	if child != nil {
+		if child.isDir() {
+			return syserror.EISDIR
+		}
+		// From here on, every error return must call AbortDeleteDentry.
+		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+			return err
+		}
+		// Hold child.copyMu to prevent it from being copied-up during
+		// deletion.
+		child.copyMu.RLock()
+		defer child.copyMu.RUnlock()
+		if child.upperVD.Ok() {
+			childLayer = lookupLayerUpper
+		} else {
+			childLayer = lookupLayerLower
+		}
+	} else {
+		// Determine if the file being unlinked actually exists. Holding
+		// parent.dirMu prevents a dentry from being instantiated for the file,
+		// which in turn prevents it from being copied-up, so this result is
+		// stable.
+		childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+		if err != nil {
+			return err
+		}
+		if !childLayer.existsInOverlay() {
+			return syserror.ENOENT
+		}
+	}
+
+	// pop names the child within the parent's upper layer directory.
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(name),
+	}
+	if childLayer == lookupLayerUpper {
+		// Remove the existing file on the upper layer.
+		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return err
+		}
+	}
+	// Put a whiteout for the removed name in the parent's upper layer
+	// directory. If this fails after the upper layer file was unlinked above,
+	// no rollback is attempted; the failure is only logged.
+	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
+		if child != nil {
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+		}
+		return err
+	}
+
+	// Success: commit the deletion and forget the child dentry if there was
+	// one, then reset the parent's dirents.
+	if child != nil {
+		vfsObj.CommitDeleteDentry(&child.vfsd)
+		delete(parent.children, name)
+		ds = appendDentry(ds, child)
+	}
+	parent.dirents = nil
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+//
+// Extended attributes are not supported: once the path resolves, the call
+// fails with ENOTSUP.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return nil, err
+	}
+	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
+	// but not any other xattr syscalls. For now we just reject all of them.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+//
+// Extended attributes are not supported: once the path resolves, the call
+// fails with ENOTSUP.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+//
+// Extended attributes are not supported: once the path resolves, the call
+// fails with ENOTSUP.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+//
+// Extended attributes are not supported: once the path resolves, the call
+// fails with ENOTSUP.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+//
+// It delegates to genericPrependPath with fs.renameMu held for reading.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+
+	mnt := vd.Mount()
+	d := vd.Dentry().Impl().(*dentry)
+	return genericPrependPath(vfsroot, mnt, d, b)
+}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
new file mode 100644
index 000000000..c0749e711
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -0,0 +1,266 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// isSymlink reports whether the file type bits of d.mode are S_IFLNK.
+func (d *dentry) isSymlink() bool {
+	fileType := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+	return fileType == linux.S_IFLNK
+}
+
+// readlink returns the symlink target stored in d's top layer, read with the
+// filesystem's own credentials.
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	topVD := d.topLayer()
+	topPop := vfs.PathOperation{
+		Root:  topVD,
+		Start: topVD,
+	}
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.ReadlinkAt(ctx, d.fs.creds, &topPop)
+}
+
+// nonDirectoryFD is the overlay file description for dentries that are not
+// directories. It forwards operations to cachedFD, a file description opened
+// on one of the dentry's layers, and replaces cachedFD with one opened on the
+// upper layer once the dentry has been copied up (see currentFDLocked).
+type nonDirectoryFD struct {
+	fileDescription
+
+	// If copiedUp is false, cachedFD represents
+	// fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
+	// fileDescription.dentry().upperVD. cachedFlags is the last known value of
+	// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
+	// protected by mu.
+	mu          sync.Mutex
+	copiedUp    bool
+	cachedFD    *vfs.FileDescription
+	cachedFlags uint32
+}
+
+// getCurrentFD returns the layer FD that operations on fd should currently
+// use, with an extra reference taken on it. The caller must release that
+// reference with DecRef.
+func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return nil, err
+	}
+	// The reference keeps layerFD usable after fd.mu is released.
+	layerFD.IncRef()
+	return layerFD, nil
+}
+
+// currentFDLocked returns fd.cachedFD after bringing it up to date: if the
+// dentry has been copied up since cachedFD was opened, cachedFD is replaced
+// by an FD opened on the upper layer at the same offset; otherwise cachedFD's
+// status flags are synchronized with fd.vfsfd's. No reference is taken on the
+// returned FD.
+//
+// It reads and writes copiedUp, cachedFD, and cachedFlags, which are
+// documented as protected by fd.mu.
+func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+	d := fd.dentry()
+	wantFlags := fd.vfsfd.StatusFlags()
+
+	if fd.copiedUp || !d.isCopiedUp() {
+		// cachedFD is already on the right layer; only the flags may be
+		// stale.
+		if fd.cachedFlags != wantFlags {
+			if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, wantFlags); err != nil {
+				return nil, err
+			}
+			fd.cachedFlags = wantFlags
+		}
+		return fd.cachedFD, nil
+	}
+
+	// Switch to the copied-up file.
+	upperVD := d.topLayer()
+	upperPop := vfs.PathOperation{
+		Root:  upperVD,
+		Start: upperVD,
+	}
+	upperOpts := vfs.OpenOptions{
+		Flags: wantFlags,
+	}
+	upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &upperPop, &upperOpts)
+	if err != nil {
+		return nil, err
+	}
+	// Carry the file offset over to the new FD. If the old offset cannot be
+	// read, the new FD keeps its initial offset.
+	if prevOff, prevOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR); prevOffErr == nil {
+		if _, err := upperFD.Seek(ctx, prevOff, linux.SEEK_SET); err != nil {
+			upperFD.DecRef()
+			return nil, err
+		}
+	}
+	fd.cachedFD.DecRef()
+	fd.copiedUp = true
+	fd.cachedFD = upperFD
+	fd.cachedFlags = wantFlags
+	return fd.cachedFD, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// It drops fd's reference on the cached layer FD and clears the field.
+func (fd *nonDirectoryFD) Release() {
+	layerFD := fd.cachedFD
+	layerFD.DecRef()
+	fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+//
+// It synchronizes the cached layer FD's status flags with fd.vfsfd's and then
+// forwards OnClose to that layer FD. An extra reference is held on the layer
+// FD for the duration of the call, since fd.mu is released first.
+func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
+	// Linux doesn't define ovl_file_operations.flush at all (i.e. its
+	// equivalent to OnClose is a no-op). We pass through to
+	// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
+	// copied-up, since OnClose is mostly used to define post-close writeback,
+	// and if fd.cachedFD hasn't been updated then it can't have been used to
+	// mutate fd.dentry() anyway.
+	fd.mu.Lock()
+	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+			fd.mu.Unlock()
+			return err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	wrappedFD := fd.cachedFD
+	// Take the reference now and drop it on return. Deferring IncRef instead
+	// would take the reference only after OnClose had finished and never
+	// release it, leaking one reference per close.
+	wrappedFD.IncRef()
+	defer wrappedFD.DecRef()
+	fd.mu.Unlock()
+	return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+//
+// Fields outside statInternalMask are fetched from the current layer FD;
+// statInternalTo is then always called on the result.
+func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	layerMask := opts.Mask &^ statInternalMask
+	if layerMask != 0 {
+		layerFD, err := fd.getCurrentFD(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+		layerOpts := vfs.StatOptions{
+			Mask: layerMask,
+			Sync: opts.Sync,
+		}
+		stat, err = layerFD.Stat(ctx, layerOpts)
+		// Release the reference from getCurrentFD before looking at err.
+		layerFD.DecRef()
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	fd.dentry().statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+//
+// After the permission check (using the credentials from ctx), the dentry is
+// copied up and the attribute change is applied through the current layer
+// FD; the dentry is then updated from opts.
+func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	d := fd.dentry()
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	mnt := fd.vfsfd.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	// NOTE(review): currentFDLocked is called here without fd.mu held, while
+	// the nonDirectoryFD field comment says copiedUp, cachedFD, and
+	// cachedFlags are protected by mu, and no reference is taken on the
+	// returned FD. Confirm whether holding d.copyMu is meant to be sufficient
+	// here.
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := wrappedFD.SetStat(ctx, opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+//
+// The result comes from the owning filesystem's statFS.
+func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+	overlayFS := fd.filesystem()
+	return overlayFS.statFS(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// The read is forwarded to the current layer FD, which is kept referenced
+// until the read returns.
+func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	layerFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer layerFD.DecRef()
+
+	return layerFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+//
+// The read is forwarded to the current layer FD while fd.mu is held.
+func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Hold fd.mu during the read to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return layerFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// The write is forwarded to the current layer FD, which is kept referenced
+// until the write returns.
+func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	layerFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer layerFD.DecRef()
+
+	return layerFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+//
+// The write is forwarded to the current layer FD while fd.mu is held.
+func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Hold fd.mu during the write to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return layerFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+//
+// The seek is forwarded to the current layer FD while fd.mu is held.
+func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Hold fd.mu during the seek to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return layerFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+//
+// It is a no-op returning nil while the dentry has not been copied up;
+// otherwise Sync is forwarded to the current layer FD, outside of fd.mu.
+func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
+	// Pick the FD to sync under fd.mu and take a reference on it so that it
+	// stays usable once the lock is dropped. A nil FD with a nil error means
+	// there is nothing to sync.
+	upperFD, err := func() (*vfs.FileDescription, error) {
+		fd.mu.Lock()
+		defer fd.mu.Unlock()
+		if !fd.dentry().isCopiedUp() {
+			return nil, nil
+		}
+		cur, err := fd.currentFDLocked(ctx)
+		if err != nil {
+			return nil, err
+		}
+		cur.IncRef()
+		return cur, nil
+	}()
+	if err != nil {
+		return err
+	}
+	if upperFD == nil {
+		return nil
+	}
+	defer upperFD.DecRef()
+	return upperFD.Sync(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+//
+// The call is forwarded to the current layer FD, which is kept referenced
+// until it returns.
+func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	layerFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return err
+	}
+	defer layerFD.DecRef()
+
+	return layerFD.ConfigureMMap(ctx, opts)
+}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
new file mode 100644
index 000000000..e720d4825
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -0,0 +1,627 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package overlay provides an overlay filesystem implementation, which
+// synthesizes a filesystem by composing one or more immutable filesystems
+// ("lower layers") with an optional mutable filesystem ("upper layer").
+//
+// Lock order:
+//
+// directoryFD.mu / nonDirectoryFD.mu
+//   filesystem.renameMu
+//     dentry.dirMu
+//       dentry.copyMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
+package overlay
+
+import (
+	"strings"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "overlay"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
+// FilesystemType.GetFilesystem.
+type FilesystemOptions struct {
+	// Callers passing FilesystemOptions to
+	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
+	// the vfs.Mounts comprising the layers of the overlay filesystem do not
+	// contain submounts.
+
+	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
+	// overlay.
+	UpperRoot vfs.VirtualDentry
+
+	// LowerRoots contains the roots of the immutable lower layers of the
+	// overlay. LowerRoots is immutable.
+	LowerRoots []vfs.VirtualDentry
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// Immutable options.
+	opts FilesystemOptions
+
+	// creds is a copy of the filesystem's creator's credentials, which are
+	// used for accesses to the filesystem's layers. creds is immutable.
+	creds *auth.Credentials
+
+	// dirDevMinor is the device minor number used for directories. dirDevMinor
+	// is immutable.
+	dirDevMinor uint32
+
+	// lowerDevMinors maps lower layer filesystems to device minor numbers
+	// assigned to non-directory files originating from that filesystem.
+	// lowerDevMinors is immutable.
+	lowerDevMinors map[*vfs.Filesystem]uint32
+
+	// renameMu synchronizes renaming with non-renaming operations in order to
+	// ensure consistent lock ordering between dentry.dirMu in different
+	// dentries.
+	renameMu sync.RWMutex
+
+	// lastDirIno is the last inode number assigned to a directory. lastDirIno
+	// is accessed using atomic memory operations.
+	lastDirIno uint64
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	fsoptsRaw := opts.InternalData
+	fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
+	if fsoptsRaw != nil && !haveFSOpts {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+		return nil, nil, syserror.EINVAL
+	}
+	if haveFSOpts {
+		if len(fsopts.LowerRoots) == 0 {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+			return nil, nil, syserror.EINVAL
+		}
+		if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+			return nil, nil, syserror.EINVAL
+		}
+		// We don't enforce a maximum number of lower layers when not
+		// configured by applications; the sandbox owner can have an overlay
+		// filesystem with any number of lower layers.
+	} else {
+		vfsroot := vfs.RootFromContext(ctx)
+		defer vfsroot.DecRef()
+		upperPathname, ok := mopts["upperdir"]
+		if ok {
+			delete(mopts, "upperdir")
+			// Linux overlayfs also requires a workdir when upperdir is
+			// specified; we don't, so silently ignore this option.
+			delete(mopts, "workdir")
+			upperPath := fspath.Parse(upperPathname)
+			if !upperPath.Absolute {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+				return nil, nil, syserror.EINVAL
+			}
+			upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+				Root:               vfsroot,
+				Start:              vfsroot,
+				Path:               upperPath,
+				FollowFinalSymlink: true,
+			}, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+				return nil, nil, err
+			}
+			defer upperRoot.DecRef()
+			privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+				return nil, nil, err
+			}
+			defer privateUpperRoot.DecRef()
+			fsopts.UpperRoot = privateUpperRoot
+		}
+		lowerPathnamesStr, ok := mopts["lowerdir"]
+		if !ok {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+			return nil, nil, syserror.EINVAL
+		}
+		delete(mopts, "lowerdir")
+		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
+		const maxLowerLayers = 500 // Linux: fs/overlayfs/super.c:OVL_MAX_STACK
+		if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+			return nil, nil, syserror.EINVAL
+		}
+		if len(lowerPathnames) > maxLowerLayers {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+			return nil, nil, syserror.EINVAL
+		}
+		for _, lowerPathname := range lowerPathnames {
+			lowerPath := fspath.Parse(lowerPathname)
+			if !lowerPath.Absolute {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+				return nil, nil, syserror.EINVAL
+			}
+			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+				Root:               vfsroot,
+				Start:              vfsroot,
+				Path:               lowerPath,
+				FollowFinalSymlink: true,
+			}, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+				return nil, nil, err
+			}
+			defer lowerRoot.DecRef()
+			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+				return nil, nil, err
+			}
+			defer privateLowerRoot.DecRef()
+			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
+		}
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Allocate device numbers.
+	dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	lowerDevMinors := make(map[*vfs.Filesystem]uint32)
+	for _, lowerRoot := range fsopts.LowerRoots {
+		lowerFS := lowerRoot.Mount().Filesystem()
+		if _, ok := lowerDevMinors[lowerFS]; !ok {
+			devMinor, err := vfsObj.GetAnonBlockDevMinor()
+			if err != nil {
+				vfsObj.PutAnonBlockDevMinor(dirDevMinor)
+				for _, lowerDevMinor := range lowerDevMinors {
+					vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+				}
+				return nil, nil, err
+			}
+			lowerDevMinors[lowerFS] = devMinor
+		}
+	}
+
+	// Take extra references held by the filesystem.
+	if fsopts.UpperRoot.Ok() {
+		fsopts.UpperRoot.IncRef()
+	}
+	for _, lowerRoot := range fsopts.LowerRoots {
+		lowerRoot.IncRef()
+	}
+
+	fs := &filesystem{
+		opts:           fsopts,
+		creds:          creds.Fork(),
+		dirDevMinor:    dirDevMinor,
+		lowerDevMinors: lowerDevMinors,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry. root.refs stays 0 until construction succeeds,
+	// since destroyLocked() on the error paths below panics otherwise.
+	root := fs.newDentry()
+	if fs.opts.UpperRoot.Ok() {
+		fs.opts.UpperRoot.IncRef()
+		root.copiedUp = 1
+		root.upperVD = fs.opts.UpperRoot
+	}
+	for _, lowerRoot := range fs.opts.LowerRoots {
+		lowerRoot.IncRef()
+		root.lowerVDs = append(root.lowerVDs, lowerRoot)
+	}
+	rootTopVD := root.topLayer()
+	// Get metadata from the topmost layer. See fs.lookupLocked().
+	const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  rootTopVD,
+		Start: rootTopVD,
+	}, &vfs.StatOptions{
+		Mask: rootStatMask,
+	})
+	if err != nil {
+		root.destroyLocked()
+		fs.vfsfs.DecRef()
+		return nil, nil, err
+	}
+	if rootStat.Mask&rootStatMask != rootStatMask {
+		root.destroyLocked()
+		fs.vfsfs.DecRef()
+		return nil, nil, syserror.EREMOTE
+	}
+	if isWhiteout(&rootStat) {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+		root.destroyLocked()
+		fs.vfsfs.DecRef()
+		return nil, nil, syserror.EINVAL
+	}
+	root.mode = uint32(rootStat.Mode)
+	root.uid = rootStat.UID
+	root.gid = rootStat.GID
+	if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
+		root.devMajor = linux.UNNAMED_MAJOR
+		root.devMinor = fs.dirDevMinor
+		root.ino = fs.newDirIno()
+	} else if !root.upperVD.Ok() {
+		root.devMajor = linux.UNNAMED_MAJOR
+		root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()]
+		root.ino = rootStat.Ino
+	} else {
+		root.devMajor = rootStat.DevMajor
+		root.devMinor = rootStat.DevMinor
+		root.ino = rootStat.Ino
+	}
+	root.refs = 1 // The reference returned to the caller.
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// clonePrivateMount creates a non-recursive bind mount rooted at vd, not
+// associated with any MountNamespace, and returns the root of the new mount.
+// This keeps each overlay layer within a single mount, so a layer can't cross
+// into e.g. the overlay filesystem itself and risk lock recursion. The caller
+// must DecRef the result. NOTE(review): no reference is taken on vd.Dentry()
+// here; confirm NewDisconnectedMount's reference covers the returned value.
+func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
+	oldmnt := vd.Mount()
+	opts := oldmnt.Options()
+	if forceReadOnly {
+		opts.ReadOnly = true
+	}
+	newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts)
+	if err != nil {
+		return vfs.VirtualDentry{}, err
+	}
+	return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
+	for _, lowerDevMinor := range fs.lowerDevMinors {
+		vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+	}
+	if fs.opts.UpperRoot.Ok() {
+		fs.opts.UpperRoot.DecRef()
+	}
+	for _, lowerRoot := range fs.opts.LowerRoots {
+		lowerRoot.DecRef()
+	}
+}
+
+// statFS returns the statfs result for the root of the topmost layer, with
+// Type replaced by the overlayfs magic number.
+func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
+	// Always statfs the root of the topmost layer. Compare Linux's
+	// fs/overlayfs/super.c:ovl_statfs().
+	topRoot := fs.opts.UpperRoot
+	if !topRoot.Ok() {
+		// No upper layer, so the first lower layer is topmost.
+		topRoot = fs.opts.LowerRoots[0]
+	}
+	pop := vfs.PathOperation{Root: topRoot, Start: topRoot}
+	fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &pop)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// Callers see an overlay filesystem regardless of the layer's own type.
+	fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
+	return fsstat, nil
+}
+
+func (fs *filesystem) newDirIno() uint64 {
+	return atomic.AddUint64(&fs.lastDirIno, 1)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// mode, uid, and gid are the file mode, owner, and group of the file in
+	// the topmost layer (and therefore the overlay file as well), and are used
+	// for permission checks on this dentry. These fields are protected by
+	// copyMu and accessed using atomic memory operations.
+	mode uint32
+	uid  uint32
+	gid  uint32
+
+	// copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
+	// 0 otherwise. copiedUp is accessed using atomic memory operations.
+	copiedUp uint32
+
+	// parent is the dentry corresponding to this dentry's parent directory.
+	// name is this dentry's name in parent. If this dentry is a filesystem
+	// root, parent is nil and name is the empty string. parent and name are
+	// protected by fs.renameMu.
+	parent *dentry
+	name   string
+
+	// If this dentry represents a directory, children maps the names of
+	// children for which dentries have been instantiated to those dentries,
+	// and dirents (if not nil) is a cache of dirents as returned by
+	// directoryFDs representing this directory. children is protected by
+	// dirMu.
+	dirMu    sync.Mutex
+	children map[string]*dentry
+	dirents  []vfs.Dirent
+
+	// upperVD and lowerVDs are the files from the overlay filesystem's layers
+	// that comprise the file on the overlay filesystem.
+	//
+	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
+	// be copied up) with copyMu locked for writing; otherwise, it is
+	// immutable. lowerVDs is always immutable.
+	copyMu   sync.RWMutex
+	upperVD  vfs.VirtualDentry
+	lowerVDs []vfs.VirtualDentry
+
+	// inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
+	// len(inlineLowerVDs).
+	inlineLowerVDs [1]vfs.VirtualDentry
+
+	// devMajor, devMinor, and ino are the device major/minor and inode numbers
+	// used by this dentry. These fields are protected by copyMu and accessed
+	// using atomic memory operations.
+	devMajor uint32
+	devMinor uint32
+	ino      uint64
+
+	locks vfs.FileLocks
+}
+
+// newDentry returns a new dentry owned by fs. The dentry starts out with no
+// references and no layers: the caller is responsible for setting its
+// reference count and/or calling dentry.destroyLocked() as appropriate, and
+// for setting its layers.
+func (fs *filesystem) newDentry() *dentry {
+	d := &dentry{fs: fs}
+	// lowerVDs starts as an empty slice backed by inlineLowerVDs, so appends
+	// use the inline array until its capacity is exceeded.
+	d.lowerVDs = d.inlineLowerVDs[:0]
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+	// d.checkDropLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	// Keep retrying the compare-and-swap for as long as the count observed is
+	// positive. A count of 0 or less (destroyLocked stores -1) means that no
+	// reference may be taken here.
+	for cur := atomic.LoadInt64(&d.refs); cur > 0; cur = atomic.LoadInt64(&d.refs) {
+		if atomic.CompareAndSwapInt64(&d.refs, cur, cur+1) {
+			return true
+		}
+	}
+	return false
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef() {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked()
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("overlay.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkDropLocked() {
+	// Dentries with a positive reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.) Dentries with a
+	// negative reference count have already been destroyed.
+	if atomic.LoadInt64(&d.refs) != 0 {
+		return
+	}
+	// Refs is still zero, and cannot change while renameMu is held for
+	// writing; destroy d.
+	d.destroyLocked()
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked() {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("overlay.dentry.destroyLocked() called with references on the dentry")
+	}
+
+	if d.upperVD.Ok() {
+		d.upperVD.DecRef()
+	}
+	for _, lowerVD := range d.lowerVDs {
+		lowerVD.DecRef()
+	}
+
+	if d.parent != nil {
+		d.parent.dirMu.Lock()
+		if !d.vfsd.IsDead() {
+			delete(d.parent.children, d.name)
+		}
+		d.parent.dirMu.Unlock()
+		// Drop the reference held by d on its parent without recursively
+		// locking d.fs.renameMu.
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkDropLocked()
+		} else if refs < 0 {
+			panic("overlay.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {
+	// TODO(gvisor.dev/issue/1479): Implement inotify.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	// TODO(gvisor.dev/issue/1479): Implement inotify.
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) OnZeroWatches() {}
+
+// iterLayers invokes yield on each layer comprising d, from top to bottom. If
+// any call to yield returns false, iterLayers stops iteration.
+func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
+	if d.isCopiedUp() {
+		if !yield(d.upperVD, true) {
+			return
+		}
+	}
+	for _, lowerVD := range d.lowerVDs {
+		if !yield(lowerVD, false) {
+			return
+		}
+	}
+}
+
+func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
+	if d.isCopiedUp() {
+		return d.upperVD, true
+	}
+	return d.lowerVDs[0], false
+}
+
+func (d *dentry) topLayer() vfs.VirtualDentry {
+	vd, _ := d.topLayerInfo()
+	return vd
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// statInternalMask is the set of stat fields that is set by
+// dentry.statInternalTo().
+const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+
+// statInternalTo writes fields to stat that are stored in d, and therefore do
+// not require invoking StatAt on the overlay's layers.
+func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
+	stat.Mask |= statInternalMask
+	if d.isDir() {
+		// Linux sets nlink to 1 for merged directories
+		// (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
+		// correct more often ("." and the directory's entry in its parent),
+		// and some of our tests expect this.
+		stat.Nlink = 2
+	}
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.Ino = atomic.LoadUint64(&d.ino)
+	stat.DevMajor = atomic.LoadUint32(&d.devMajor)
+	stat.DevMinor = atomic.LoadUint32(&d.devMinor)
+}
+
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
+	if opts.Stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT))
+	}
+	if opts.Stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, opts.Stat.UID)
+	}
+	if opts.Stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, opts.Stat.GID)
+	}
+}
+
+// fileDescription is embedded by overlay implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index cab771211..811f80a5f 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -81,7 +81,8 @@ type inode struct {
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoopRefCount
 
-	pipe *pipe.VFSPipe
+	locks vfs.FileLocks
+	pipe  *pipe.VFSPipe
 
 	ino uint64
 	uid auth.KUID
@@ -114,7 +115,7 @@ func (i *inode) Mode() linux.FileMode {
 }
 
 // Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
 	return linux.Statx{
 		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
@@ -147,7 +148,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 
 // Open implements kernfs.Inode.Open.
 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
+	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
 }
 
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 17c1342b5..6014138ff 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -22,6 +22,7 @@ go_library(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 36a911db4..79c2725f3 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -37,6 +37,8 @@ type subtasksInode struct {
 	kernfs.OrderedChildren
 	kernfs.AlwaysValid
 
+	locks vfs.FileLocks
+
 	fs                *filesystem
 	task              *kernel.Task
 	pidns             *kernel.PIDNamespace
@@ -126,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac
 	return fd.GenericDirectoryFD.IterDirents(ctx, cb)
 }
 
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	if fd.task.ExitState() >= kernel.TaskExitZombie {
 		return 0, syserror.ENOENT
@@ -153,7 +155,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 // Open implements kernfs.Inode.
 func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
-	if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
 		return nil, err
 	}
 	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
@@ -163,8 +165,8 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v
 }
 
 // Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 482055db1..a5c7aa470 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -38,6 +38,8 @@ type taskInode struct {
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
 
+	locks vfs.FileLocks
+
 	task *kernel.Task
 }
 
@@ -103,7 +105,7 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 
 // Open implements kernfs.Inode.
 func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -154,8 +156,8 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
 }
 
 // Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	stat, err := i.Inode.Stat(fs, opts)
+func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.Inode.Stat(ctx, fs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 44ccc9e4a..fea29e5f0 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -53,6 +53,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool {
 }
 
 type fdDir struct {
+	locks vfs.FileLocks
+
 	fs   *filesystem
 	task *kernel.Task
 
@@ -62,7 +64,7 @@ type fdDir struct {
 }
 
 // IterDirents implements kernfs.inodeDynamicLookup.
-func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) {
+func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	var fds []int32
 	i.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
@@ -70,7 +72,6 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs
 		}
 	})
 
-	offset := absOffset + relOffset
 	typ := uint8(linux.DT_REG)
 	if i.produceSymlink {
 		typ = linux.DT_LNK
@@ -143,7 +144,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 
 // Open implements kernfs.Inode.
 func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -270,7 +271,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 
 // Open implements kernfs.Inode.
 func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 2f297e48a..859b7d727 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -34,6 +35,10 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// "There is an (arbitrary) limit on the number of lines in the file. As at
+// Linux 3.18, the limit is five lines." - user_namespaces(7)
+const maxIDMapLines = 5
+
 // mm gets the kernel task's MemoryManager. No additional reference is taken on
 // mm here. This is safe because MemoryManager.destroy is required to leave the
 // MemoryManager in a state where it's still usable as a DynamicBytesSource.
@@ -226,8 +231,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 		// Linux will return envp up to and including the first NULL character,
 		// so find it.
-		if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 {
-			buf.Truncate(end)
+		envStart := int(ar.Length())
+		if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
+			buf.Truncate(envStart + nullIdx)
 		}
 	}
 
@@ -282,7 +288,8 @@ func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}.
+// idMapData implements vfs.WritableDynamicBytesSource for
+// /proc/[pid]/{gid_map|uid_map}.
 //
 // +stateify savable
 type idMapData struct {
@@ -294,7 +301,7 @@ type idMapData struct {
 
 var _ dynamicInode = (*idMapData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.Generate.
+// Generate implements vfs.WritableDynamicBytesSource.Generate.
 func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	var entries []auth.IDMapEntry
 	if d.gids {
@@ -308,6 +315,60 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	// "In addition, the number of bytes written to the file must be less than
+	// the system page size, and the write must be performed at the start of
+	// the file ..." - user_namespaces(7)
+	srclen := src.NumBytes()
+	if offset != 0 || srclen >= usermem.PageSize {
+		return 0, syserror.EINVAL
+	}
+	data := make([]byte, srclen)
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+
+	// Only the bytes before the first NUL byte, if any, are parsed.
+	if nul := bytes.IndexByte(data, 0); nul != -1 {
+		data = data[:nul]
+	}
+	// A single trailing newline does not introduce another line.
+	data = bytes.TrimSuffix(data, []byte("\n"))
+	// Splitting into at most maxIDMapLines+1 pieces is enough to detect an
+	// input with too many lines.
+	lines := bytes.SplitN(data, []byte("\n"), maxIDMapLines+1)
+	if len(lines) > maxIDMapLines {
+		return 0, syserror.EINVAL
+	}
+
+	// Scan each line into one entry; any scan failure rejects the whole write.
+	entries := make([]auth.IDMapEntry, 0, len(lines))
+	for _, line := range lines {
+		var e auth.IDMapEntry
+		if _, err := fmt.Sscan(string(line), &e.FirstID, &e.FirstParentID, &e.Length); err != nil {
+			return 0, syserror.EINVAL
+		}
+		entries = append(entries, e)
+	}
+
+	// Install the entries as the GID or UID map of the task's user namespace.
+	userns := d.task.UserNamespace()
+	var err error
+	if d.gids {
+		err = userns.SetGIDMap(ctx, entries)
+	} else {
+		err = userns.SetUIDMap(ctx, entries)
+	}
+	if err != nil {
+		return 0, err
+	}
+
+	// On success, Linux's kernel/user_namespace.c:map_write() always returns
+	// count, even if fewer bytes were used.
+	return srclen, nil
+}
+
 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
 //
 // +stateify savable
@@ -775,6 +836,8 @@ type namespaceInode struct {
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
 }
 
 var _ kernfs.Inode = (*namespaceInode)(nil)
@@ -791,6 +854,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &namespaceFD{inode: i}
 	i.IncRef()
+	fd.LockFD.Init(&i.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
@@ -801,6 +865,7 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
 // /proc/[pid]/ns/*.
 type namespaceFD struct {
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	vfsfd vfs.FileDescription
 	inode *namespaceInode
@@ -811,7 +876,7 @@ var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
 // Stat implements FileDescriptionImpl.
 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(vfs, opts)
+	return fd.inode.Stat(ctx, vfs, opts)
 }
 
 // SetStat implements FileDescriptionImpl.
@@ -826,7 +891,12 @@ func (fd *namespaceFD) Release() {
 	fd.inode.DecRef()
 }
 
-// OnClose implements FileDescriptionImpl.
-func (*namespaceFD) OnClose(context.Context) error {
-	return nil
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b51d43954..6d2b90a8b 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -43,6 +43,8 @@ type tasksInode struct {
 	kernfs.OrderedChildren
 	kernfs.AlwaysValid
 
+	locks vfs.FileLocks
+
 	fs    *filesystem
 	pidns *kernel.PIDNamespace
 
@@ -197,15 +199,15 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 
 // Open implements kernfs.Inode.
 func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index d29ef3f83..242ba9b5d 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -31,6 +31,7 @@ type SignalFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// target is the original signal target task.
 	//
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index a741e2bb6..1b548ccd4 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -29,6 +29,6 @@ go_test(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
-        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0af373604..01ce30a4d 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -98,8 +98,10 @@ type dir struct {
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
-
 	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
 	dentry kernfs.Dentry
 }
 
@@ -121,7 +123,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
 
 // Open implements kernfs.Inode.Open.
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -136,7 +138,7 @@ type cpuFile struct {
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "0-%d", c.maxCores-1)
+	fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 4b3602d47..242d5fd12 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) {
 	k := kernel.KernelFromContext(s.Ctx)
 	maxCPUCores := k.ApplicationCores()
 
-	expected := fmt.Sprintf("0-%d", maxCPUCores-1)
+	expected := fmt.Sprintf("0-%d\n", maxCPUCores-1)
 
 	for _, fname := range []string{"online", "possible", "present"} {
 		pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 0e4053a46..400a97996 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -32,6 +32,6 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/usermem",
-        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index c16a36cdb..e743e8114 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("creating platform: %v", err)
 	}
 
+	kernel.VFS2Enabled = true
 	k := &kernel.Kernel{
 		Platform: plat,
 	}
@@ -73,7 +74,7 @@ func Boot() (*kernel.Kernel, error) {
 	k.SetMemoryFile(mf)
 
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
-	kernel.VFS2Enabled = true
-
-	if err := k.VFS().Init(); err != nil {
-		return nil, fmt.Errorf("VFS init: %v", err)
-	}
 	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 		AllowUserList:  true,
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 60c92d626..2dc90d484 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -32,6 +32,7 @@ type TimerFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	events waiter.Queue
 	timer  *ktime.Timer
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 007be1572..e73732a6b 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -59,9 +59,9 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
-        "//pkg/sentry/vfs/lock",
         "//pkg/sentry/vfs/memxattr",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 83bf885ee..ac54d420d 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -29,7 +29,7 @@ type deviceFile struct {
 	minor uint32
 }
 
-func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
 	file := &deviceFile{
 		kind:  kind,
 		major: major,
@@ -43,7 +43,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
 	default:
 		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
 	}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index f2399981b..0a1ad4765 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -48,9 +48,9 @@ type directory struct {
 	childList dentryList
 }
 
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
+	dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
 	dir.dentry.inode = &dir.inode
 	dir.dentry.vfsd.Init(&dir.dentry)
@@ -81,6 +81,10 @@ func (dir *directory) removeChildLocked(child *dentry) {
 	dir.iterMu.Unlock()
 }
 
+func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
+	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
+}
+
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
@@ -106,6 +110,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	fs := fd.filesystem()
 	dir := fd.inode().impl.(*directory)
 
+	defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+
 	// fs.mu is required to read d.parent and dentry.name.
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 80fa7b29d..ef210a69b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -79,7 +79,7 @@ afterSymlink:
 	}
 	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// Symlink traversal updates access time.
-		atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds())
+		child.inode.touchAtime(rp.Mount())
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
 		}
@@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if err := create(parentDir, name); err != nil {
 		return err
 	}
+
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 	parentDir.inode.touchCMtime()
 	return nil
 }
@@ -231,17 +237,22 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			return syserror.EXDEV
 		}
 		d := vd.Dentry().Impl().(*dentry)
-		if d.inode.isDir() {
+		i := d.inode
+		if i.isDir() {
 			return syserror.EPERM
 		}
-		if d.inode.nlink == 0 {
+		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+			return err
+		}
+		if i.nlink == 0 {
 			return syserror.ENOENT
 		}
-		if d.inode.nlink == maxLinks {
+		if i.nlink == maxLinks {
 			return syserror.EMLINK
 		}
-		d.inode.incLinksLocked()
-		parentDir.insertChildLocked(fs.newDentry(d.inode), name)
+		i.incLinksLocked()
+		i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
+		parentDir.insertChildLocked(fs.newDentry(i), name)
 		return nil
 	})
 }
@@ -249,11 +260,12 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		if parentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
 		parentDir.inode.incLinksLocked() // from child's ".."
-		childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		parentDir.insertChildLocked(&childDir.dentry, name)
 		return nil
 	})
@@ -262,18 +274,19 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		var childInode *inode
 		switch opts.Mode.FileType() {
-		case 0, linux.S_IFREG:
-			childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
+		case linux.S_IFREG:
+			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFIFO:
-			childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFBLK:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFCHR:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFSOCK:
-			childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
+			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
 		default:
 			return syserror.EINVAL
 		}
@@ -348,15 +361,20 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
 		parentDir.insertChildLocked(child, name)
 		fd, err := child.open(ctx, rp, &opts, true)
 		if err != nil {
 			return nil, err
 		}
+		parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
 		parentDir.inode.touchCMtime()
 		return fd, nil
 	}
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
 	// Is the file mounted over?
 	if err := rp.CheckMount(&child.vfsd); err != nil {
 		return nil, err
@@ -364,7 +382,7 @@ afterTrailingSymlink:
 	// Do we need to resolve a trailing symlink?
 	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// Symlink traversal updates access time.
-		atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds())
+		child.inode.touchAtime(rp.Mount())
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
 		}
@@ -388,10 +406,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		fd.LockFD.Init(&d.inode.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
 			return nil, err
 		}
-		if opts.Flags&linux.O_TRUNC != 0 {
+		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
 			if _, err := impl.truncate(0); err != nil {
 				return nil, err
 			}
@@ -403,15 +422,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		fd.LockFD.Init(&d.inode.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
 	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
+		// TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
 	case *socketFile:
@@ -472,6 +492,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if !ok {
 		return syserror.ENOENT
 	}
+	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
+		return err
+	}
 	// Note that we don't need to call rp.CheckMount(), since if renamed is a
 	// mount point then we want to rename the mount point, not anything in the
 	// mounted filesystem.
@@ -559,6 +582,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		newParentDir.inode.touchCMtime()
 	}
 	renamed.inode.touchCtime()
+
+	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
 	return nil
 }
 
@@ -584,6 +609,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if !ok {
 		return syserror.ENOENT
 	}
+	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+		return err
+	}
 	childDir, ok := child.inode.impl.(*directory)
 	if !ok {
 		return syserror.ENOTDIR
@@ -603,8 +631,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	parentDir.removeChildLocked(child)
-	parentDir.inode.decLinksLocked() // from child's ".."
+	parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+	// Remove links for child, child/., and child/..
+	child.inode.decLinksLocked()
 	child.inode.decLinksLocked()
+	parentDir.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(&child.vfsd)
 	parentDir.inode.touchCMtime()
 	return nil
@@ -613,12 +644,21 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	d, err := resolveLocked(rp)
 	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
+		fs.mu.RUnlock()
 		return err
 	}
-	return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
+	fs.mu.RUnlock()
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -656,7 +696,8 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
-		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
 		parentDir.insertChildLocked(child, name)
 		return nil
 	})
@@ -681,6 +722,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if !ok {
 		return syserror.ENOENT
 	}
+	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+		return err
+	}
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
@@ -698,6 +742,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
+
+	// Generate inotify events. Note that this must take place before the link
+	// count of the child is decremented, or else the watches may be dropped
+	// before these events are added.
+	vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+
 	parentDir.removeChildLocked(child)
 	child.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(&child.vfsd)
@@ -749,23 +799,37 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	d, err := resolveLocked(rp)
 	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+		fs.mu.RUnlock()
 		return err
 	}
-	return d.inode.setxattr(rp.Credentials(), &opts)
+	fs.mu.RUnlock()
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	d, err := resolveLocked(rp)
 	if err != nil {
+		fs.mu.RUnlock()
 		return err
 	}
-	return d.inode.removexattr(rp.Credentials(), name)
+	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	fs.mu.RUnlock()
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 8d77b3fa8..739350cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -30,9 +30,9 @@ type namedPipe struct {
 // Preconditions:
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 3f433d666..abbaa5d60 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -85,12 +84,12 @@ type regularFile struct {
 	size uint64
 }
 
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &regularFile{
 		memFile: fs.memFile,
 		seals:   linux.F_SEAL_SEAL,
 	}
-	file.inode.init(file, fs, creds, linux.S_IFREG|mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
@@ -275,11 +274,35 @@ func (fd *regularFileFD) Release() {
 	// noop
 }
 
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	f := fd.inode().impl.(*regularFile)
+
+	f.inode.mu.Lock()
+	defer f.inode.mu.Unlock()
+	oldSize := f.size
+	size := offset + length
+	if oldSize >= size {
+		return nil
+	}
+	_, err := f.truncateLocked(size)
+	return err
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
+
+	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+	// all state is in-memory.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
 	if dst.NumBytes() == 0 {
 		return 0, nil
 	}
@@ -302,40 +325,60 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
 	if offset < 0 {
-		return 0, syserror.EINVAL
+		return 0, offset, syserror.EINVAL
 	}
+
+	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+	// all state is in-memory.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
 	srclen := src.NumBytes()
 	if srclen == 0 {
-		return 0, nil
+		return 0, offset, nil
 	}
 	f := fd.inode().impl.(*regularFile)
+	f.inode.mu.Lock()
+	defer f.inode.mu.Unlock()
+	// If the file is opened with O_APPEND, update offset to file size.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking f.inode.mu is sufficient for reading f.size.
+		offset = int64(f.size)
+	}
 	if end := offset + srclen; end < offset {
 		// Overflow.
-		return 0, syserror.EFBIG
+		return 0, offset, syserror.EINVAL
 	}
 
-	var err error
 	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
 	if err != nil {
-		return 0, err
+		return 0, offset, err
 	}
 	src = src.TakeFirst64(srclen)
 
-	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
-	fd.inode().touchCMtimeLocked()
-	f.inode.mu.Unlock()
+	f.inode.touchCMtimeLocked()
 	putRegularFileReadWriter(rw)
-	return n, err
+	return n, n + offset, err
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	fd.offMu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
 	fd.offMu.Unlock()
 	return n, err
 }
@@ -361,33 +404,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	return offset, nil
 }
 
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return nil
-}
-
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
-func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
-	return fd.inode().lockBSD(uid, t, block)
-}
-
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
-func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
-	fd.inode().unlockBSD(uid)
-	return nil
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
-	return fd.inode().lockPOSIX(uid, t, rng, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
-	fd.inode().unlockPOSIX(uid, rng)
-	return nil
-}
-
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
 func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	file := fd.inode().impl.(*regularFile)
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 64e1c40ad..146c7fdfe 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -138,48 +138,37 @@ func TestLocks(t *testing.T) {
 	}
 	defer cleanup()
 
-	var (
-		uid1 lock.UniqueID
-		uid2 lock.UniqueID
-		// Non-blocking.
-		block lock.Blocker
-	)
-
-	uid1 = 123
-	uid2 = 456
-
-	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil {
+	uid1 := 123
+	uid2 := 456
+	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil {
 		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
 	}
-	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil {
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil {
 		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
 	}
-	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want {
+	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want {
 		t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
 	}
 	if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
 		t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
 	}
-	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil {
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil {
 		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
 	}
 
-	rng1 := lock.LockRange{0, 1}
-	rng2 := lock.LockRange{1, 2}
-
-	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil {
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil {
 		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
 	}
-	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil {
+	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil {
 		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
 	}
-	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil {
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil {
 		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
 	}
-	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want {
+	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want {
 		t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
 	}
-	if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil {
+	if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil {
 		t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
 	}
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 25c2321af..3ed650474 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -26,9 +26,9 @@ type socketFile struct {
 	ep    transport.BoundEndpoint
 }
 
-func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
 	file := &socketFile{ep: ep}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 47e075ed4..b0de5fabe 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -24,11 +24,11 @@ type symlink struct {
 	target string // immutable
 }
 
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
+	link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 1e781aecd..2545d88e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,6 +30,7 @@ package tmpfs
 import (
 	"fmt"
 	"math"
+	"strconv"
 	"strings"
 	"sync/atomic"
 
@@ -40,7 +41,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -112,6 +112,58 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	rootMode := linux.FileMode(0777)
+	if rootFileType == linux.S_IFDIR {
+		rootMode = 01777
+	}
+	modeStr, ok := mopts["mode"]
+	if ok {
+		delete(mopts, "mode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		rootMode = linux.FileMode(mode & 07777)
+	}
+	rootKUID := creds.EffectiveKUID
+	uidStr, ok := mopts["uid"]
+	if ok {
+		delete(mopts, "uid")
+		uid, err := strconv.ParseUint(uidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+		if !kuid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKUID = kuid
+	}
+	rootKGID := creds.EffectiveKGID
+	gidStr, ok := mopts["gid"]
+	if ok {
+		delete(mopts, "gid")
+		gid, err := strconv.ParseUint(gidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+		if !kgid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKGID = kgid
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
@@ -127,11 +179,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	var root *dentry
 	switch rootFileType {
 	case linux.S_IFREG:
-		root = fs.newDentry(fs.newRegularFile(creds, 0777))
+		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
 	case linux.S_IFLNK:
-		root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
+		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
 	case linux.S_IFDIR:
-		root = &fs.newDirectory(creds, 01777).dentry
+		root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
 	default:
 		fs.vfsfs.DecRef()
 		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
@@ -201,6 +253,33 @@ func (d *dentry) DecRef() {
 	d.inode.decRef()
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {
+	if d.inode.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+	// that d was deleted.
+	deleted := d.vfsd.IsDead()
+
+	d.inode.fs.mu.RLock()
+	// The ordering below is important: Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted)
+	}
+	d.inode.watches.Notify("", events, cookie, et, deleted)
+	d.inode.fs.mu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.inode.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches() {}
+
 // inode represents a filesystem object.
 type inode struct {
 	// fs is the owning filesystem. fs is immutable.
@@ -209,11 +288,9 @@ type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory
 	// operations.
 	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
+	// A reference is held on all inodes as long as they are reachable in the
+	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+	// nlink reaches 0.
 	refs int64
 
 	// xattrs implements extended attributes.
@@ -235,23 +312,25 @@ type inode struct {
 	ctime int64 // nanoseconds
 	mtime int64 // nanoseconds
 
-	// Advisory file locks, which lock at the inode level.
-	locks lock.FileLocks
+	locks vfs.FileLocks
+
+	// Inotify watches for this inode.
+	watches vfs.Watches
 
 	impl interface{} // immutable
 }
 
 const maxLinks = math.MaxUint32
 
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
 	if mode.FileType() == 0 {
 		panic("file type is required in FileMode")
 	}
 	i.fs = fs
 	i.refs = 1
 	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
+	i.uid = uint32(kuid)
+	i.gid = uint32(kgid)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
 	// Tmpfs creation sets atime, ctime, and mtime to current time.
 	now := fs.clock.Now().Nanoseconds()
@@ -276,14 +355,17 @@ func (i *inode) incLinksLocked() {
 	atomic.AddUint32(&i.nlink, 1)
 }
 
-// decLinksLocked decrements i's link count.
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
 //
 // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
 	}
-	atomic.AddUint32(&i.nlink, ^uint32(0))
+	if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+		i.decRef()
+	}
 }
 
 func (i *inode) incRef() {
@@ -306,6 +388,7 @@ func (i *inode) tryIncRef() bool {
 
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		i.watches.HandleDeletion()
 		if regFile, ok := i.impl.(*regularFile); ok {
 			// Release memory used by regFile to store data. Since regFile is
 			// no longer usable, we don't need to grab any locks or update any
@@ -369,7 +452,8 @@ func (i *inode) statTo(stat *linux.Statx) {
 	}
 }
 
-func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error {
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
+	stat := &opts.Stat
 	if stat.Mask == 0 {
 		return nil
 	}
@@ -377,7 +461,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 		return syserror.EPERM
 	}
 	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
-	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
 		return err
 	}
 	i.mu.Lock()
@@ -455,44 +539,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	return nil
 }
 
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		return i.locks.LockBSD(uid, t, block)
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockBSD(uid fslock.UniqueID) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		i.locks.UnlockBSD(uid)
-		return nil
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		return i.locks.LockPOSIX(uid, t, rng, block)
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		i.locks.UnlockPOSIX(uid, rng)
-		return nil
-	}
-	return syserror.EBADF
-}
-
 // allocatedBlocksForSize returns the number of 512B blocks needed to
 // accommodate the given size in bytes, as appropriate for struct
 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
@@ -531,6 +577,9 @@ func (i *inode) isDir() bool {
 }
 
 func (i *inode) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
@@ -621,14 +670,19 @@ func (i *inode) userXattrSupported() bool {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
 	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.Dentry().Impl().(*dentry).inode
+	return fd.dentry().inode
 }
 
 // Stat implements vfs.FileDescriptionImpl.Stat.
@@ -641,7 +695,15 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	return fd.inode().setStat(ctx, creds, &opts.Stat)
+	d := fd.dentry()
+	if err := d.inode.setStat(ctx, creds, &opts); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -656,12 +718,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+	d := fd.dentry()
+	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+	d := fd.dentry()
+	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // NewMemfd creates a new tmpfs regular file and file description that can back
@@ -674,8 +750,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s
 
 	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
 	// S_IRWXUGO.
-	mode := linux.FileMode(0777)
-	inode := fs.newRegularFile(creds, mode)
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
 	rf := inode.impl.(*regularFile)
 	if allowSeals {
 		rf.seals = 0
@@ -688,9 +763,26 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s
 	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
 	// FMODE_READ | FMODE_WRITE.
 	var fd regularFileFD
+	fd.Init(&inode.locks)
 	flags := uint32(linux.O_RDWR)
 	if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
+// filesystem state is in-memory.
+func (*fileDescription) Sync(context.Context) error {
+	return nil
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a28eab8b8..f6886a758 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -85,6 +85,7 @@ go_library(
     name = "kernel",
     srcs = [
         "abstract_socket_namespace.go",
+        "aio.go",
         "context.go",
         "fd_table.go",
         "fd_table_unsafe.go",
@@ -131,6 +132,7 @@ go_library(
         "task_stop.go",
         "task_syscall.go",
         "task_usermem.go",
+        "task_work.go",
         "thread_group.go",
         "threads.go",
         "timekeeper.go",
@@ -199,6 +201,7 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/state",
         "//pkg/state/statefile",
+        "//pkg/state/wire",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go
new file mode 100644
index 000000000..0ac78c0b8
--- /dev/null
+++ b/pkg/sentry/kernel/aio.go
@@ -0,0 +1,81 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// AIOCallback is a function that does asynchronous I/O on behalf of a task.
+type AIOCallback func(context.Context)
+
+// QueueAIO queues an AIOCallback which will be run asynchronously.
+func (t *Task) QueueAIO(cb AIOCallback) {
+	ctx := taskAsyncContext{t: t}
+	wg := &t.TaskSet().aioGoroutines
+	wg.Add(1)
+	go func() {
+		cb(ctx)
+		wg.Done()
+	}()
+}
+
+type taskAsyncContext struct {
+	context.NoopSleeper
+	t *Task
+}
+
+// Debugf implements log.Logger.Debugf.
+func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+	ctx.t.Debugf(format, v...)
+}
+
+// Infof implements log.Logger.Infof.
+func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+	ctx.t.Infof(format, v...)
+}
+
+// Warningf implements log.Logger.Warningf.
+func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+	ctx.t.Warningf(format, v...)
+}
+
+// IsLogging implements log.Logger.IsLogging.
+func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+	return ctx.t.IsLogging(level)
+}
+
+// Deadline implements context.Context.Deadline.
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
+	return ctx.t.Deadline()
+}
+
+// Done implements context.Context.Done.
+func (ctx taskAsyncContext) Done() <-chan struct{} {
+	return ctx.t.Done()
+}
+
+// Err implements context.Context.Err.
+func (ctx taskAsyncContext) Err() error {
+	return ctx.t.Err()
+}
+
+// Value implements context.Context.Value.
+func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+	return ctx.t.Value(key)
+}
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
index e057d2c6d..6862f2ef5 100644
--- a/pkg/sentry/kernel/auth/credentials.go
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
 	}
 	return NoID, syserror.EPERM
 }
+
+// SetUID translates the provided uid to the root user namespace and updates c's
+// uids to it. This performs no permissions or capabilities checks; the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetUID(uid UID) error {
+	kuid := c.UserNamespace.MapToKUID(uid)
+	if !kuid.Ok() {
+		return syserror.EINVAL
+	}
+	c.RealKUID = kuid
+	c.EffectiveKUID = kuid
+	c.SavedKUID = kuid
+	return nil
+}
+
+// SetGID translates the provided gid to the root user namespace and updates c's
+// gids to it. This performs no permissions or capabilities checks; the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetGID(gid GID) error {
+	kgid := c.UserNamespace.MapToKGID(gid)
+	if !kgid.Ok() {
+		return syserror.EINVAL
+	}
+	c.RealKGID = kgid
+	c.EffectiveKGID = kgid
+	c.SavedKGID = kgid
+	return nil
+}
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index 0c40bf315..dd5f0f5fa 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -18,7 +18,6 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 )
 
 // contextID is the kernel package's type for context.Context.Value keys.
@@ -113,55 +112,3 @@ func (*Task) Done() <-chan struct{} {
 func (*Task) Err() error {
 	return nil
 }
-
-// AsyncContext returns a context.Context that may be used by goroutines that
-// do work on behalf of t and therefore share its contextual values, but are
-// not t's task goroutine (e.g. asynchronous I/O).
-func (t *Task) AsyncContext() context.Context {
-	return taskAsyncContext{t: t}
-}
-
-type taskAsyncContext struct {
-	context.NoopSleeper
-	t *Task
-}
-
-// Debugf implements log.Logger.Debugf.
-func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
-	ctx.t.Debugf(format, v...)
-}
-
-// Infof implements log.Logger.Infof.
-func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
-	ctx.t.Infof(format, v...)
-}
-
-// Warningf implements log.Logger.Warningf.
-func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
-	ctx.t.Warningf(format, v...)
-}
-
-// IsLogging implements log.Logger.IsLogging.
-func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
-	return ctx.t.IsLogging(level)
-}
-
-// Deadline implements context.Context.Deadline.
-func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
-	return ctx.t.Deadline()
-}
-
-// Done implements context.Context.Done.
-func (ctx taskAsyncContext) Done() <-chan struct{} {
-	return ctx.t.Done()
-}
-
-// Err implements context.Context.Err.
-func (ctx taskAsyncContext) Err() error {
-	return ctx.t.Err()
-}
-
-// Value implements context.Context.Value.
-func (ctx taskAsyncContext) Value(key interface{}) interface{} {
-	return ctx.t.Value(key)
-}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 3d78cd48f..4c0f1e41f 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -107,7 +107,7 @@ type EventPoll struct {
 	// different lock to avoid circular lock acquisition order involving
 	// the wait queue mutexes and mu. The full order is mu, observed file
 	// wait queue mutex, then listsMu; this allows listsMu to be acquired
-	// when readyCallback is called.
+	// when (*pollEntry).Callback is called.
 	//
 	// An entry is always in one of the following lists:
 	//	readyList -- when there's a chance that it's ready to have
@@ -116,7 +116,7 @@ type EventPoll struct {
 	//		readEvents() functions always call the entry's file
 	//		Readiness() function to confirm it's ready.
 	//	waitingList -- when there's no chance that the entry is ready,
-	//		so it's waiting for the readyCallback to be called
+	//		so it's waiting for the (*pollEntry).Callback to be called
 	//		on it before it gets moved to the readyList.
 	//	disabledList -- when the entry is disabled. This happens when
 	//		a one-shot entry gets delivered via readEvents().
@@ -269,21 +269,19 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
 	return ret
 }
 
-// readyCallback is called when one of the files we're polling becomes ready. It
-// moves said file to the readyList if it's currently in the waiting list.
-type readyCallback struct{}
-
 // Callback implements waiter.EntryCallback.Callback.
-func (*readyCallback) Callback(w *waiter.Entry) {
-	entry := w.Context.(*pollEntry)
-	e := entry.epoll
+//
+// Callback is called when one of the files we're polling becomes ready. It
+// moves said file to the readyList if it's currently in the waiting list.
+func (p *pollEntry) Callback(*waiter.Entry) {
+	e := p.epoll
 
 	e.listsMu.Lock()
 
-	if entry.curList == &e.waitingList {
-		e.waitingList.Remove(entry)
-		e.readyList.PushBack(entry)
-		entry.curList = &e.readyList
+	if p.curList == &e.waitingList {
+		e.waitingList.Remove(p)
+		e.readyList.PushBack(p)
+		p.curList = &e.readyList
 		e.listsMu.Unlock()
 
 		e.Notify(waiter.EventIn)
@@ -310,7 +308,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
 	// Check if the file happens to already be in a ready state.
 	ready := f.Readiness(entry.mask) & entry.mask
 	if ready != 0 {
-		(*readyCallback).Callback(nil, &entry.waiter)
+		entry.Callback(&entry.waiter)
 	}
 }
 
@@ -380,10 +378,9 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev
 		userData: data,
 		epoll:    e,
 		flags:    flags,
-		waiter:   waiter.Entry{Callback: &readyCallback{}},
 		mask:     mask,
 	}
-	entry.waiter.Context = entry
+	entry.waiter.Callback = entry
 	e.files[id] = entry
 	entry.file = refs.NewWeakRef(id.File, entry)
 
@@ -406,7 +403,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter
 	}
 
 	// Unregister the old mask and remove entry from the list it's in, so
-	// readyCallback is guaranteed to not be called on this entry anymore.
+	// (*pollEntry).Callback is guaranteed to not be called on this entry anymore.
 	entry.id.File.EventUnregister(&entry.waiter)
 
 	// Remove entry from whatever list it's in. This ensure that no other
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index 8e9f200d0..7c61e0258 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -21,8 +21,7 @@ import (
 
 // afterLoad is invoked by stateify.
 func (p *pollEntry) afterLoad() {
-	p.waiter = waiter.Entry{Callback: &readyCallback{}}
-	p.waiter.Context = p
+	p.waiter.Callback = p
 	p.file = refs.NewWeakRef(p.id.File, p)
 	p.id.File.EventRegister(&p.waiter, p.mask)
 }
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index b9126e946..2b3955598 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -11,6 +11,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index d32c3e90a..153d2cd9b 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -20,15 +20,21 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// New creates a new FileAsync.
+// New creates a new fs.FileAsync.
 func New() fs.FileAsync {
 	return &FileAsync{}
 }
 
+// NewVFS2 creates a new vfs.FileAsync.
+func NewVFS2() vfs.FileAsync {
+	return &FileAsync{}
+}
+
 // FileAsync sends signals when the registered file is ready for IO.
 //
 // +stateify savable
@@ -170,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern
 	a.recipientTG = nil
 	a.recipientPG = recipient
 }
+
+// ClearOwner unsets the current signal recipient.
+func (a *FileAsync) ClearOwner() {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.requester = nil
+	a.recipientT = nil
+	a.recipientTG = nil
+	a.recipientPG = nil
+}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ed40b5303..4b7d234a4 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -29,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // FDFlags define flags for an individual descriptor.
@@ -80,9 +81,6 @@ type FDTable struct {
 	refs.AtomicRefCount
 	k *Kernel
 
-	// uid is a unique identifier.
-	uid uint64
-
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
 
@@ -130,7 +128,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 // drop drops the table reference.
 func (f *FDTable) drop(file *fs.File) {
 	// Release locks.
-	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
 
 	// Send inotify events.
 	d := file.Dirent
@@ -151,24 +149,27 @@ func (f *FDTable) drop(file *fs.File) {
 
 // dropVFS2 drops the table reference.
 func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
-	// TODO(gvisor.dev/issue/1480): Release locks.
-	// TODO(gvisor.dev/issue/1479): Send inotify events.
+	// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
+	// entire file.
+	err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET)
+	if err != nil && err != syserror.ENOLCK {
+		panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+	}
 
-	// Drop the table reference.
-	file.DecRef()
-}
+	// Generate inotify events.
+	ev := uint32(linux.IN_CLOSE_NOWRITE)
+	if file.IsWritable() {
+		ev = linux.IN_CLOSE_WRITE
+	}
+	file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)
 
-// ID returns a unique identifier for this FDTable.
-func (f *FDTable) ID() uint64 {
-	return f.uid
+	// Drop the table's reference.
+	file.DecRef()
 }
 
 // NewFDTable allocates a new FDTable that may be used by tasks in k.
 func (k *Kernel) NewFDTable() *FDTable {
-	f := &FDTable{
-		k:   k,
-		uid: atomic.AddUint64(&k.fdMapUids, 1),
-	}
+	f := &FDTable{k: k}
 	f.init()
 	return f
 }
@@ -463,6 +464,29 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
 	return nil
 }
 
+// SetFlagsVFS2 sets the flags for the given file descriptor.
+//
+// It returns EBADF if fd is negative or if no file is installed at fd.
+func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+	if fd < 0 {
+		// Don't accept negative FDs.
+		return syscall.EBADF
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	file, _, _ := f.getVFS2(fd)
+	if file == nil {
+		// No file found.
+		return syscall.EBADF
+	}
+
+	// Update the flags.
+	f.setVFS2(fd, file, flags)
+	return nil
+}
+
 // Get returns a reference to the file and the flags for the FD or nil if no
 // file is defined for the given fd.
 //
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 732e66da4..bcc1b29a8 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -717,10 +717,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3
 	}
 }
 
-// UnlockPI unlock the futex following the Priority-inheritance futex
-// rules. The address provided must contain the caller's TID. If there are
-// waiters, TID of the next waiter (FIFO) is set to the given address, and the
-// waiter woken up. If there are no waiters, 0 is set to the address.
+// UnlockPI unlocks the futex following the Priority-inheritance futex rules.
+// The address provided must contain the caller's TID. If there are waiters,
+// TID of the next waiter (FIFO) is set to the given address, and the waiter
+// woken up. If there are no waiters, 0 is set to the address.
 func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
 	k, err := getKey(t, addr, private)
 	if err != nil {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 5efeb3767..15dae0f5b 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -34,7 +34,6 @@ package kernel
 import (
 	"errors"
 	"fmt"
-	"io"
 	"path/filepath"
 	"sync/atomic"
 	"time"
@@ -73,6 +72,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/wire"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
@@ -81,6 +81,10 @@ import (
 // easy access everywhere. To be removed once VFS2 becomes the default.
 var VFS2Enabled = false
 
+// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow
+// easy access everywhere. To be removed once FUSE is completed.
+var FUSEEnabled = false
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -194,11 +198,6 @@ type Kernel struct {
 	// cpuClockTickerSetting is protected by runningTasksMu.
 	cpuClockTickerSetting ktime.Setting
 
-	// fdMapUids is an ever-increasing counter for generating FDTable uids.
-	//
-	// fdMapUids is mutable, and is accessed using atomic memory operations.
-	fdMapUids uint64
-
 	// uniqueID is used to generate unique identifiers.
 	//
 	// uniqueID is mutable, and is accessed using atomic memory operations.
@@ -422,7 +421,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 // SaveTo saves the state of k to w.
 //
 // Preconditions: The kernel must be paused throughout the call to SaveTo.
-func (k *Kernel) SaveTo(w io.Writer) error {
+func (k *Kernel) SaveTo(w wire.Writer) error {
 	saveStart := time.Now()
 	ctx := k.SupervisorContext()
 
@@ -457,9 +456,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
 		return err
 	}
 
-	// Ensure that all pending asynchronous work is complete:
-	//   - inode and mount release
-	//   - asynchronuous IO
+	// Ensure that all inode and mount release operations have completed.
 	fs.AsyncBarrier()
 
 	// Once all fs work has completed (flushed references have all been released),
@@ -480,18 +477,18 @@ func (k *Kernel) SaveTo(w io.Writer) error {
 	//
 	// N.B. This will also be saved along with the full kernel save below.
 	cpuidStart := time.Now()
-	if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil {
+	if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil {
 		return err
 	}
 	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
 
 	// Save the kernel state.
 	kernelStart := time.Now()
-	var stats state.Stats
-	if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil {
+	stats, err := state.Save(k.SupervisorContext(), w, k)
+	if err != nil {
 		return err
 	}
-	log.Infof("Kernel save stats: %s", &stats)
+	log.Infof("Kernel save stats: %s", stats.String())
 	log.Infof("Kernel save took [%s].", time.Since(kernelStart))
 
 	// Save the memory file's state.
@@ -636,7 +633,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 }
 
 // LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
+func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
 	initAppCores := k.applicationCores
@@ -647,7 +644,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	// don't need to explicitly install it in the Kernel.
 	cpuidStart := time.Now()
 	var features cpuid.FeatureSet
-	if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil {
+	if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil {
 		return err
 	}
 	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -662,11 +659,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 
 	// Load the kernel state.
 	kernelStart := time.Now()
-	var stats state.Stats
-	if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil {
+	stats, err := state.Load(k.SupervisorContext(), r, k)
+	if err != nil {
 		return err
 	}
-	log.Infof("Kernel load stats: %s", &stats)
+	log.Infof("Kernel load stats: %s", stats.String())
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
 	// rootNetworkNamespace should be populated after loading the state file.
@@ -897,7 +894,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		if mntnsVFS2 == nil {
 			// MountNamespaceVFS2 adds a reference to the namespace, which is
 			// transferred to the new process.
-			mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+			mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
 		}
 		// Get the root directory from the MountNamespace.
 		root := args.MountNamespaceVFS2.Root()
@@ -1254,13 +1251,15 @@ func (k *Kernel) Kill(es ExitStatus) {
 }
 
 // Pause requests that all tasks in k temporarily stop executing, and blocks
-// until all tasks in k have stopped. Multiple calls to Pause nest and require
-// an equal number of calls to Unpause to resume execution.
+// until all tasks and asynchronous I/O operations in k have stopped. Multiple
+// calls to Pause nest and require an equal number of calls to Unpause to
+// resume execution.
 func (k *Kernel) Pause() {
 	k.extMu.Lock()
 	k.tasks.BeginExternalStop()
 	k.extMu.Unlock()
 	k.tasks.runningGoroutines.Wait()
+	k.tasks.aioGoroutines.Wait()
 }
 
 // Unpause ends the effect of a previous call to Pause. If Unpause is called
@@ -1470,6 +1469,11 @@ func (k *Kernel) NowMonotonic() int64 {
 	return now
 }
 
+// AfterFunc implements tcpip.Clock.AfterFunc.
+func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer {
+	return ktime.TcpipAfterFunc(k.realtimeClock, d, f)
+}
+
 // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
 // LoadFrom.
 func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index f29dc0472..449643118 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -8,6 +8,7 @@ go_library(
         "device.go",
         "node.go",
         "pipe.go",
+        "pipe_unsafe.go",
         "pipe_util.go",
         "reader.go",
         "reader_writer.go",
@@ -20,10 +21,12 @@ go_library(
         "//pkg/amutex",
         "//pkg/buffer",
         "//pkg/context",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 62c8691f1..79645d7d2 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.readLocked(ctx, ops)
+}
 
+func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
 	// Is the pipe empty?
 	if p.view.Size() == 0 {
 		if !p.HasWriters() {
@@ -246,7 +249,10 @@ type writeOps struct {
 func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.writeLocked(ctx, ops)
+}
 
+func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
 	// Can't write to a pipe with no readers.
 	if !p.HasReaders() {
 		return 0, syscall.EPIPE
diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go
new file mode 100644
index 000000000..dd60cba24
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+	"unsafe"
+)
+
+// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be
+// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that
+// concurrent calls cannot deadlock.
+//
+// Preconditions: x != y.
+func lockTwoPipes(x, y *Pipe) {
+	// Lock the two pipes in order of increasing address.
+	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
+		x.mu.Lock()
+		y.mu.Lock()
+	} else {
+		y.mu.Lock()
+		x.mu.Lock()
+	}
+}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index b54f08a30..45d4c5fc1 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,8 +16,11 @@ package pipe
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -59,11 +62,13 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
 //
 // Preconditions: statusFlags should not contain an open access mode.
 func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
-	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+	// Connected pipes share the same locks.
+	locks := &vfs.FileLocks{}
+	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
 }
 
 // Open opens the pipe represented by vp.
-func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
@@ -73,7 +78,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 		return nil, syserror.EINVAL
 	}
 
-	fd := vp.newFD(mnt, vfsd, statusFlags)
+	fd := vp.newFD(mnt, vfsd, statusFlags, locks)
 
 	// Named pipes have special blocking semantics during open:
 	//
@@ -125,10 +130,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription {
 	fd := &VFSPipeFD{
 		pipe: &vp.pipe,
 	}
+	fd.LockFD.Init(locks)
 	fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
 		DenyPWrite:        true,
@@ -150,11 +156,14 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *
 	return &fd.vfsfd
 }
 
-// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
+// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
+// other FileDescriptions for splice(2) and tee(2).
 type VFSPipeFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	pipe *Pipe
 }
@@ -191,6 +200,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
 	}
 }
 
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.ESPIPE
+}
+
 // EventRegister implements waiter.Waitable.EventRegister.
 func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	fd.pipe.EventRegister(e, mask)
@@ -229,3 +243,226 @@ func (fd *VFSPipeFD) PipeSize() int64 {
 func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
 	return fd.pipe.SetFifoSize(size)
 }
+
+// IOSequence returns a usermem.IOSequence that reads up to count bytes from,
+// or writes up to count bytes to, fd.
+func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
+	return usermem.IOSequence{
+		IO:    fd,
+		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+	}
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+	origCount := int64(len(dst))
+	n, err := fd.pipe.read(ctx, readOps{
+		left: func() int64 {
+			return int64(len(dst))
+		},
+		limit: func(l int64) {
+			dst = dst[:l]
+		},
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadAt(dst, 0)
+			view.TrimFront(int64(n))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
+	}
+	if err == nil && n != origCount {
+		return int(n), syserror.ErrWouldBlock
+	}
+	return int(n), err
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+	origCount := int64(len(src))
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return int64(len(src))
+		},
+		limit: func(l int64) {
+			src = src[:l]
+		},
+		write: func(view *buffer.View) (int64, error) {
+			view.Append(src)
+			return int64(len(src)), nil
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return int(n), syserror.ErrWouldBlock
+	}
+	return int(n), err
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+	origCount := toZero
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return toZero
+		},
+		limit: func(l int64) {
+			toZero = l
+		},
+		write: func(view *buffer.View) (int64, error) {
+			view.Grow(view.Size()+toZero, true /* zero */)
+			return toZero, nil
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+	count := ars.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	origCount := count
+	n, err := fd.pipe.read(ctx, readOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToSafememWriter(dst, uint64(count))
+			view.TrimFront(int64(n))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+	count := ars.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	origCount := count
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromSafememReader(src, uint64(count))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+	// How did a pipe get passed as the virtual address space to futex(2)?
+	panic("VFSPipeFD.SwapUint32 called unexpectedly")
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+	panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+	panic("VFSPipeFD.LoadUint32 called unexpectedly")
+}
+
+// Splice reads up to count bytes from src and writes them to dst. It returns
+// the number of bytes moved.
+//
+// Preconditions: count > 0.
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+	return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
+}
+
+// Tee reads up to count bytes from src and writes them to dst, without
+// removing the read bytes from src. It returns the number of bytes copied.
+//
+// Preconditions: count > 0.
+func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+	return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
+}
+
+// Preconditions: count > 0.
+func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
+	if dst.pipe == src.pipe {
+		return 0, syserror.EINVAL
+	}
+
+	lockTwoPipes(dst.pipe, src.pipe)
+	defer dst.pipe.mu.Unlock()
+	defer src.pipe.mu.Unlock()
+
+	n, err := dst.pipe.writeLocked(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(dstView *buffer.View) (int64, error) {
+			return src.pipe.readLocked(ctx, readOps{
+				left: func() int64 {
+					return count
+				},
+				limit: func(l int64) {
+					count = l
+				},
+				read: func(srcView *buffer.View) (int64, error) {
+					n, err := srcView.ReadToSafememWriter(dstView, uint64(count))
+					if n > 0 && removeFromSrc {
+						srcView.TrimFront(int64(n))
+					}
+					return int64(n), err
+				},
+			})
+		},
+	})
+	if n > 0 {
+		dst.pipe.Notify(waiter.EventIn)
+		src.pipe.Notify(waiter.EventOut)
+	}
+	return n, err
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index bfd779837..c211fc8d0 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -20,7 +20,6 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
-        "//pkg/sentry/platform",
         "//pkg/sentry/usage",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index f66cfcc7f..55b4c2cdb 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -45,7 +45,6 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -370,7 +369,7 @@ type Shm struct {
 
 	// fr is the offset into mfp.MemoryFile() that backs this contents of this
 	// segment. Immutable.
-	fr platform.FileRange
+	fr memmap.FileRange
 
 	// mu protects all fields below.
 	mu sync.Mutex `state:"nosave"`
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 4607cde2f..a83ce219c 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -98,6 +98,15 @@ func (s *syslog) Log() []byte {
 		s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...)
 	}
 
+	if VFS2Enabled {
+		time += rand.Float64() / 2
+		s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...)
+		if FUSEEnabled {
+			time += rand.Float64() / 2
+			s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...)
+		}
+	}
+
 	time += rand.Float64() / 2
 	s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...)
 
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f48247c94..c4db05bd8 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -68,6 +68,21 @@ type Task struct {
 	// runState is exclusive to the task goroutine.
 	runState taskRunState
 
+	// taskWorkCount represents the current size of the task work queue. It is
+	// used to avoid acquiring taskWorkMu when the queue is empty.
+	//
+	// Must be accessed with atomic memory operations.
+	taskWorkCount int32
+
+	// taskWorkMu protects taskWork.
+	taskWorkMu sync.Mutex `state:"nosave"`
+
+	// taskWork is a queue of work to be executed before resuming user execution.
+	// It is similar to the task_work mechanism in Linux.
+	//
+	// taskWork is exclusive to the task goroutine.
+	taskWork []TaskWorker
+
 	// haveSyscallReturn is true if tc.Arch().Return() represents a value
 	// returned by a syscall (or set by ptrace after a syscall).
 	//
@@ -550,6 +565,10 @@ type Task struct {
 	// futexWaiter is exclusive to the task goroutine.
 	futexWaiter *futex.Waiter `state:"nosave"`
 
+	// robustList is a pointer to the head of the task's robust futex
+	// list.
+	robustList usermem.Addr
+
 	// startTime is the real time at which the task started. It is set when
 	// a Task is created or invokes execve(2).
 	//
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 00c425cca..7803b98d0 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -198,11 +198,18 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	t.tg.pidns.owner.mu.Unlock()
 
+	oldFDTable := t.fdTable
+	t.fdTable = t.fdTable.Fork()
+	oldFDTable.DecRef()
+
 	// Remove FDs with the CloseOnExec flag set.
 	t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
 		return flags.CloseOnExec
 	})
 
+	// Handle the robust futex list.
+	t.exitRobustList()
+
 	// NOTE(b/30815691): We currently do not implement privileged
 	// executables (set-user/group-ID bits and file capabilities). This
 	// allows us to unconditionally enable user dumpability on the new mm.
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c4ade6e8e..231ac548a 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -253,6 +253,9 @@ func (*runExitMain) execute(t *Task) taskRunState {
 		}
 	}
 
+	// Handle the robust futex list.
+	t.exitRobustList()
+
 	// Deactivate the address space and update max RSS before releasing the
 	// task's MM.
 	t.Deactivate()
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index a53e77c9f..4b535c949 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -15,6 +15,7 @@
 package kernel
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -52,3 +53,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
 func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
 	return t.MemoryManager().GetSharedFutexKey(t, addr)
 }
+
+// GetRobustList returns the address of the task's robust futex list.
+func (t *Task) GetRobustList() usermem.Addr {
+	t.mu.Lock()
+	addr := t.robustList
+	t.mu.Unlock()
+	return addr
+}
+
+// SetRobustList sets the robust futex list for the task.
+func (t *Task) SetRobustList(addr usermem.Addr) {
+	t.mu.Lock()
+	t.robustList = addr
+	t.mu.Unlock()
+}
+
+// exitRobustList walks the robust futex list, marking locks dead and waking
+// waiters. It corresponds to Linux's exit_robust_list(). Following Linux,
+// errors are silently ignored.
+func (t *Task) exitRobustList() {
+	t.mu.Lock()
+	addr := t.robustList
+	t.robustList = 0
+	t.mu.Unlock()
+
+	if addr == 0 {
+		return
+	}
+
+	var rl linux.RobustListHead
+	if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil {
+		return
+	}
+
+	next := rl.List
+	done := 0
+	var pendingLockAddr usermem.Addr
+	if rl.ListOpPending != 0 {
+		pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset)
+	}
+
+	// Wake up normal elements.
+	for usermem.Addr(next) != addr {
+		// We traverse to the next element of the list before we
+		// actually wake anything. This prevents the race where waking
+		// this futex causes a modification of the list.
+		thisLockAddr := usermem.Addr(next + rl.FutexOffset)
+
+		// Try to decode the next element in the list before waking the
+		// current futex. But don't check the error until after we've
+		// woken the current futex. Linux does it in this order too.
+		_, nextErr := t.CopyIn(usermem.Addr(next), &next)
+
+		// Wake up the current futex if it's not pending.
+		if thisLockAddr != pendingLockAddr {
+			t.wakeRobustListOne(thisLockAddr)
+		}
+
+		// If there was an error copying the next futex, we must bail.
+		if nextErr != nil {
+			break
+		}
+
+		// This is a user structure, so it could be a massive list, or
+		// even contain a loop if they are trying to mess with us. We
+		// cap traversal to prevent that.
+		done++
+		if done >= linux.ROBUST_LIST_LIMIT {
+			break
+		}
+	}
+
+	// Is there a pending entry to wake?
+	if pendingLockAddr != 0 {
+		t.wakeRobustListOne(pendingLockAddr)
+	}
+}
+
+// wakeRobustListOne wakes a single futex from the robust list.
+func (t *Task) wakeRobustListOne(addr usermem.Addr) {
+	// Bit 0 in address signals PI futex.
+	pi := addr&1 == 1
+	addr = addr &^ 1
+
+	// Load the futex.
+	f, err := t.LoadUint32(addr)
+	if err != nil {
+		// Can't read this single value? Ignore the problem.
+		// We can wake the other futexes in the list.
+		return
+	}
+
+	tid := uint32(t.ThreadID())
+	for {
+		// Is this held by someone else?
+		if f&linux.FUTEX_TID_MASK != tid {
+			return
+		}
+
+		// This thread is dying and it's holding this futex. We need to
+		// set the owner died bit and wake up any waiters.
+		newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED
+		if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil {
+			return
+		} else if curF != f {
+			// Futex changed out from under us. Try again...
+			f = curF
+			continue
+		}
+
+		// Wake waiters if there are any.
+		if f&linux.FUTEX_WAITERS != 0 {
+			private := f&linux.FUTEX_PRIVATE_FLAG != 0
+			if pi {
+				t.Futex().UnlockPI(t, addr, tid, private)
+				return
+			}
+			t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1)
+		}
+
+		// Done.
+		return
+	}
+}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index d654dd997..7d4f44caf 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -167,7 +167,22 @@ func (app *runApp) execute(t *Task) taskRunState {
 		return (*runInterrupt)(nil)
 	}
 
-	// We're about to switch to the application again. If there's still a
+	// Execute any task work callbacks before returning to user space.
+	if atomic.LoadInt32(&t.taskWorkCount) > 0 {
+		t.taskWorkMu.Lock()
+		queue := t.taskWork
+		t.taskWork = nil
+		atomic.StoreInt32(&t.taskWorkCount, 0)
+		t.taskWorkMu.Unlock()
+
+		// Do not hold taskWorkMu while executing task work, which may register
+		// more work.
+		for _, work := range queue {
+			work.TaskWork(t)
+		}
+	}
+
+	// We're about to switch to the application again. If there's still an
 	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
 	// restart the syscall that was interrupted. If there's a saved signal
 	// mask, restore it. (Note that restoring the saved signal mask may unblock
diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go
new file mode 100644
index 000000000..dda5a433a
--- /dev/null
+++ b/pkg/sentry/kernel/task_work.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync/atomic"
+
+// TaskWorker is a deferred task.
+//
+// This must be savable.
+type TaskWorker interface {
+	// TaskWork will be executed prior to returning to user space. Note that
+	// TaskWork may call RegisterWork again, but this will not be executed until
+	// the next return to user space, unlike in Linux. This effectively allows
+	// registration of indefinite user return hooks, but not by default.
+	TaskWork(t *Task)
+}
+
+// RegisterWork can be used to register additional task work that will be
+// performed prior to returning to user space. See TaskWorker.TaskWork for
+// semantics regarding registration.
+func (t *Task) RegisterWork(work TaskWorker) {
+	t.taskWorkMu.Lock()
+	defer t.taskWorkMu.Unlock()
+	atomic.AddInt32(&t.taskWorkCount, 1)
+	t.taskWork = append(t.taskWork, work)
+}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 52849f5b3..4dfd2c990 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -366,7 +366,8 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
 	// terminal is stolen, and all processes that had it as controlling
 	// terminal lose it." - tty_ioctl(4)
 	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
-		if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+		// Stealing requires CAP_SYS_ADMIN in the root user namespace.
+		if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 {
 			return syserror.EPERM
 		}
 		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index bf2dabb6e..872e1a82d 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -87,6 +87,13 @@ type TaskSet struct {
 	// at time of save (but note that this is not necessarily the same thing as
 	// sync.WaitGroup's zero value).
 	runningGoroutines sync.WaitGroup `state:"nosave"`
+
+	// aioGoroutines is the number of goroutines running async I/O
+	// callbacks.
+	//
+	// aioGoroutines is not saved but is required to be zero at the time of
+	// save.
+	aioGoroutines sync.WaitGroup `state:"nosave"`
 }
 
 // newTaskSet returns a new, empty TaskSet.
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 7ba7dc50c..2817aa3ba 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "time",
     srcs = [
         "context.go",
+        "tcpip.go",
         "time.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go
new file mode 100644
index 000000000..c4474c0cf
--- /dev/null
+++ b/pkg/sentry/kernel/time/tcpip.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+	"sync"
+	"time"
+)
+
+// TcpipAfterFunc waits for duration to elapse according to clock then runs fn.
+// The timer is started immediately and will fire exactly once.
+func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer {
+	timer := &TcpipTimer{
+		clock: clock,
+	}
+	timer.notifier = functionNotifier{
+		fn: func() {
+			// tcpip.Timer.Stop() explicitly states that the function is called in a
+			// separate goroutine that Stop() does not synchronize with.
+			// Timer.Destroy() synchronizes with calls to TimerListener.Notify().
+			// This is semantically meaningful because, in the former case, it's
+			// legal to call tcpip.Timer.Stop() while holding locks that may also be
+			// taken by the function, but this isn't so in the latter case. Most
+			// immediately, Timer calls TimerListener.Notify() while holding
+			// Timer.mu. A deadlock occurs without spawning a goroutine:
+			//   T1: (Timer expires)
+			//     => Timer.Tick()           <- Timer.mu.Lock() called
+			//     => TimerListener.Notify()
+			//     => Timer.Stop()
+			//     => Timer.Destroy()        <- Timer.mu.Lock() called, deadlock!
+			//
+			// Spawning a goroutine avoids the deadlock:
+			//   T1: (Timer expires)
+			//     => Timer.Tick()           <- Timer.mu.Lock() called
+			//     => TimerListener.Notify() <- Launches T2
+			//   T2:
+			//     => Timer.Stop()
+			//     => Timer.Destroy()        <- Timer.mu.Lock() called, blocks
+			//   T1:
+			//     => (returns)              <- Timer.mu.Unlock() called
+			//   T2:
+			//     => (continues)            <- No deadlock!
+			go func() {
+				timer.Stop()
+				fn()
+			}()
+		},
+	}
+	timer.Reset(duration)
+	return timer
+}
+
+// TcpipTimer is a resettable timer with variable duration expirations.
+// Implements tcpip.Timer, which does not define a Destroy method; instead, all
+// resources are released after timer expiration and calls to Timer.Stop.
+//
+// Must be created by TcpipAfterFunc.
+type TcpipTimer struct {
+	// clock is the time source. clock is immutable.
+	clock Clock
+
+	// notifier is called when the Timer expires. notifier is immutable.
+	notifier functionNotifier
+
+	// mu protects t.
+	mu sync.Mutex
+
+	// t stores the latest running Timer. This is replaced whenever Reset is
+	// called since Timer cannot be restarted once it has been Destroyed by Stop.
+	//
+	// This field is nil iff Stop was called and Reset has not been called since.
+	t *Timer
+}
+
+// Stop implements tcpip.Timer.Stop.
+func (r *TcpipTimer) Stop() bool {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.t == nil {
+		return false
+	}
+	_, lastSetting := r.t.Swap(Setting{})
+	r.t.Destroy()
+	r.t = nil
+	return lastSetting.Enabled
+}
+
+// Reset implements tcpip.Timer.Reset.
+func (r *TcpipTimer) Reset(d time.Duration) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.t == nil {
+		r.t = NewTimer(r.clock, &r.notifier)
+	}
+
+	r.t.Swap(Setting{
+		Enabled: true,
+		Period:  0,
+		Next:    r.clock.Now().Add(d),
+	})
+}
+
+// functionNotifier is a TimerListener that runs a function.
+//
+// functionNotifier cannot be saved or loaded.
+type functionNotifier struct {
+	fn func()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) {
+	f.fn()
+	return Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (f *functionNotifier) Destroy() {}
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index da0ea7bb5..7c4fefb16 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -21,8 +21,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/log"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sync"
 )
@@ -90,7 +90,7 @@ type Timekeeper struct {
 // NewTimekeeper does not take ownership of paramPage.
 //
 // SetClocks must be called on the returned Timekeeper before it is usable.
-func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) {
+func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) {
 	return &Timekeeper{
 		params: NewVDSOParamPage(mfp, paramPage),
 	}, nil
@@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() {
 	timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
 	t.wg.Add(1)
 	go func() { // S/R-SAFE: stopped during save.
+		defer t.wg.Done()
 		for {
 			// Start with an update immediately, so the clocks are
 			// ready ASAP.
@@ -209,9 +210,6 @@ func (t *Timekeeper) startUpdater() {
 					p.realtimeBaseRef = int64(realtimeParams.BaseRef)
 					p.realtimeFrequency = realtimeParams.Frequency
 				}
-
-				log.Debugf("Updating VDSO parameters: %+v", p)
-
 				return p
 			}); err != nil {
 				log.Warningf("Unable to update VDSO parameter page: %v", err)
@@ -220,7 +218,6 @@ func (t *Timekeeper) startUpdater() {
 			select {
 			case <-timer.C:
 			case <-t.stop:
-				t.wg.Done()
 				return
 			}
 		}
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index f1b3c212c..290c32466 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -19,8 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -58,7 +58,7 @@ type vdsoParams struct {
 type VDSOParamPage struct {
 	// The parameter page is fr, allocated from mfp.MemoryFile().
 	mfp pgalloc.MemoryFileProvider
-	fr  platform.FileRange
+	fr  memmap.FileRange
 
 	// seq is the current sequence count written to the page.
 	//
@@ -81,7 +81,7 @@ type VDSOParamPage struct {
 // * VDSOParamPage must be the only writer to fr.
 //
 // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
-func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage {
+func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
 	return &VDSOParamPage{mfp: mfp, fr: fr}
 }
 
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index c6aa65f28..34bdb0b69 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -30,9 +30,6 @@ go_library(
         "//pkg/rand",
         "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/fs",
-        "//pkg/sentry/fs/anon",
-        "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
@@ -45,6 +42,5 @@ go_library(
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
-        "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 616fafa2c..ddeaff3db 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -90,14 +90,23 @@ type elfInfo struct {
 	sharedObject bool
 }
 
+// fullReader interface extracts the ReadFull method from fsbridge.File so that
+// client code does not need to define an entire fsbridge.File when only read
+// functionality is needed.
+//
+// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap
+// vfs.FileDescription's PRead/Read instead.
+type fullReader interface {
+	// ReadFull is the same as fsbridge.File.ReadFull.
+	ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+}
+
 // parseHeader parse the ELF header, verifying that this is a supported ELF
 // file and returning the ELF program headers.
 //
 // This is similar to elf.NewFile, except that it is more strict about what it
 // accepts from the ELF, and it doesn't parse unnecessary parts of the file.
-//
-// ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
 	// Check ident first; it will tell us the endianness of the rest of the
 	// structs.
 	var ident [elf.EI_NIDENT]byte
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 88449fe95..986c7fb4d 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -27,7 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -80,22 +79,6 @@ type LoadArgs struct {
 	Features *cpuid.FeatureSet
 }
 
-// readFull behaves like io.ReadFull for an *fs.File.
-func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	var total int64
-	for dst.NumBytes() > 0 {
-		n, err := f.Preadv(ctx, dst, offset+total)
-		total += n
-		if err == io.EOF && total != 0 {
-			return total, io.ErrUnexpectedEOF
-		} else if err != nil {
-			return total, err
-		}
-		dst = dst.DropFirst64(n)
-	}
-	return total, nil
-}
-
 // openPath opens args.Filename and checks that it is valid for loading.
 //
 // openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not
@@ -238,14 +221,14 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	// Load the executable itself.
 	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
 	if err != nil {
-		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
 	}
 	defer file.DecRef()
 
 	// Load the VDSO.
 	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
 	if err != nil {
-		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
 	}
 
 	// Setup the heap. brk starts at the next page after the end of the
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 00977fc08..05a294fe6 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -26,10 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
-	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -37,7 +33,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 const vdsoPrelink = 0xffffffffff700000
@@ -55,52 +50,11 @@ func (f *fileContext) Value(key interface{}) interface{} {
 	}
 }
 
-// byteReader implements fs.FileOperations for reading from a []byte source.
-type byteReader struct {
-	fsutil.FileNoFsync              `state:"nosave"`
-	fsutil.FileNoIoctl              `state:"nosave"`
-	fsutil.FileNoMMap               `state:"nosave"`
-	fsutil.FileNoSplice             `state:"nosave"`
-	fsutil.FileNoopFlush            `state:"nosave"`
-	fsutil.FileNoopRelease          `state:"nosave"`
-	fsutil.FileNotDirReaddir        `state:"nosave"`
-	fsutil.FilePipeSeek             `state:"nosave"`
-	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-	waiter.AlwaysReady              `state:"nosave"`
-
+type byteFullReader struct {
 	data []byte
 }
 
-var _ fs.FileOperations = (*byteReader)(nil)
-
-// newByteReaderFile creates a fake file to read data from.
-//
-// TODO(gvisor.dev/issue/1623): Convert to VFS2.
-func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
-	// Create a fake inode.
-	inode := fs.NewInode(
-		ctx,
-		&fsutil.SimpleFileInode{},
-		fs.NewPseudoMountSource(ctx),
-		fs.StableAttr{
-			Type:      fs.Anonymous,
-			DeviceID:  anon.PseudoDevice.DeviceID(),
-			InodeID:   anon.PseudoDevice.NextIno(),
-			BlockSize: usermem.PageSize,
-		})
-
-	// Use the fake inode to create a fake dirent.
-	dirent := fs.NewTransientDirent(inode)
-	defer dirent.DecRef()
-
-	// Use the fake dirent to make a fake file.
-	flags := fs.FileFlags{Read: true, Pread: true}
-	return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
-		data: data,
-	})
-}
-
-func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
@@ -111,10 +65,6 @@ func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequ
 	return int64(n), err
 }
 
-func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
-	panic("Write not supported")
-}
-
 // validateVDSO checks that the VDSO can be loaded by loadVDSO.
 //
 // VDSOs are special (see below). Since we are going to map the VDSO directly
@@ -130,7 +80,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
 // * PT_LOAD segments don't extend beyond the end of the file.
 //
 // ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		log.Infof("Unable to parse VDSO header: %v", err)
@@ -248,13 +198,12 @@ func getSymbolValueFromVDSO(symbol string) (uint64, error) {
 
 // PrepareVDSO validates the system VDSO and returns a VDSO, containing the
 // param page for updating by the kernel.
-func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
-	vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
+func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
+	vdsoFile := &byteFullReader{data: vdsoBin}
 
 	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
 	// nil context can be passed.
 	info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
-	vdsoFile.DecRef()
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index a98b66de1..2c95669cd 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -28,9 +28,21 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "file_range",
+    out = "file_range.go",
+    package = "memmap",
+    prefix = "File",
+    template = "//pkg/segment:generic_range",
+    types = {
+        "T": "uint64",
+    },
+)
+
 go_library(
     name = "memmap",
     srcs = [
+        "file_range.go",
         "mappable_range.go",
         "mapping_set.go",
         "mapping_set_impl.go",
@@ -40,7 +52,7 @@ go_library(
     deps = [
         "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/platform",
+        "//pkg/safemem",
         "//pkg/syserror",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index c6db9fc8f..c188f6c29 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -19,12 +19,12 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Mappable represents a memory-mappable object, a mutable mapping from uint64
-// offsets to (platform.File, uint64 File offset) pairs.
+// offsets to (File, uint64 File offset) pairs.
 //
 // See mm/mm.go for Mappable's place in the lock order.
 //
@@ -74,7 +74,7 @@ type Mappable interface {
 	// Translations are valid until invalidated by a callback to
 	// MappingSpace.Invalidate or until the caller removes its mapping of the
 	// translated range. Mappable implementations must ensure that at least one
-	// reference is held on all pages in a platform.File that may be the result
+	// reference is held on all pages in a File that may be the result
 	// of a valid Translation.
 	//
 	// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
@@ -100,7 +100,7 @@ type Translation struct {
 	Source MappableRange
 
 	// File is the mapped file.
-	File platform.File
+	File File
 
 	// Offset is the offset into File at which this Translation begins.
 	Offset uint64
@@ -110,9 +110,9 @@ type Translation struct {
 	Perms usermem.AccessType
 }
 
-// FileRange returns the platform.FileRange represented by t.
-func (t Translation) FileRange() platform.FileRange {
-	return platform.FileRange{t.Offset, t.Offset + t.Source.Length()}
+// FileRange returns the FileRange represented by t.
+func (t Translation) FileRange() FileRange {
+	return FileRange{t.Offset, t.Offset + t.Source.Length()}
 }
 
 // CheckTranslateResult returns an error if (ts, terr) does not satisfy all
@@ -361,3 +361,49 @@ type MMapOpts struct {
 	// TODO(jamieliu): Replace entirely with MappingIdentity?
 	Hint string
 }
+
+// File represents a host file that may be mapped into a platform.AddressSpace.
+type File interface {
+	// All pages in a File are reference-counted.
+
+	// IncRef increments the reference count on all pages in fr.
+	//
+	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
+	// 0. At least one reference must be held on all pages in fr. (The File
+	// interface does not provide a way to acquire an initial reference;
+	// implementors may define mechanisms for doing so.)
+	IncRef(fr FileRange)
+
+	// DecRef decrements the reference count on all pages in fr.
+	//
+	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
+	// 0. At least one reference must be held on all pages in fr.
+	DecRef(fr FileRange)
+
+	// MapInternal returns a mapping of the given file offsets in the invoking
+	// process' address space for reading and writing.
+	//
+	// Note that fr.Start and fr.End need not be page-aligned.
+	//
+	// Preconditions: fr.Length() > 0. At least one reference must be held on
+	// all pages in fr.
+	//
+	// Postconditions: The returned mapping is valid as long as at least one
+	// reference is held on the mapped pages.
+	MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
+
+	// FD returns the file descriptor represented by the File.
+	//
+	// The only permitted operation on the returned file descriptor is to map
+	// pages from it consistent with the requirements of AddressSpace.MapFile.
+	FD() int
+}
+
+// FileRange represents a range of uint64 offsets into a File.
+//
+// type FileRange <generated using go_generics>
+
+// String implements fmt.Stringer.String.
+func (fr FileRange) String() string {
+	return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
+}
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index a036ce53c..f9d0837a1 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -7,14 +7,14 @@ go_template_instance(
     name = "file_refcount_set",
     out = "file_refcount_set.go",
     imports = {
-        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+        "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
     },
     package = "mm",
     prefix = "fileRefcount",
     template = "//pkg/segment:generic_set",
     types = {
         "Key": "uint64",
-        "Range": "platform.FileRange",
+        "Range": "memmap.FileRange",
         "Value": "int32",
         "Functions": "fileRefcountSetFunctions",
     },
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 379148903..1999ec706 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -20,7 +20,6 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -243,7 +242,7 @@ type aioMappable struct {
 	refs.AtomicRefCount
 
 	mfp pgalloc.MemoryFileProvider
-	fr  platform.FileRange
+	fr  memmap.FileRange
 }
 
 var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp())
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 6db7c3d40..3e85964e4 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -25,7 +25,7 @@
 //           Locks taken by memmap.Mappable.Translate
 //             mm.privateRefs.mu
 //               platform.AddressSpace locks
-//                 platform.File locks
+//                 memmap.File locks
 //         mm.aioManager.mu
 //           mm.AIOContext.mu
 //
@@ -396,7 +396,7 @@ type pma struct {
 	// file is the file mapped by this pma. Only pmas for which file ==
 	// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
 	// the corresponding file range while they exist.
-	file platform.File `state:"nosave"`
+	file memmap.File `state:"nosave"`
 
 	// off is the offset into file at which this pma begins.
 	//
@@ -436,7 +436,7 @@ type pma struct {
 	private bool
 
 	// If internalMappings is not empty, it is the cached return value of
-	// file.MapInternal for the platform.FileRange mapped by this pma.
+	// file.MapInternal for the memmap.FileRange mapped by this pma.
 	internalMappings safemem.BlockSeq `state:"nosave"`
 }
 
@@ -469,10 +469,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 {
 func (fileRefcountSetFunctions) ClearValue(_ *int32) {
 }
 
-func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
+func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) {
 	return rc1, rc1 == rc2
 }
 
-func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
+func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) {
 	return rc, rc
 }
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
index 62e4c20af..930ec895f 100644
--- a/pkg/sentry/mm/pma.go
+++ b/pkg/sentry/mm/pma.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/safecopy"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -604,7 +603,7 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat
 	}
 }
 
-// Pin returns the platform.File ranges currently mapped by addresses in ar in
+// Pin returns the memmap.File ranges currently mapped by addresses in ar in
 // mm, acquiring a reference on the returned ranges which the caller must
 // release by calling Unpin. If not all addresses are mapped, Pin returns a
 // non-nil error. Note that Pin may return both a non-empty slice of
@@ -674,15 +673,15 @@ type PinnedRange struct {
 	Source usermem.AddrRange
 
 	// File is the mapped file.
-	File platform.File
+	File memmap.File
 
 	// Offset is the offset into File at which this PinnedRange begins.
 	Offset uint64
 }
 
-// FileRange returns the platform.File offsets mapped by pr.
-func (pr PinnedRange) FileRange() platform.FileRange {
-	return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
+// FileRange returns the memmap.File offsets mapped by pr.
+func (pr PinnedRange) FileRange() memmap.FileRange {
+	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
 }
 
 // Unpin releases the reference held by prs.
@@ -857,7 +856,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf
 }
 
 // incPrivateRef acquires a reference on private pages in fr.
-func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
+func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) {
 	mm.privateRefs.mu.Lock()
 	defer mm.privateRefs.mu.Unlock()
 	refSet := &mm.privateRefs.refs
@@ -878,8 +877,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
 }
 
 // decPrivateRef releases a reference on private pages in fr.
-func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) {
-	var freed []platform.FileRange
+func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) {
+	var freed []memmap.FileRange
 
 	mm.privateRefs.mu.Lock()
 	refSet := &mm.privateRefs.refs
@@ -951,7 +950,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa
 
 	// Discard internal mappings instead of trying to merge them, since merging
 	// them requires an allocation and getting them again from the
-	// platform.File might not.
+	// memmap.File might not.
 	pma1.internalMappings = safemem.BlockSeq{}
 	return pma1, true
 }
@@ -1012,12 +1011,12 @@ func (pseg pmaIterator) getInternalMappingsLocked() error {
 	return nil
 }
 
-func (pseg pmaIterator) fileRange() platform.FileRange {
+func (pseg pmaIterator) fileRange() memmap.FileRange {
 	return pseg.fileRangeOf(pseg.Range())
 }
 
 // Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0.
-func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange {
 	if checkInvariants {
 		if !pseg.Ok() {
 			panic("terminal pma iterator")
@@ -1032,5 +1031,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
 
 	pma := pseg.ValuePtr()
 	pstart := pseg.Start()
-	return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
+	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
 }
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 9ad52082d..0e142fb11 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -35,7 +34,7 @@ type SpecialMappable struct {
 	refs.AtomicRefCount
 
 	mfp  pgalloc.MemoryFileProvider
-	fr   platform.FileRange
+	fr   memmap.FileRange
 	name string
 }
 
@@ -44,7 +43,7 @@ type SpecialMappable struct {
 // SpecialMappable will use the given name in /proc/[pid]/maps.
 //
 // Preconditions: fr.Length() != 0.
-func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable {
+func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable {
 	m := SpecialMappable{mfp: mfp, fr: fr, name: name}
 	m.EnableLeakCheck("mm.SpecialMappable")
 	return &m
@@ -126,7 +125,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider {
 
 // FileRange returns the offsets into MemoryFileProvider().MemoryFile() that
 // store the SpecialMappable's contents.
-func (m *SpecialMappable) FileRange() platform.FileRange {
+func (m *SpecialMappable) FileRange() memmap.FileRange {
 	return m.fr
 }
 
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 1eeb9f317..7a3311a70 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -33,21 +33,42 @@ go_template_instance(
     out = "usage_set.go",
     consts = {
         "minDegree": "10",
+        "trackGaps": "1",
     },
     imports = {
-        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+        "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
     },
     package = "pgalloc",
     prefix = "usage",
     template = "//pkg/segment:generic_set",
     types = {
         "Key": "uint64",
-        "Range": "platform.FileRange",
+        "Range": "memmap.FileRange",
         "Value": "usageInfo",
         "Functions": "usageSetFunctions",
     },
 )
 
+go_template_instance(
+    name = "reclaim_set",
+    out = "reclaim_set.go",
+    consts = {
+        "minDegree": "10",
+    },
+    imports = {
+        "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
+    },
+    package = "pgalloc",
+    prefix = "reclaim",
+    template = "//pkg/segment:generic_set",
+    types = {
+        "Key": "uint64",
+        "Range": "memmap.FileRange",
+        "Value": "reclaimSetValue",
+        "Functions": "reclaimSetFunctions",
+    },
+)
+
 go_library(
     name = "pgalloc",
     srcs = [
@@ -56,6 +77,7 @@ go_library(
         "evictable_range_set.go",
         "pgalloc.go",
         "pgalloc_unsafe.go",
+        "reclaim_set.go",
         "save_restore.go",
         "usage_set.go",
     ],
@@ -67,9 +89,10 @@ go_library(
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/hostmm",
-        "//pkg/sentry/platform",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/usage",
         "//pkg/state",
+        "//pkg/state/wire",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b11ea4ae..3243d7214 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -33,14 +33,14 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/hostmm"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// MemoryFile is a platform.File whose pages may be allocated to arbitrary
+// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
 // users.
 type MemoryFile struct {
 	// opts holds options passed to NewMemoryFile. opts is immutable.
@@ -108,12 +108,6 @@ type MemoryFile struct {
 	usageSwapped  uint64
 	usageLast     time.Time
 
-	// minUnallocatedPage is the minimum page that may be unallocated.
-	// i.e., there are no unallocated pages below minUnallocatedPage.
-	//
-	// minUnallocatedPage is protected by mu.
-	minUnallocatedPage uint64
-
 	// fileSize is the size of the backing memory file in bytes. fileSize is
 	// always a power-of-two multiple of chunkSize.
 	//
@@ -146,11 +140,9 @@ type MemoryFile struct {
 	// is protected by mu.
 	reclaimable bool
 
-	// minReclaimablePage is the minimum page that may be reclaimable.
-	// i.e., all reclaimable pages are >= minReclaimablePage.
-	//
-	// minReclaimablePage is protected by mu.
-	minReclaimablePage uint64
+	// reclaim is the collection of regions for reclaim. reclaim is protected
+	// by mu.
+	reclaim reclaimSet
 
 	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
 	// transitions from false to true.
@@ -273,12 +265,10 @@ type evictableMemoryUserInfo struct {
 }
 
 const (
-	chunkShift = 24
-	chunkSize  = 1 << chunkShift // 16 MB
+	chunkShift = 30
+	chunkSize  = 1 << chunkShift // 1 GB
 	chunkMask  = chunkSize - 1
 
-	initialSize = chunkSize
-
 	// maxPage is the highest 64-bit page.
 	maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
 )
@@ -302,19 +292,12 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
 	if err := file.Truncate(0); err != nil {
 		return nil, err
 	}
-	if err := file.Truncate(initialSize); err != nil {
-		return nil, err
-	}
 	f := &MemoryFile{
-		opts:     opts,
-		fileSize: initialSize,
-		file:     file,
-		// No pages are reclaimable. DecRef will always be able to
-		// decrease minReclaimablePage from this point.
-		minReclaimablePage: maxPage,
-		evictable:          make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
+		opts:      opts,
+		file:      file,
+		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
 	}
-	f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+	f.mappings.Store(make([]uintptr, 0))
 	f.reclaimCond.L = &f.mu
 
 	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
@@ -389,7 +372,7 @@ func (f *MemoryFile) Destroy() {
 // to Allocate.
 //
 // Preconditions: length must be page-aligned and non-zero.
-func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) {
+func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) {
 	if length == 0 || length%usermem.PageSize != 0 {
 		panic(fmt.Sprintf("invalid allocation length: %#x", length))
 	}
@@ -404,46 +387,36 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
 		alignment = usermem.HugePageSize
 	}
 
-	start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
-	end := start + length
-	// File offsets are int64s. Since length must be strictly positive, end
-	// cannot legitimately be 0.
-	if end < start || int64(end) <= 0 {
-		return platform.FileRange{}, syserror.ENOMEM
+	// Find a range in the underlying file.
+	fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
+	if !ok {
+		return memmap.FileRange{}, syserror.ENOMEM
 	}
 
-	// Expand the file if needed. Double the file size on each expansion;
-	// uncommitted pages have effectively no cost.
-	fileSize := f.fileSize
-	for int64(end) > fileSize {
-		if fileSize >= 2*fileSize {
-			// fileSize overflow.
-			return platform.FileRange{}, syserror.ENOMEM
+	// Expand the file if needed.
+	if int64(fr.End) > f.fileSize {
+		// Round the new file size up to be chunk-aligned.
+		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
+		if err := f.file.Truncate(newFileSize); err != nil {
+			return memmap.FileRange{}, err
 		}
-		fileSize *= 2
-	}
-	if fileSize > f.fileSize {
-		if err := f.file.Truncate(fileSize); err != nil {
-			return platform.FileRange{}, err
-		}
-		f.fileSize = fileSize
+		f.fileSize = newFileSize
 		f.mappingsMu.Lock()
 		oldMappings := f.mappings.Load().([]uintptr)
-		newMappings := make([]uintptr, fileSize>>chunkShift)
+		newMappings := make([]uintptr, newFileSize>>chunkShift)
 		copy(newMappings, oldMappings)
 		f.mappings.Store(newMappings)
 		f.mappingsMu.Unlock()
 	}
 
 	// Mark selected pages as in use.
-	fr := platform.FileRange{start, end}
 	if f.opts.ManualZeroing {
 		if err := f.forEachMappingSlice(fr, func(bs []byte) {
 			for i := range bs {
 				bs[i] = 0
 			}
 		}); err != nil {
-			return platform.FileRange{}, err
+			return memmap.FileRange{}, err
 		}
 	}
 	if !f.usage.Add(fr, usageInfo{
@@ -453,49 +426,79 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
 		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
 	}
 
-	if minUnallocatedPage < start {
-		f.minUnallocatedPage = minUnallocatedPage
-	} else {
-		// start was the first unallocated page. The next must be
-		// somewhere beyond end.
-		f.minUnallocatedPage = end
-	}
-
 	return fr, nil
 }
 
-// findUnallocatedRange returns the first unallocated page in usage of the
-// specified length and alignment beginning at page start and the first single
-// unallocated page.
-func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
-	// Only searched until the first page is found.
-	firstPage := start
-	foundFirstPage := false
-	alignMask := alignment - 1
-	for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
-		r := seg.Range()
-
-		if !foundFirstPage && r.Start > firstPage {
-			foundFirstPage = true
+// findAvailableRange returns an available range in the usageSet.
+//
+// Note that scanning for available slots takes place from end first backwards,
+// then forwards. This heuristic has important consequences for how sequential
+// mappings can be merged in the host VMAs, given that addresses for both
+// application and sentry mappings are allocated top-down (from higher to
+// lower addresses). The file is also grown exponentially in order to create
+// space for mappings to be allocated downwards.
+//
+// Precondition: alignment must be a power of 2.
+func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
+	alignmentMask := alignment - 1
+
+	// Search for space in existing gaps, starting at the current end of the
+	// file and working backward.
+	lastGap := usage.LastGap()
+	gap := lastGap
+	for {
+		end := gap.End()
+		if end > uint64(fileSize) {
+			end = uint64(fileSize)
 		}
 
-		if start >= r.End {
-			// start was rounded up to an alignment boundary from the end
-			// of a previous segment and is now beyond r.End.
-			continue
+		// Try to allocate from the end of this gap, with the start of the
+		// allocated range aligned down to alignment.
+		unalignedStart := end - length
+		if unalignedStart > end {
+			// Negative overflow: this and all preceding gaps are too small to
+			// accommodate length.
+			break
 		}
-		// This segment represents allocated or reclaimable pages; only the
-		// range from start to the segment's beginning is allocatable, and the
-		// next allocatable range begins after the segment.
-		if r.Start > start && r.Start-start >= length {
+		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
+			return memmap.FileRange{start, start + length}, true
+		}
+
+		gap = gap.PrevLargeEnoughGap(length)
+		if !gap.Ok() {
 			break
 		}
-		start = (r.End + alignMask) &^ alignMask
-		if !foundFirstPage {
-			firstPage = r.End
+	}
+
+	// Check that it's possible to fit this allocation at the end of a file of any size.
+	min := lastGap.Start()
+	min = (min + alignmentMask) &^ alignmentMask
+	if min+length < min {
+		// Overflow: allocation would exceed the range of uint64.
+		return memmap.FileRange{}, false
+	}
+
+	// Determine the minimum file size required to fit this allocation at its end.
+	for {
+		newFileSize := 2 * fileSize
+		if newFileSize <= fileSize {
+			if fileSize != 0 {
+				// Overflow: allocation would exceed the range of int64.
+				return memmap.FileRange{}, false
+			}
+			newFileSize = chunkSize
+		}
+		fileSize = newFileSize
+
+		unalignedStart := uint64(fileSize) - length
+		if unalignedStart > uint64(fileSize) {
+			// Negative overflow: fileSize is still inadequate.
+			continue
+		}
+		if start := unalignedStart &^ alignmentMask; start >= min {
+			return memmap.FileRange{start, start + length}, true
 		}
 	}
-	return start, firstPage
 }
 
 // AllocateAndFill allocates memory of the given kind and fills it by calling
@@ -505,22 +508,22 @@ func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uin
 // by r.ReadToBlocks(), it returns that error.
 //
 // Preconditions: length > 0. length must be page-aligned.
-func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) {
+func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
 	fr, err := f.Allocate(length, kind)
 	if err != nil {
-		return platform.FileRange{}, err
+		return memmap.FileRange{}, err
 	}
 	dsts, err := f.MapInternal(fr, usermem.Write)
 	if err != nil {
 		f.DecRef(fr)
-		return platform.FileRange{}, err
+		return memmap.FileRange{}, err
 	}
 	n, err := safemem.ReadFullToBlocks(r, dsts)
 	un := uint64(usermem.Addr(n).RoundDown())
 	if un < length {
 		// Free unused memory and update fr to contain only the memory that is
 		// still allocated.
-		f.DecRef(platform.FileRange{fr.Start + un, fr.End})
+		f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
 		fr.End = fr.Start + un
 	}
 	return fr, err
@@ -537,7 +540,7 @@ const (
 // will read zeroes.
 //
 // Preconditions: fr.Length() > 0.
-func (f *MemoryFile) Decommit(fr platform.FileRange) error {
+func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
 	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
 		panic(fmt.Sprintf("invalid range: %v", fr))
 	}
@@ -557,7 +560,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error {
 	return nil
 }
 
-func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
+func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 	// Since we're changing the knownCommitted attribute, we need to merge
@@ -578,8 +581,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
 	f.usage.MergeRange(fr)
 }
 
-// IncRef implements platform.File.IncRef.
-func (f *MemoryFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (f *MemoryFile) IncRef(fr memmap.FileRange) {
 	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
 		panic(fmt.Sprintf("invalid range: %v", fr))
 	}
@@ -597,8 +600,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) {
 	f.usage.MergeAdjacent(fr)
 }
 
-// DecRef implements platform.File.DecRef.
-func (f *MemoryFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (f *MemoryFile) DecRef(fr memmap.FileRange) {
 	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
 		panic(fmt.Sprintf("invalid range: %v", fr))
 	}
@@ -616,6 +619,7 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
 		}
 		val.refs--
 		if val.refs == 0 {
+			f.reclaim.Add(seg.Range(), reclaimSetValue{})
 			freed = true
 			// Reclassify memory as System, until it's freed by the reclaim
 			// goroutine.
@@ -628,17 +632,13 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
 	f.usage.MergeAdjacent(fr)
 
 	if freed {
-		if fr.Start < f.minReclaimablePage {
-			// We've freed at least one lower page.
-			f.minReclaimablePage = fr.Start
-		}
 		f.reclaimable = true
 		f.reclaimCond.Signal()
 	}
 }
 
-// MapInternal implements platform.File.MapInternal.
-func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	if !fr.WellFormed() || fr.Length() == 0 {
 		panic(fmt.Sprintf("invalid range: %v", fr))
 	}
@@ -664,7 +664,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (
 
 // forEachMappingSlice invokes fn on a sequence of byte slices that
 // collectively map all bytes in fr.
-func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error {
+func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
 	mappings := f.mappings.Load().([]uintptr)
 	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
 		chunk := int(chunkStart >> chunkShift)
@@ -944,7 +944,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
 					continue
 				case !populated && populatedRun:
 					// Finish the run by changing this segment.
-					runRange := platform.FileRange{
+					runRange := memmap.FileRange{
 						Start: r.Start + uint64(populatedRunStart*usermem.PageSize),
 						End:   r.Start + uint64(i*usermem.PageSize),
 					}
@@ -1009,7 +1009,7 @@ func (f *MemoryFile) File() *os.File {
 	return f.file
 }
 
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
 func (f *MemoryFile) FD() int {
 	return int(f.file.Fd())
 }
@@ -1030,6 +1030,7 @@ func (f *MemoryFile) String() string {
 // for allocation.
 func (f *MemoryFile) runReclaim() {
 	for {
+		// N.B. We must call f.markReclaimed on the returned FileRange.
 		fr, ok := f.findReclaimable()
 		if !ok {
 			break
@@ -1085,13 +1086,17 @@ func (f *MemoryFile) runReclaim() {
 	}
 }
 
-func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
+// findReclaimable finds memory that has been marked for reclaim.
+//
+// Note that the returned range will be removed from tracking. It
+// must be reclaimed (removed from f.usage) at this point.
+func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 	for {
 		for {
 			if f.destroyed {
-				return platform.FileRange{}, false
+				return memmap.FileRange{}, false
 			}
 			if f.reclaimable {
 				break
@@ -1103,27 +1108,24 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
 			}
 			f.reclaimCond.Wait()
 		}
-		// Allocate returns the first usable range in offset order and is
-		// currently a linear scan, so reclaiming from the beginning of the
-		// file minimizes the expected latency of Allocate.
-		for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
-			if seg.ValuePtr().refs == 0 {
-				f.minReclaimablePage = seg.End()
-				return seg.Range(), true
-			}
+		// Allocate works from the back of the file inwards, so reclaim
+		// preserves this order to minimize the cost of the search.
+		if seg := f.reclaim.LastSegment(); seg.Ok() {
+			fr := seg.Range()
+			f.reclaim.Remove(seg)
+			return fr, true
 		}
-		// No pages are reclaimable.
+		// Nothing is reclaimable.
 		f.reclaimable = false
-		f.minReclaimablePage = maxPage
 	}
 }
 
-func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
+func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 	seg := f.usage.FindSegment(fr.Start)
-	// All of fr should be mapped to a single uncommitted reclaimable segment
-	// accounted to System.
+	// All of fr should be mapped to a single uncommitted reclaimable
+	// segment accounted to System.
 	if !seg.Ok() {
 		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
 	}
@@ -1137,14 +1139,10 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
 	}); got != want {
 		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
 	}
-	// Deallocate reclaimed pages. Even though all of seg is reclaimable, the
-	// caller of markReclaimed may not have decommitted it, so we can only mark
-	// fr as reclaimed.
+	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
+	// the caller of markReclaimed may not have decommitted it, so we can
+	// only mark fr as reclaimed.
 	f.usage.Remove(f.usage.Isolate(seg, fr))
-	if fr.Start < f.minUnallocatedPage {
-		// We've deallocated at least one lower page.
-		f.minUnallocatedPage = fr.Start
-	}
 }
 
 // StartEvictions requests that f evict all evictable allocations. It does not
@@ -1224,11 +1222,11 @@ func (usageSetFunctions) MaxKey() uint64 {
 func (usageSetFunctions) ClearValue(val *usageInfo) {
 }
 
-func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) {
+func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
 	return val1, val1 == val2
 }
 
-func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
+func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
 	return val, val
 }
 
@@ -1255,3 +1253,27 @@ func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetVal
 func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
 	return evictableRangeSetValue{}, evictableRangeSetValue{}
 }
+
+// reclaimSetValue is the value type of reclaimSet.
+type reclaimSetValue struct{}
+
+type reclaimSetFunctions struct{}
+
+func (reclaimSetFunctions) MinKey() uint64 {
+	return 0
+}
+
+func (reclaimSetFunctions) MaxKey() uint64 {
+	return math.MaxUint64
+}
+
+func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
+}
+
+func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
+	return reclaimSetValue{}, true
+}
+
+func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
+	return reclaimSetValue{}, reclaimSetValue{}
+}
diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go
index 293f22c6b..405db141f 100644
--- a/pkg/sentry/pgalloc/pgalloc_test.go
+++ b/pkg/sentry/pgalloc/pgalloc_test.go
@@ -23,39 +23,49 @@ import (
 const (
 	page     = usermem.PageSize
 	hugepage = usermem.HugePageSize
+	topPage  = (1 << 63) - page
 )
 
 func TestFindUnallocatedRange(t *testing.T) {
 	for _, test := range []struct {
-		desc           string
-		usage          *usageSegmentDataSlices
-		start          uint64
-		length         uint64
-		alignment      uint64
-		unallocated    uint64
-		minUnallocated uint64
+		desc       string
+		usage      *usageSegmentDataSlices
+		fileSize   int64
+		length     uint64
+		alignment  uint64
+		start      uint64
+		expectFail bool
 	}{
 		{
-			desc:           "Initial allocation succeeds",
-			usage:          &usageSegmentDataSlices{},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    0,
-			minUnallocated: 0,
+			desc:      "Initial allocation succeeds",
+			usage:     &usageSegmentDataSlices{},
+			length:    page,
+			alignment: page,
+			start:     chunkSize - page, // Grows by chunkSize, allocate down.
 		},
 		{
-			desc: "Allocation begins at start of file",
+			desc: "Allocation finds empty space at start of file",
 			usage: &usageSegmentDataSlices{
 				Start:  []uint64{page},
 				End:    []uint64{2 * page},
 				Values: []usageInfo{{refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    0,
-			minUnallocated: 0,
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     0,
+		},
+		{
+			desc: "Allocation finds empty space at end of file",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{0},
+				End:    []uint64{page},
+				Values: []usageInfo{{refs: 1}},
+			},
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     page,
 		},
 		{
 			desc: "In-use frames are not allocatable",
@@ -64,11 +74,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 2 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     3 * page, // Double fileSize, allocate top-down.
 		},
 		{
 			desc: "Reclaimable frames are not allocatable",
@@ -77,11 +86,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 2 * page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    3 * page,
-			minUnallocated: 3 * page,
+			fileSize:  3 * page,
+			length:    page,
+			alignment: page,
+			start:     5 * page, // Double fileSize, grow down.
 		},
 		{
 			desc: "Gaps between in-use frames are allocatable",
@@ -90,11 +98,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    page,
-			minUnallocated: page,
+			fileSize:  3 * page,
+			length:    page,
+			alignment: page,
+			start:     page,
 		},
 		{
 			desc: "Inadequately-sized gaps are rejected",
@@ -103,14 +110,13 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         2 * page,
-			alignment:      page,
-			unallocated:    3 * page,
-			minUnallocated: page,
+			fileSize:  3 * page,
+			length:    2 * page,
+			alignment: page,
+			start:     4 * page, // Double fileSize, grow down.
 		},
 		{
-			desc: "Hugepage alignment is honored",
+			desc: "Alignment is honored at end of file",
 			usage: &usageSegmentDataSlices{
 				Start: []uint64{0, hugepage + page},
 				// Hugepage-sized gap here that shouldn't be allocated from
@@ -118,37 +124,103 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, hugepage + 2*page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         hugepage,
-			alignment:      hugepage,
-			unallocated:    2 * hugepage,
-			minUnallocated: page,
+			fileSize:  hugepage + 2*page,
+			length:    hugepage,
+			alignment: hugepage,
+			start:     3 * hugepage, // Double fileSize until alignment is satisfied, grow down.
 		},
 		{
-			desc: "Pages before start ignored",
+			desc: "Alignment is honored before end of file",
+			usage: &usageSegmentDataSlices{
+				Start: []uint64{0, 2*hugepage + page},
+				// Page will need to be shifted down from top.
+				End:    []uint64{page, 2*hugepage + 2*page},
+				Values: []usageInfo{{refs: 1}, {refs: 1}},
+			},
+			fileSize:  2*hugepage + 2*page,
+			length:    hugepage,
+			alignment: hugepage,
+			start:     hugepage,
+		},
+		{
+			desc:      "Allocation doubles file size more than once if necessary",
+			usage:     &usageSegmentDataSlices{},
+			fileSize:  page,
+			length:    4 * page,
+			alignment: page,
+			start:     0,
+		},
+		{
+			desc: "Allocations are compact if possible",
 			usage: &usageSegmentDataSlices{
 				Start:  []uint64{page, 3 * page},
 				End:    []uint64{2 * page, 4 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          page,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  4 * page,
+			length:    page,
+			alignment: page,
+			start:     2 * page,
+		},
+		{
+			desc: "Top-down allocation within one gap",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, 4 * page, 7 * page},
+				End:    []uint64{2 * page, 5 * page, 8 * page},
+				Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+			},
+			fileSize:  8 * page,
+			length:    page,
+			alignment: page,
+			start:     6 * page,
+		},
+		{
+			desc: "Top-down allocation between multiple gaps",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, 3 * page, 5 * page},
+				End:    []uint64{2 * page, 4 * page, 6 * page},
+				Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+			},
+			fileSize:  6 * page,
+			length:    page,
+			alignment: page,
+			start:     4 * page,
 		},
 		{
-			desc: "start may be in the middle of segment",
+			desc: "Top-down allocation with large top gap",
 			usage: &usageSegmentDataSlices{
-				Start:  []uint64{0, 3 * page},
+				Start:  []uint64{page, 3 * page},
 				End:    []uint64{2 * page, 4 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          page,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  8 * page,
+			length:    page,
+			alignment: page,
+			start:     7 * page,
+		},
+		{
+			desc: "Gaps found with possible overflow",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, topPage - page},
+				End:    []uint64{2 * page, topPage},
+				Values: []usageInfo{{refs: 1}, {refs: 1}},
+			},
+			fileSize:  topPage,
+			length:    page,
+			alignment: page,
+			start:     topPage - 2*page,
+		},
+		{
+			desc: "Overflow detected",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page},
+				End:    []uint64{topPage},
+				Values: []usageInfo{{refs: 1}},
+			},
+			fileSize:   topPage,
+			length:     2 * page,
+			alignment:  page,
+			expectFail: true,
 		},
 	} {
 		t.Run(test.desc, func(t *testing.T) {
@@ -156,12 +228,18 @@ func TestFindUnallocatedRange(t *testing.T) {
 			if err := usage.ImportSortedSlices(test.usage); err != nil {
 				t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err)
 			}
-			unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment)
-			if unallocated != test.unallocated {
-				t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated)
+			fr, ok := findAvailableRange(&usage, test.fileSize, test.length, test.alignment)
+			if !test.expectFail && !ok {
+				t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, false wanted %x, true", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+			}
+			if test.expectFail && ok {
+				t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, true wanted %x, false", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+			}
+			if ok && fr.Start != test.start {
+				t.Errorf("findAvailableRange(%v, %x, %x, %x): got start=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
 			}
-			if minUnallocated != test.minUnallocated {
-				t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated)
+			if ok && fr.End != test.start+test.length {
+				t.Errorf("findAvailableRange(%v, %x, %x, %x): got end=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.End, test.start+test.length)
 			}
 		})
 	}
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
index f8385c146..78317fa35 100644
--- a/pkg/sentry/pgalloc/save_restore.go
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -26,11 +26,12 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/wire"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SaveTo writes f's state to the given stream.
-func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
+func (f *MemoryFile) SaveTo(ctx context.Context, w wire.Writer) error {
 	// Wait for reclaim.
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -79,10 +80,10 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
 	}
 
 	// Save metadata.
-	if err := state.Save(ctx, w, &f.fileSize, nil); err != nil {
+	if _, err := state.Save(ctx, w, &f.fileSize); err != nil {
 		return err
 	}
-	if err := state.Save(ctx, w, &f.usage, nil); err != nil {
+	if _, err := state.Save(ctx, w, &f.usage); err != nil {
 		return err
 	}
 
@@ -115,9 +116,9 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
 }
 
 // LoadFrom loads MemoryFile state from the given stream.
-func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
+func (f *MemoryFile) LoadFrom(ctx context.Context, r wire.Reader) error {
 	// Load metadata.
-	if err := state.Load(ctx, r, &f.fileSize, nil); err != nil {
+	if _, err := state.Load(ctx, r, &f.fileSize); err != nil {
 		return err
 	}
 	if err := f.file.Truncate(f.fileSize); err != nil {
@@ -125,7 +126,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
 	}
 	newMappings := make([]uintptr, f.fileSize>>chunkShift)
 	f.mappings.Store(newMappings)
-	if err := state.Load(ctx, r, &f.usage, nil); err != nil {
+	if _, err := state.Load(ctx, r, &f.usage); err != nil {
 		return err
 	}
 
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 453241eca..209b28053 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -1,39 +1,21 @@
 load("//tools:defs.bzl", "go_library")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-go_template_instance(
-    name = "file_range",
-    out = "file_range.go",
-    package = "platform",
-    prefix = "File",
-    template = "//pkg/segment:generic_range",
-    types = {
-        "T": "uint64",
-    },
-)
-
 go_library(
     name = "platform",
     srcs = [
         "context.go",
-        "file_range.go",
         "mmap_min_addr.go",
         "platform.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/atomicbitops",
         "//pkg/context",
-        "//pkg/log",
-        "//pkg/safecopy",
-        "//pkg/safemem",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
-        "//pkg/sentry/usage",
-        "//pkg/syserror",
+        "//pkg/sentry/memmap",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..b5d27a72a 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -6,8 +6,8 @@ go_library(
     name = "kvm",
     srcs = [
         "address_space.go",
-        "allocator.go",
         "bluepill.go",
+        "bluepill_allocator.go",
         "bluepill_amd64.go",
         "bluepill_amd64.s",
         "bluepill_amd64_unsafe.go",
@@ -47,6 +47,7 @@ go_library(
         "//pkg/safecopy",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
         "//pkg/sentry/platform/ring0",
@@ -60,6 +61,7 @@ go_library(
 go_test(
     name = "kvm_test",
     srcs = [
+        "kvm_amd64_test.go",
         "kvm_test.go",
         "virtual_map_test.go",
     ],
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index be213bfe8..98a3e539d 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/atomicbitops"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -26,16 +27,15 @@ import (
 
 // dirtySet tracks vCPUs for invalidation.
 type dirtySet struct {
-	vCPUs []uint64
+	vCPUMasks []uint64
 }
 
 // forEach iterates over all CPUs in the dirty set.
+//
+//go:nosplit
 func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
-	m.mu.RLock()
-	defer m.mu.RUnlock()
-
-	for index := range ds.vCPUs {
-		mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+	for index := range ds.vCPUMasks {
+		mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
 		if mask != 0 {
 			for bit := 0; bit < 64; bit++ {
 				if mask&(1<<uint64(bit)) == 0 {
@@ -54,7 +54,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
 	index := uint64(c.id) / 64
 	bit := uint64(1) << uint(c.id%64)
 
-	oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+	oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
 	if oldValue&bit != 0 {
 		return false // Not clean.
 	}
@@ -62,7 +62,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
 	// Set the bit unilaterally, and ensure that a flush takes place. Note
 	// that it's possible for races to occur here, but since the flush is
 	// taking place long after these lines there's no race in practice.
-	atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
 	return true // Previously clean.
 }
 
@@ -113,7 +113,12 @@ type hostMapEntry struct {
 	length uintptr
 }
 
-func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+// mapLocked maps the given host entry.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
 	for m.length > 0 {
 		physical, length, ok := translateToPhysical(m.addr)
 		if !ok {
@@ -133,18 +138,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
 		// important; if the pagetable mappings were installed before
 		// ensuring the physical pages were available, then some other
 		// thread could theoretically access them.
-		//
-		// Due to the way KVM's shadow paging implementation works,
-		// modifications to the page tables while in host mode may not
-		// be trapped, leading to the shadow pages being out of sync.
-		// Therefore, we need to ensure that we are in guest mode for
-		// page table modifications. See the call to bluepill, below.
-		as.machine.retryInGuest(func() {
-			inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
-				AccessType: at,
-				User:       true,
-			}, physical) || inv
-		})
+		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+			AccessType: at,
+			User:       true,
+		}, physical) || inv
 		m.addr += length
 		m.length -= length
 		addr += usermem.Addr(length)
@@ -154,7 +151,7 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
 }
 
 // MapFile implements platform.AddressSpace.MapFile.
-func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error {
 	as.mu.Lock()
 	defer as.mu.Unlock()
 
@@ -176,6 +173,10 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 		return err
 	}
 
+	// See block in mapLocked.
+	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+
 	// Map the mappings in the sentry's address space (guest physical memory)
 	// into the application's address space (guest virtual memory).
 	inv := false
@@ -190,7 +191,12 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 				_ = s[i] // Touch to commit.
 			}
 		}
-		prev := as.mapHost(addr, hostMapEntry{
+
+		// See bluepill_allocator.go.
+		bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+		// Perform the mapping.
+		prev := as.mapLocked(addr, hostMapEntry{
 			addr:   b.Addr(),
 			length: uintptr(b.Len()),
 		}, at)
@@ -204,17 +210,27 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 	return nil
 }
 
+// unmapLocked is an escape-checked wrapper around Unmap.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) unmapLocked(addr usermem.Addr, length uint64) bool {
+	return as.pageTables.Unmap(addr, uintptr(length))
+}
+
 // Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
 func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
 	as.mu.Lock()
 	defer as.mu.Unlock()
 
-	// See above re: retryInGuest.
-	var prev bool
-	as.machine.retryInGuest(func() {
-		prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
-	})
-	if prev {
+	// See above & bluepill_allocator.go.
+	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+	bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+	if prev := as.unmapLocked(addr, length); prev {
+		// Invalidate all active vCPUs.
 		as.invalidate()
 
 		// Recycle any freed intermediate pages.
@@ -227,7 +243,7 @@ func (as *addressSpace) Release() {
 	as.Unmap(0, ^uint64(0))
 
 	// Free all pages from the allocator.
-	as.pageTables.Allocator.(allocator).base.Drain()
+	as.pageTables.Allocator.(*allocator).base.Drain()
 
 	// Drop all cached machine references.
 	as.machine.dropPageTables(as.pageTables)
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go
index 3f35414bb..9485e1301 100644
--- a/pkg/sentry/platform/kvm/allocator.go
+++ b/pkg/sentry/platform/kvm/bluepill_allocator.go
@@ -21,56 +21,80 @@ import (
 )
 
 type allocator struct {
-	base *pagetables.RuntimeAllocator
+	base pagetables.RuntimeAllocator
+
+	// cpu must be set prior to any pagetable operation.
+	//
+	// Due to the way KVM's shadow paging implementation works,
+	// modifications to the page tables while in host mode may not be
+	// trapped, leading to the shadow pages being out of sync.  Therefore,
+	// we need to ensure that we are in guest mode for page table
+	// modifications. See the call to bluepill, below.
+	cpu *vCPU
 }
 
 // newAllocator is used to define the allocator.
-func newAllocator() allocator {
-	return allocator{
-		base: pagetables.NewRuntimeAllocator(),
-	}
+func newAllocator() *allocator {
+	a := new(allocator)
+	a.base.Init()
+	return a
 }
 
 // NewPTEs implements pagetables.Allocator.NewPTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) NewPTEs() *pagetables.PTEs {
-	return a.base.NewPTEs()
+func (a *allocator) NewPTEs() *pagetables.PTEs {
+	ptes := a.base.NewPTEs() // escapes: bluepill below.
+	if a.cpu != nil {
+		bluepill(a.cpu)
+	}
+	return ptes
 }
 
 // PhysicalFor returns the physical address for a set of PTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
 	virtual := a.base.PhysicalFor(ptes)
 	physical, _, ok := translateToPhysical(virtual)
 	if !ok {
-		panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+		panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic.
 	}
 	return physical
 }
 
 // LookupPTEs implements pagetables.Allocator.LookupPTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
 	virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions)
 	if !ok {
-		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic.
 	}
 	return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
 }
 
 // FreePTEs implements pagetables.Allocator.FreePTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
-	a.base.FreePTEs(ptes)
+func (a *allocator) FreePTEs(ptes *pagetables.PTEs) {
+	a.base.FreePTEs(ptes) // escapes: bluepill below.
+	if a.cpu != nil {
+		bluepill(a.cpu)
+	}
 }
 
 // Recycle implements pagetables.Allocator.Recycle.
 //
 //go:nosplit
-func (a allocator) Recycle() {
+func (a *allocator) Recycle() {
 	a.base.Recycle()
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 133c2203d..ddc1554d5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -63,6 +63,8 @@ func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
 
 // KernelSyscall handles kernel syscalls.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelSyscall() {
 	regs := c.Registers()
@@ -72,13 +74,15 @@ func (c *vCPU) KernelSyscall() {
 	// We only trigger a bluepill entry in the bluepill function, and can
 	// therefore be guaranteed that there is no floating point state to be
 	// loaded on resuming from halt. We only worry about saving on exit.
-	ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+	ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
 	ring0.Halt()
-	ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+	ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
 }
 
 // KernelException handles kernel exceptions.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelException(vector ring0.Vector) {
 	regs := c.Registers()
@@ -89,9 +93,9 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
 		regs.Rip = 0
 	}
 	// See above.
-	ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+	ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
 	ring0.Halt()
-	ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+	ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
 }
 
 // bluepillArchExit is called during bluepillEnter.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 99cac665d..03a98512e 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -17,6 +17,7 @@
 package kvm
 
 import (
+	"syscall"
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -53,3 +54,34 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
 	context.Rbx = uint64(uintptr(unsafe.Pointer(c)))
 	context.Rip = uint64(dieTrampolineAddr)
 }
+
+// getHypercallID returns hypercall ID; on amd64 it is always _KVM_HYPERCALL_MAX.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+	return _KVM_HYPERCALL_MAX
+}
+
+// bluepillStopGuest is responsible for injecting the interrupt.
+//
+//go:nosplit
+func bluepillStopGuest(c *vCPU) {
+	// Interrupt: we must have requested an interrupt
+	// window; set the interrupt line.
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_INTERRUPT,
+		uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+		throw("interrupt injection failed")
+	}
+	// Clear previous injection request.
+	c.runData.requestInterruptWindow = 0
+}
+
+// bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection.
+//
+//go:nosplit
+func bluepillReadyStopGuest(c *vCPU) bool {
+	return c.runData.readyForInterruptInjection != 0
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index c215d443c..dba563160 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -26,6 +26,17 @@ import (
 var (
 	// The action for bluepillSignal is changed by sigaction().
 	bluepillSignal = syscall.SIGILL
+
+	// vcpuSErr is the event of system error.
+	vcpuSErr = kvmVcpuEvents{
+		exception: exception{
+			sErrPending: 1,
+			sErrHasEsr:  0,
+			pad:         [6]uint8{0, 0, 0, 0, 0, 0},
+			sErrEsr:     1,
+		},
+		rsvd: [12]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+	}
 )
 
 // bluepillArchEnter is called during bluepillEnter.
@@ -66,6 +77,8 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
 
 // KernelSyscall handles kernel syscalls.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelSyscall() {
 	regs := c.Registers()
@@ -88,6 +101,8 @@ func (c *vCPU) KernelSyscall() {
 
 // KernelException handles kernel exceptions.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelException(vector ring0.Vector) {
 	regs := c.Registers()
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 4ca2b7717..8b64f3a1e 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,6 +17,7 @@
 package kvm
 
 import (
+	"syscall"
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -61,3 +62,36 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
 func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext {
 	return &((*arch.SignalContext64)(context).Fpsimd64)
 }
+
+// getHypercallID returns the hypercall ID encoded by the given MMIO address,
+// or _KVM_HYPERCALL_MAX if addr is outside the hypercall MMIO window.
+//
+// On Arm64, the MMIO address should be 64-bit aligned.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+	if addr < arm64HypercallMMIOBase || addr >= (arm64HypercallMMIOBase+_AARCH64_HYPERCALL_MMIO_SIZE) {
+		return _KVM_HYPERCALL_MAX
+	}
+	return int((addr - arm64HypercallMMIOBase) >> 3)
+}
+
+// bluepillStopGuest is responsible for injecting sError.
+//
+//go:nosplit
+func bluepillStopGuest(c *vCPU) {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_VCPU_EVENTS,
+		uintptr(unsafe.Pointer(&vcpuSErr))); errno != 0 {
+		throw("sErr injection failed")
+	}
+}
+
+// bluepillReadyStopGuest checks whether the current vCPU is ready for sError injection.
+//
+//go:nosplit
+func bluepillReadyStopGuest(c *vCPU) bool {
+	return true
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9add7c944..bf357de1a 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -58,12 +58,32 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
 	return &((*arch.UContext64)(context).MContext)
 }
 
+// bluepillGuestExit is responsible for handling VM-Exit.
+//
+//go:nosplit
+func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+	// Copy out registers.
+	bluepillArchExit(c, bluepillArchContext(context))
+
+	// Return to the vCPUReady state; notify any waiters.
+	user := atomic.LoadUint32(&c.state) & vCPUUser
+	switch atomic.SwapUint32(&c.state, user) {
+	case user | vCPUGuest: // Expected case.
+	case user | vCPUGuest | vCPUWaiter:
+		c.notify()
+	default:
+		throw("invalid state")
+	}
+}
+
 // bluepillHandler is called from the signal stub.
 //
 // The world may be stopped while this is executing, and it executes on the
 // signal stack. It should only execute raw system calls and functions that are
 // explicitly marked go:nosplit.
 //
+// +checkescape:all
+//
 //go:nosplit
 func bluepillHandler(context unsafe.Pointer) {
 	// Sanitize the registers; interrupts must always be disabled.
@@ -82,7 +102,8 @@ func bluepillHandler(context unsafe.Pointer) {
 	}
 
 	for {
-		switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+		_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
+		switch errno {
 		case 0: // Expected case.
 		case syscall.EINTR:
 			// First, we process whatever pending signal
@@ -90,7 +111,7 @@ func bluepillHandler(context unsafe.Pointer) {
 			// currently, all signals are masked and the signal
 			// must have been delivered directly to this thread.
 			timeout := syscall.Timespec{}
-			sig, _, errno := syscall.RawSyscall6(
+			sig, _, errno := syscall.RawSyscall6( // escapes: no.
 				syscall.SYS_RT_SIGTIMEDWAIT,
 				uintptr(unsafe.Pointer(&bounceSignalMask)),
 				0,                                 // siginfo.
@@ -112,12 +133,12 @@ func bluepillHandler(context unsafe.Pointer) {
 			// PIC, we can't inject an interrupt while they are
 			// masked. We need to request a window if it's not
 			// ready.
-			if c.runData.readyForInterruptInjection == 0 {
-				c.runData.requestInterruptWindow = 1
-				continue // Rerun vCPU.
-			} else {
+			if bluepillReadyStopGuest(c) {
 				// Force injection below; the vCPU is ready.
 				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+			} else {
+				c.runData.requestInterruptWindow = 1
+				continue // Rerun vCPU.
 			}
 		case syscall.EFAULT:
 			// If a fault is not serviceable due to the host
@@ -125,7 +146,7 @@ func bluepillHandler(context unsafe.Pointer) {
 			// MMIO exit we receive EFAULT from the run ioctl. We
 			// always inject an NMI here since we may be in kernel
 			// mode and have interrupts disabled.
-			if _, _, errno := syscall.RawSyscall(
+			if _, _, errno := syscall.RawSyscall( // escapes: no.
 				syscall.SYS_IOCTL,
 				uintptr(c.fd),
 				_KVM_NMI, 0); errno != 0 {
@@ -156,25 +177,20 @@ func bluepillHandler(context unsafe.Pointer) {
 			c.die(bluepillArchContext(context), "debug")
 			return
 		case _KVM_EXIT_HLT:
-			// Copy out registers.
-			bluepillArchExit(c, bluepillArchContext(context))
-
-			// Return to the vCPUReady state; notify any waiters.
-			user := atomic.LoadUint32(&c.state) & vCPUUser
-			switch atomic.SwapUint32(&c.state, user) {
-			case user | vCPUGuest: // Expected case.
-			case user | vCPUGuest | vCPUWaiter:
-				c.notify()
-			default:
-				throw("invalid state")
-			}
+			bluepillGuestExit(c, context)
 			return
 		case _KVM_EXIT_MMIO:
+			physical := uintptr(c.runData.data[0])
+			if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
+				bluepillGuestExit(c, context)
+				return
+			}
+
 			// Increment the fault count.
 			atomic.AddUint32(&c.faults, 1)
 
 			// For MMIO, the physical address is the first data item.
-			physical := uintptr(c.runData.data[0])
+			physical = uintptr(c.runData.data[0])
 			virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
 			if !ok {
 				c.die(bluepillArchContext(context), "invalid physical address")
@@ -201,17 +217,7 @@ func bluepillHandler(context unsafe.Pointer) {
 				}
 			}
 		case _KVM_EXIT_IRQ_WINDOW_OPEN:
-			// Interrupt: we must have requested an interrupt
-			// window; set the interrupt line.
-			if _, _, errno := syscall.RawSyscall(
-				syscall.SYS_IOCTL,
-				uintptr(c.fd),
-				_KVM_INTERRUPT,
-				uintptr(unsafe.Pointer(&bounce))); errno != 0 {
-				throw("interrupt injection failed")
-			}
-			// Clear previous injection request.
-			c.runData.requestInterruptWindow = 0
+			bluepillStopGuest(c)
 		case _KVM_EXIT_SHUTDOWN:
 			c.die(bluepillArchContext(context), "shutdown")
 			return
diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go
new file mode 100644
index 000000000..c0b4fd374
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go
@@ -0,0 +1,51 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+func TestSegments(t *testing.T) {
+	applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
+		testutil.SetTestSegments(regs)
+		for {
+			var si arch.SignalInfo
+			if _, err := c.SwitchToUser(ring0.SwitchOpts{
+				Registers:          regs,
+				FloatingPointState: dummyFPState,
+				PageTables:         pt,
+				FullRestore:        true,
+			}, &si); err == platform.ErrContextInterrupt {
+				continue // Retry.
+			} else if err != nil {
+				t.Errorf("application segment check with full restore got unexpected error: %v", err)
+			}
+			if err := testutil.CheckTestSegments(regs); err != nil {
+				t.Errorf("application segment check with full restore failed: %v", err)
+			}
+			break // Done.
+		}
+		return false
+	})
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 29d457a7e..0b06a923a 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -26,6 +26,9 @@ type kvmOneReg struct {
 	addr uint64
 }
 
+// arm64HypercallMMIOBase is MMIO base address used to dispatch hypercalls.
+var arm64HypercallMMIOBase uintptr
+
 const KVM_NR_SPSR = 5
 
 type userFpsimdState struct {
@@ -43,6 +46,18 @@ type userRegs struct {
 	fpRegs  userFpsimdState
 }
 
+type exception struct {
+	sErrPending uint8
+	sErrHasEsr  uint8
+	pad         [6]uint8
+	sErrEsr     uint64
+}
+
+type kvmVcpuEvents struct {
+	exception
+	rsvd [12]uint32
+}
+
 // updateGlobalOnce does global initialization. It has to be called only once.
 func updateGlobalOnce(fd int) error {
 	physicalInit()
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
index 6531bae1d..48ccf8474 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
@@ -22,7 +22,8 @@ import (
 )
 
 var (
-	runDataSize int
+	runDataSize  int
+	hasGuestPCID bool
 )
 
 func updateSystemValues(fd int) error {
@@ -33,6 +34,7 @@ func updateSystemValues(fd int) error {
 	}
 	// Save the data.
 	runDataSize = int(sz)
+	hasGuestPCID = true
 
 	// Success.
 	return nil
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 1d5c77ff4..3bf918446 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -35,6 +35,8 @@ const (
 	_KVM_GET_SUPPORTED_CPUID    = 0xc008ae05
 	_KVM_SET_CPUID2             = 0x4008ae90
 	_KVM_SET_SIGNAL_MASK        = 0x4004ae8b
+	_KVM_GET_VCPU_EVENTS        = 0x8040ae9f
+	_KVM_SET_VCPU_EVENTS        = 0x4040aea0
 )
 
 // KVM exit reasons.
@@ -54,8 +56,10 @@ const (
 
 // KVM capability options.
 const (
-	_KVM_CAP_MAX_VCPUS       = 0x42
-	_KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
+	_KVM_CAP_MAX_VCPUS             = 0x42
+	_KVM_CAP_ARM_VM_IPA_SIZE       = 0xa5
+	_KVM_CAP_VCPU_EVENTS           = 0x29
+	_KVM_CAP_ARM_INJECT_SERROR_ESR = 0x9e
 )
 
 // KVM limits.
@@ -71,3 +75,13 @@ const (
 	_KVM_MEM_READONLY        = uint32(1) << 1
 	_KVM_MEM_FLAGS_NONE      = 0
 )
+
+// KVM hypercall list.
+// Canonical list of hypercalls supported.
+const (
+	// On amd64, it uses 'HLT' to leave the guest.
+	// Unlike amd64, arm64 can only use mmio_exit/psci to leave the guest.
+	// _KVM_HYPERCALL_VMEXIT is only used on Arm64 for now.
+	_KVM_HYPERCALL_VMEXIT int = iota
+	_KVM_HYPERCALL_MAX
+)
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 531ae8b1e..fdc599477 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -116,6 +116,17 @@ const (
 
 // Arm64: Exception Syndrome Register EL1.
 const (
+	_ESR_ELx_EC_SHIFT = 26
+	_ESR_ELx_EC_MASK  = 0x3F << _ESR_ELx_EC_SHIFT
+
+	_ESR_ELx_EC_IMP_DEF  = 0x1f
+	_ESR_ELx_EC_IABT_LOW = 0x20
+	_ESR_ELx_EC_IABT_CUR = 0x21
+	_ESR_ELx_EC_PC_ALIGN = 0x22
+
+	_ESR_ELx_CM  = 1 << 8
+	_ESR_ELx_WNR = 1 << 6
+
 	_ESR_ELx_FSC = 0x3F
 
 	_ESR_SEGV_MAPERR_L0 = 0x4
@@ -131,3 +142,10 @@ const (
 	_ESR_SEGV_PEMERR_L2 = 0xe
 	_ESR_SEGV_PEMERR_L3 = 0xf
 )
+
+// Arm64: MMIO base address used to dispatch hypercalls.
+const (
+	// On Arm64, the MMIO address must be 64-bit aligned.
+	// Currently, we only need 1 hypercall: hypercall_vmexit.
+	_AARCH64_HYPERCALL_MMIO_SIZE = 1 << 3
+)
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 6c8f4fa28..45b3180f1 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -262,30 +262,6 @@ func TestRegistersFault(t *testing.T) {
 	})
 }
 
-func TestSegments(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
-		testutil.SetTestSegments(regs)
-		for {
-			var si arch.SignalInfo
-			if _, err := c.SwitchToUser(ring0.SwitchOpts{
-				Registers:          regs,
-				FloatingPointState: dummyFPState,
-				PageTables:         pt,
-				FullRestore:        true,
-			}, &si); err == platform.ErrContextInterrupt {
-				continue // Retry.
-			} else if err != nil {
-				t.Errorf("application segment check with full restore got unexpected error: %v", err)
-			}
-			if err := testutil.CheckTestSegments(regs); err != nil {
-				t.Errorf("application segment check with full restore failed: %v", err)
-			}
-			break // Done.
-		}
-		return false
-	})
-}
-
 func TestBounce(t *testing.T) {
 	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		go func() {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f1afc74dc..6c54712d1 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -52,16 +52,19 @@ type machine struct {
 	// available is notified when vCPUs are available.
 	available sync.Cond
 
-	// vCPUs are the machine vCPUs.
+	// vCPUsByTID are the machine vCPUs, keyed by thread ID.
 	//
 	// These are populated dynamically.
-	vCPUs map[uint64]*vCPU
+	vCPUsByTID map[uint64]*vCPU
 
 	// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
-	vCPUsByID map[int]*vCPU
+	vCPUsByID []*vCPU
 
 	// maxVCPUs is the maximum number of vCPUs supported by the machine.
 	maxVCPUs int
+
+	// nextID is the next vCPU ID.
+	nextID uint32
 }
 
 const (
@@ -137,9 +140,8 @@ type dieState struct {
 //
 // Precondition: mu must be held.
 func (m *machine) newVCPU() *vCPU {
-	id := len(m.vCPUs)
-
 	// Create the vCPU.
+	id := int(atomic.AddUint32(&m.nextID, 1) - 1)
 	fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
 	if errno != 0 {
 		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
@@ -176,11 +178,7 @@ func (m *machine) newVCPU() *vCPU {
 // newMachine returns a new VM context.
 func newMachine(vm int) (*machine, error) {
 	// Create the machine.
-	m := &machine{
-		fd:        vm,
-		vCPUs:     make(map[uint64]*vCPU),
-		vCPUsByID: make(map[int]*vCPU),
-	}
+	m := &machine{fd: vm}
 	m.available.L = &m.mu
 	m.kernel.Init(ring0.KernelOpts{
 		PageTables: pagetables.New(newAllocator()),
@@ -194,6 +192,10 @@ func newMachine(vm int) (*machine, error) {
 	}
 	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
 
+	// Create the vCPUs map/slices.
+	m.vCPUsByTID = make(map[uint64]*vCPU)
+	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
 	// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -274,6 +276,8 @@ func newMachine(vm int) (*machine, error) {
 // not available. This attempts to be efficient for calls in the hot path.
 //
 // This panics on error.
+//
+//go:nosplit
 func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
 	for end := physical + length; physical < end; {
 		_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
@@ -304,7 +308,11 @@ func (m *machine) Destroy() {
 	runtime.SetFinalizer(m, nil)
 
 	// Destroy vCPUs.
-	for _, c := range m.vCPUs {
+	for _, c := range m.vCPUsByID {
+		if c == nil {
+			continue
+		}
+
 		// Ensure the vCPU is not still running in guest mode. This is
 		// possible iff teardown has been done by other threads, and
 		// somehow a single thread has not executed any system calls.
@@ -337,7 +345,7 @@ func (m *machine) Get() *vCPU {
 	tid := procid.Current()
 
 	// Check for an exact match.
-	if c := m.vCPUs[tid]; c != nil {
+	if c := m.vCPUsByTID[tid]; c != nil {
 		c.lock()
 		m.mu.RUnlock()
 		return c
@@ -356,7 +364,7 @@ func (m *machine) Get() *vCPU {
 	tid = procid.Current()
 
 	// Recheck for an exact match.
-	if c := m.vCPUs[tid]; c != nil {
+	if c := m.vCPUsByTID[tid]; c != nil {
 		c.lock()
 		m.mu.Unlock()
 		return c
@@ -364,10 +372,10 @@ func (m *machine) Get() *vCPU {
 
 	for {
 		// Scan for an available vCPU.
-		for origTID, c := range m.vCPUs {
+		for origTID, c := range m.vCPUsByTID {
 			if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
-				delete(m.vCPUs, origTID)
-				m.vCPUs[tid] = c
+				delete(m.vCPUsByTID, origTID)
+				m.vCPUsByTID[tid] = c
 				m.mu.Unlock()
 				c.loadSegments(tid)
 				return c
@@ -375,17 +383,17 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Create a new vCPU (maybe).
-		if len(m.vCPUs) < m.maxVCPUs {
+		if int(m.nextID) < m.maxVCPUs {
 			c := m.newVCPU()
 			c.lock()
-			m.vCPUs[tid] = c
+			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			return c
 		}
 
 		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUs {
+		for origTID, c := range m.vCPUsByTID {
 			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
 				continue
 			}
@@ -403,8 +411,8 @@ func (m *machine) Get() *vCPU {
 			}
 
 			// Steal the vCPU.
-			delete(m.vCPUs, origTID)
-			m.vCPUs[tid] = c
+			delete(m.vCPUsByTID, origTID)
+			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			return c
@@ -431,7 +439,7 @@ func (m *machine) Put(c *vCPU) {
 // newDirtySet returns a new dirty set.
 func (m *machine) newDirtySet() *dirtySet {
 	return &dirtySet{
-		vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+		vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
 	}
 }
 
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 923ce3909..acc823ba6 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -51,9 +51,10 @@ func (m *machine) initArchState() error {
 		recover()
 		debug.SetPanicOnFault(old)
 	}()
-	m.retryInGuest(func() {
-		ring0.SetCPUIDFaulting(true)
-	})
+	c := m.Get()
+	defer m.Put(c)
+	bluepill(c)
+	ring0.SetCPUIDFaulting(true)
 
 	return nil
 }
@@ -89,8 +90,8 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	defer m.mu.Unlock()
 
 	// Clear from all PCIDs.
-	for _, c := range m.vCPUs {
-		if c.PCIDs != nil {
+	for _, c := range m.vCPUsByID {
+		if c != nil && c.PCIDs != nil {
 			c.PCIDs.Drop(pt)
 		}
 	}
@@ -335,29 +336,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 	}
 }
 
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
-	c := m.Get()
-	defer m.Put(c)
-	for {
-		c.ClearErrorCode() // See below.
-		bluepill(c)        // Force guest mode.
-		fn()               // Execute the given function.
-		_, user := c.ErrorCode()
-		if user {
-			// If user is set, then we haven't bailed back to host
-			// mode via a kernel exception or system call. We
-			// consider the full function to have executed in guest
-			// mode and we can return.
-			break
-		}
-	}
-}
-
 // On x86 platform, the flags for "setMemoryRegion" can always be set as 0.
 // There is no need to return read-only physicalRegions.
 func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..290f035dd 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -154,7 +154,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) error {
 //
 //go:nosplit
 func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
-	if _, _, errno := syscall.RawSyscall(
+	if _, _, errno := syscall.RawSyscall( // escapes: no.
 		syscall.SYS_IOCTL,
 		uintptr(c.fd),
 		_KVM_GET_REGS,
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index e42505542..9db171af9 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -60,6 +60,12 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 		if !vr.accessType.Write && vr.accessType.Read {
 			rdonlyRegions = append(rdonlyRegions, vr.region)
 		}
+
+		// TODO(gvisor.dev/issue/2686): PROT_NONE should be treated specially.
+		// Workaround: treat it as read-only for now.
+		if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute {
+			rdonlyRegions = append(rdonlyRegions, vr.region)
+		}
 	})
 
 	for _, r := range rdonlyRegions {
@@ -100,7 +106,7 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	defer m.mu.Unlock()
 
 	// Clear from all PCIDs.
-	for _, c := range m.vCPUs {
+	for _, c := range m.vCPUsByID {
 		if c.PCIDs != nil {
 			c.PCIDs.Drop(pt)
 		}
@@ -119,71 +125,59 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.Acc
 	return usermem.NoAccess, platform.ErrContextSignal
 }
 
+// isInstructionAbort returns true if it is an instruction abort.
+//
+//go:nosplit
+func isInstructionAbort(code uint64) bool {
+	value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT
+	return value == _ESR_ELx_EC_IABT_LOW
+}
+
+// isWriteFault returns whether it is a write fault.
+//
+//go:nosplit
+func isWriteFault(code uint64) bool {
+	if isInstructionAbort(code) {
+		return false
+	}
+
+	return (code & _ESR_ELx_WNR) != 0
+}
+
 // fault generates an appropriate fault return.
 //
 //go:nosplit
 func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+	bluepill(c) // Probably no-op, but may not be.
 	faultAddr := c.GetFaultAddr()
 	code, user := c.ErrorCode()
 
+	if !user {
+		// The last fault serviced by this CPU was not a user
+		// fault, so we can't reliably trust the faultAddr or
+		// the code provided here. We need to re-execute.
+		return usermem.NoAccess, platform.ErrContextInterrupt
+	}
+
 	// Reset the pointed SignalInfo.
 	*info = arch.SignalInfo{Signo: signal}
 	info.SetAddr(uint64(faultAddr))
 
-	read := true
-	write := false
-	execute := true
-
 	ret := code & _ESR_ELx_FSC
 	switch ret {
 	case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
 		info.Code = 1 //SEGV_MAPERR
-		read = false
-		write = true
-		execute = false
 	case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
 		info.Code = 2 // SEGV_ACCERR.
-		read = true
-		write = false
-		execute = false
 	default:
 		info.Code = 2
 	}
 
-	if !user {
-		read = true
-		write = false
-		execute = true
-
-	}
 	accessType := usermem.AccessType{
-		Read:    read,
-		Write:   write,
-		Execute: execute,
+		Read:    !isWriteFault(uint64(code)),
+		Write:   isWriteFault(uint64(code)),
+		Execute: isInstructionAbort(uint64(code)),
 	}
 
 	return accessType, platform.ErrContextSignal
 }
-
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
-	c := m.Get()
-	defer m.Put(c)
-	for {
-		c.ClearErrorCode() // See below.
-		bluepill(c)        // Force guest mode.
-		fn()               // Execute the given function.
-		_, user := c.ErrorCode()
-		if user {
-			// If user is set, then we haven't bailed back to host
-			// mode via a kernel exception or system call. We
-			// consider the full function to have executed in guest
-			// mode and we can return.
-			break
-		}
-	}
-}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 3c02cef7c..ff8c068c0 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -78,19 +79,6 @@ func (c *vCPU) initArchState() error {
 		return err
 	}
 
-	// sctlr_el1
-	regGet.id = _KVM_ARM64_REGS_SCTLR_EL1
-	if err := c.getOneRegister(&regGet); err != nil {
-		return err
-	}
-
-	dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I)
-	data = dataGet
-	reg.id = _KVM_ARM64_REGS_SCTLR_EL1
-	if err := c.setOneRegister(&reg); err != nil {
-		return err
-	}
-
 	// tcr_el1
 	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
 	reg.id = _KVM_ARM64_REGS_TCR_EL1
@@ -159,12 +147,24 @@ func (c *vCPU) initArchState() error {
 		return err
 	}
 
+	// Use the address of the exception vector table as
+	// the MMIO address base.
+	arm64HypercallMMIOBase = toLocation
+
 	data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
 	reg.id = _KVM_ARM64_REGS_PSTATE
 	if err := c.setOneRegister(&reg); err != nil {
 		return err
 	}
 
+	// Initialize the PCID database.
+	if hasGuestPCID {
+		// Note that NewPCIDs may return a nil table here, in which
+		// case we simply don't use PCID support (see below). In
+		// practice, this should not happen, however.
+		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
+	}
+
 	c.floatingPointState = arch.NewFloatingPointData()
 	return nil
 }
@@ -243,6 +243,13 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 		return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
 	}
 
+	// Assign PCIDs.
+	if c.PCIDs != nil {
+		var requireFlushPCID bool // Force a flush?
+		switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
+		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
+	}
+
 	var vector ring0.Vector
 	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
 	c.SetTtbr0App(uintptr(ttbr0App))
@@ -269,8 +276,8 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 
 	case ring0.PageFault:
 		return c.fault(int32(syscall.SIGSEGV), info)
-	case 0xaa:
-		return usermem.NoAccess, nil
+	case ring0.Vector(bounce): // ring0.VirtualizationException
+		return usermem.NoAccess, platform.ErrContextInterrupt
 	default:
 		return usermem.NoAccess, platform.ErrContextSignal
 	}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index f04be2ab5..9f86f6a7a 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -115,7 +115,7 @@ func (a *atomicAddressSpace) get() *addressSpace {
 //
 //go:nosplit
 func (c *vCPU) notify() {
-	_, _, errno := syscall.RawSyscall6(
+	_, _, errno := syscall.RawSyscall6( // escapes: no.
 		syscall.SYS_FUTEX,
 		uintptr(unsafe.Pointer(&c.state)),
 		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index ca902c8c1..4dad877ba 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -56,5 +56,9 @@ func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
 			err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need)
 		}
 	}
+	// Check tls.
+	if need := ^uint64(11); regs.TPIDR_EL0 != need {
+		err = addRegisterMismatch(err, "tpdir_el0", regs.TPIDR_EL0, need)
+	}
 	return
 }
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 0bebee852..6caf7282d 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -52,6 +52,8 @@ start:
 
 TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
 	NO_LOCAL_POINTERS
+	// The GC will touch fpsimd state, so we should test it;
+	// see, for example, <runtime.deductSweepCredit>.
 	FMOVD $(9.9), F0
 	MOVD $SYS_GETPID, R8 // getpid
 	SVC
@@ -102,5 +104,15 @@ isNaN:
 
 TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0
 	TWIDDLE_REGS()
+	MSR R10, TPIDR_EL0
+	// Trapped in el0_svc.
 	SVC
 	RET // never reached
+
+TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0
+	TWIDDLE_REGS()
+	MSR R10, TPIDR_EL0
+	// Trapped in el0_ia.
+	// Branch to Register branches unconditionally to an address in <Rn>.
+	JMP (R6) // <=> br x6, must fault
+	RET // never reached
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index 171513f3f..4b13eec30 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -22,9 +22,9 @@ import (
 	"os"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -207,7 +207,7 @@ type AddressSpace interface {
 	// Preconditions: addr and fr must be page-aligned. fr.Length() > 0.
 	// at.Any() == true. At least one reference must be held on all pages in
 	// fr, and must continue to be held as long as pages are mapped.
-	MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error
+	MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error
 
 	// Unmap unmaps the given range.
 	//
@@ -310,52 +310,6 @@ func (f SegmentationFault) Error() string {
 	return fmt.Sprintf("segmentation fault at %#x", f.Addr)
 }
 
-// File represents a host file that may be mapped into an AddressSpace.
-type File interface {
-	// All pages in a File are reference-counted.
-
-	// IncRef increments the reference count on all pages in fr.
-	//
-	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
-	// 0. At least one reference must be held on all pages in fr. (The File
-	// interface does not provide a way to acquire an initial reference;
-	// implementors may define mechanisms for doing so.)
-	IncRef(fr FileRange)
-
-	// DecRef decrements the reference count on all pages in fr.
-	//
-	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
-	// 0. At least one reference must be held on all pages in fr.
-	DecRef(fr FileRange)
-
-	// MapInternal returns a mapping of the given file offsets in the invoking
-	// process' address space for reading and writing.
-	//
-	// Note that fr.Start and fr.End need not be page-aligned.
-	//
-	// Preconditions: fr.Length() > 0. At least one reference must be held on
-	// all pages in fr.
-	//
-	// Postconditions: The returned mapping is valid as long as at least one
-	// reference is held on the mapped pages.
-	MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
-
-	// FD returns the file descriptor represented by the File.
-	//
-	// The only permitted operation on the returned file descriptor is to map
-	// pages from it consistent with the requirements of AddressSpace.MapFile.
-	FD() int
-}
-
-// FileRange represents a range of uint64 offsets into a File.
-//
-// type FileRange <generated using go_generics>
-
-// String implements fmt.Stringer.String.
-func (fr FileRange) String() string {
-	return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
-}
-
 // Requirements is used to specify platform specific requirements.
 type Requirements struct {
 	// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 30402c2df..29fd23cc3 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/hostcpu",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
         "//pkg/sync",
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 2389423b0..c990f3454 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -24,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -616,7 +617,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp
 }
 
 // MapFile implements platform.AddressSpace.MapFile.
-func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error {
 	var flags int
 	if precommit {
 		flags |= syscall.MAP_POPULATE
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 2ae6b9f9d..0bee995e4 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index db6465663..9fd02d628 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -40,6 +40,20 @@
 
 #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
 
+// sctlr_el1: system control register el1.
+#define SCTLR_M         1 << 0
+#define SCTLR_C         1 << 2
+#define SCTLR_I         1 << 12
+#define SCTLR_UCT       1 << 15
+
+#define SCTLR_EL1_DEFAULT       (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT)
+
+// cntkctl_el1: counter-timer kernel control register el1.
+#define CNTKCTL_EL0PCTEN 	1 << 0
+#define CNTKCTL_EL0VCTEN 	1 << 1
+
+#define CNTKCTL_EL1_DEFAULT 	(CNTKCTL_EL0PCTEN | CNTKCTL_EL0VCTEN)
+
 // Saves a register set.
 //
 // This is a macro because it may need to executed in contents where a stack is
@@ -362,9 +376,17 @@ mmio_exit:
 	MOVD R1, CPU_LAZY_VFP(RSV_REG)
 	VFP_DISABLE
 
-	// MMIO_EXIT.
-	MOVD $0, R9
-	MOVD R0, 0xffff000000001000(R9)
+	// Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT.
+	//
+	// To keep it simple, the address of the exception table is used as the
+	// MMIO base address, so that an MMIO exit can be triggered by forcibly
+	// writing to a read-only region.
+	// Also, its length is enough to cover a sufficient number of hypercall IDs.
+	// Then, in host user space, this address can be examined to find out
+	// which hypercall was issued.
+	MRS VBAR_EL1, R9
+	MOVD R0, 0x0(R9)
+
 	RET
 
 // HaltAndResume halts execution and point the pointer to the resume function.
@@ -488,6 +510,14 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 // Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
 	IRQ_DISABLE
+
+	// Init.
+	MOVD $SCTLR_EL1_DEFAULT, R1
+	MSR R1, SCTLR_EL1
+
+	MOVD $CNTKCTL_EL1_DEFAULT, R1
+	MSR R1, CNTKCTL_EL1
+
 	MOVD R8, RSV_REG
 	ORR $0xffff000000000000, RSV_REG, RSV_REG
 	WORD $0xd518d092        //MSR R18, TPIDR_EL1
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 900c0bba7..021693791 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -31,23 +31,39 @@ type defaultHooks struct{}
 
 // KernelSyscall implements Hooks.KernelSyscall.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (defaultHooks) KernelSyscall() { Halt() }
+func (defaultHooks) KernelSyscall() {
+	Halt()
+}
 
 // KernelException implements Hooks.KernelException.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (defaultHooks) KernelException(Vector) { Halt() }
+func (defaultHooks) KernelException(Vector) {
+	Halt()
+}
 
 // kernelSyscall is a trampoline.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
-func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }
+func kernelSyscall(c *CPU) {
+	c.hooks.KernelSyscall()
+}
 
 // kernelException is a trampoline.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
-func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+func kernelException(c *CPU, vector Vector) {
+	c.hooks.KernelException(vector)
+}
 
 // Init initializes a new CPU.
 //
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 0feff8778..d37981dbf 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -178,6 +178,8 @@ func IsCanonical(addr uint64) bool {
 //
 // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
@@ -192,9 +194,9 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 
 	// Perform the switch.
 	swapgs()                                         // GS will be swapped on return.
-	WriteFS(uintptr(regs.Fs_base))                   // Set application FS.
-	WriteGS(uintptr(regs.Gs_base))                   // Set application GS.
-	LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
+	WriteFS(uintptr(regs.Fs_base))                   // escapes: no. Set application FS.
+	WriteGS(uintptr(regs.Gs_base))                   // escapes: no. Set application GS.
+	LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
 	jumpToKernel()                                   // Switch to upper half.
 	writeCR3(uintptr(userCR3))                       // Change to user address space.
 	if switchOpts.FullRestore {
@@ -204,8 +206,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	}
 	writeCR3(uintptr(kernelCR3))                     // Return to kernel address space.
 	jumpToUser()                                     // Return to lower half.
-	SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
-	WriteFS(uintptr(c.registers.Fs_base))            // Restore kernel FS.
+	SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
+	WriteFS(uintptr(c.registers.Fs_base))            // escapes: no. Restore kernel FS.
 	return
 }
 
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index ccacaea6b..d483ff03c 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -58,7 +58,15 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 
 	regs.Pstate &= ^uint64(UserFlagsClear)
 	regs.Pstate |= UserFlagsSet
+
+	LoadFloatingPoint(switchOpts.FloatingPointState)
+	SetTLS(regs.TPIDR_EL0)
+
 	kernelExitToEl0()
+
+	regs.TPIDR_EL0 = GetTLS()
+	SaveFloatingPoint(switchOpts.FloatingPointState)
+
 	vector = c.vecCode
 
 	// Perform the switch.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index a6345010d..00e52c8af 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -38,6 +38,12 @@ func SaveVRegs(*byte)
 // LoadVRegs loads V0-V31 registers.
 func LoadVRegs(*byte)
 
+// LoadFloatingPoint loads floating point state.
+func LoadFloatingPoint(*byte)
+
+// SaveFloatingPoint saves floating point state.
+func SaveFloatingPoint(*byte)
+
 // GetTLS returns the value of TPIDR_EL0 register.
 func GetTLS() (value uint64)
 
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index b63e14b41..86bfbe46f 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -129,3 +129,89 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
 	ISB $15
 
 	RET
+
+TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R0
+
+	MOVD 0(R0), R1
+	MOVD R1, FPSR
+	MOVD 8(R0), R1
+	MOVD R1, NZCV
+
+	FMOVD 16*1(R0), F0
+	FMOVD 16*2(R0), F1
+	FMOVD 16*3(R0), F2
+	FMOVD 16*4(R0), F3
+	FMOVD 16*5(R0), F4
+	FMOVD 16*6(R0), F5
+	FMOVD 16*7(R0), F6
+	FMOVD 16*8(R0), F7
+	FMOVD 16*9(R0), F8
+	FMOVD 16*10(R0), F9
+	FMOVD 16*11(R0), F10
+	FMOVD 16*12(R0), F11
+	FMOVD 16*13(R0), F12
+	FMOVD 16*14(R0), F13
+	FMOVD 16*15(R0), F14
+	FMOVD 16*16(R0), F15
+	FMOVD 16*17(R0), F16
+	FMOVD 16*18(R0), F17
+	FMOVD 16*19(R0), F18
+	FMOVD 16*20(R0), F19
+	FMOVD 16*21(R0), F20
+	FMOVD 16*22(R0), F21
+	FMOVD 16*23(R0), F22
+	FMOVD 16*24(R0), F23
+	FMOVD 16*25(R0), F24
+	FMOVD 16*26(R0), F25
+	FMOVD 16*27(R0), F26
+	FMOVD 16*28(R0), F27
+	FMOVD 16*29(R0), F28
+	FMOVD 16*30(R0), F29
+	FMOVD 16*31(R0), F30
+	FMOVD 16*32(R0), F31
+
+	RET
+
+TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R0
+
+	MOVD FPSR, R1
+	MOVD R1, 0(R0)
+	MOVD NZCV, R1
+	MOVD R1, 8(R0)
+
+	FMOVD F0, 16*1(R0)
+	FMOVD F1, 16*2(R0)
+	FMOVD F2, 16*3(R0)
+	FMOVD F3, 16*4(R0)
+	FMOVD F4, 16*5(R0)
+	FMOVD F5, 16*6(R0)
+	FMOVD F6, 16*7(R0)
+	FMOVD F7, 16*8(R0)
+	FMOVD F8, 16*9(R0)
+	FMOVD F9, 16*10(R0)
+	FMOVD F10, 16*11(R0)
+	FMOVD F11, 16*12(R0)
+	FMOVD F12, 16*13(R0)
+	FMOVD F13, 16*14(R0)
+	FMOVD F14, 16*15(R0)
+	FMOVD F15, 16*16(R0)
+	FMOVD F16, 16*17(R0)
+	FMOVD F17, 16*18(R0)
+	FMOVD F18, 16*19(R0)
+	FMOVD F19, 16*20(R0)
+	FMOVD F20, 16*21(R0)
+	FMOVD F21, 16*22(R0)
+	FMOVD F22, 16*23(R0)
+	FMOVD F23, 16*24(R0)
+	FMOVD F24, 16*25(R0)
+	FMOVD F25, 16*26(R0)
+	FMOVD F26, 16*27(R0)
+	FMOVD F27, 16*28(R0)
+	FMOVD F28, 16*29(R0)
+	FMOVD F29, 16*30(R0)
+	FMOVD F30, 16*31(R0)
+	FMOVD F31, 16*32(R0)
+
+	RET
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
index 23fd5c352..8d75b7599 100644
--- a/pkg/sentry/platform/ring0/pagetables/allocator.go
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -53,9 +53,14 @@ type RuntimeAllocator struct {
 
 // NewRuntimeAllocator returns an allocator that uses runtime allocation.
 func NewRuntimeAllocator() *RuntimeAllocator {
-	return &RuntimeAllocator{
-		used: make(map[*PTEs]struct{}),
-	}
+	r := new(RuntimeAllocator)
+	r.Init()
+	return r
+}
+
+// Init initializes a RuntimeAllocator.
+func (r *RuntimeAllocator) Init() {
+	r.used = make(map[*PTEs]struct{})
 }
 
 // Recycle returns freed pages to the pool.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 87e88e97d..7f18ac296 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -86,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true }
 //
 // Precondition: addr & length must be page-aligned, their sum must not overflow.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
 	if !opts.AccessType.Any() {
@@ -128,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 //
 // Precondition: addr & length must be page-aligned.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
 	w := unmapWalker{
@@ -162,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 //
 // Precondition: addr & length must be page-aligned.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
 	w := emptyWalker{
@@ -197,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false }
 
 // Lookup returns the physical address for the given virtual address.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
 	mask := uintptr(usermem.PageSize - 1)
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index c40c6d673..c0fd3425b 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -20,5 +20,6 @@ go_library(
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/usermem",
+        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e82d6cd1e..e76e498de 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/inet",
@@ -39,6 +40,8 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/primitive",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index c11e82c10..532a1ea5d 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -36,6 +36,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 const (
@@ -319,12 +321,12 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	if outLen < 0 {
 		return nil, syserr.ErrInvalidArgument
 	}
 
-	// Whitelist options and constrain option length.
+	// Only allow known and safe options.
 	optlen := getSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
@@ -364,12 +366,13 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
 	if err != nil {
 		return nil, syserr.FromError(err)
 	}
-	return opt, nil
+	optP := primitive.ByteSlice(opt)
+	return &optP, nil
 }
 
 // SetSockOpt implements socket.Socket.SetSockOpt.
 func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
-	// Whitelist options and constrain option length.
+	// Only allow known and safe options.
 	optlen := setSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
@@ -415,7 +418,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []
 
 // RecvMsg implements socket.Socket.RecvMsg.
 func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
-	// Whitelist flags.
+	// Only allow known and safe flags.
 	//
 	// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
 	// messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the
@@ -537,7 +540,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 
 // SendMsg implements socket.Socket.SendMsg.
 func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
-	// Whitelist flags.
+	// Only allow known and safe flags.
 	if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 {
 		return 0, syserr.ErrInvalidArgument
 	}
@@ -708,6 +711,6 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int
 func init() {
 	for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} {
 		socket.RegisterProvider(family, &socketProvider{family})
-		socket.RegisterProviderVFS2(family, &socketProviderVFS2{})
+		socket.RegisterProviderVFS2(family, &socketProviderVFS2{family})
 	}
 }
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 677743113..8a1d52ebf 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostfd"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -35,6 +36,7 @@ import (
 type socketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	// We store metadata for hostinet sockets internally. Technically, we should
 	// access metadata (e.g. through stat, chmod) on the host for correctness,
@@ -59,6 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
 			fd:       fd,
 		},
 	}
+	s.LockFD.Init(&vfs.FileLocks{})
 	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
 		return nil, syserr.FromError(err)
 	}
@@ -68,6 +71,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
 		DenyPWrite:        true,
 		UseDentryMetadata: true,
 	}); err != nil {
+		fdnotifier.RemoveFD(int32(s.fd))
 		return nil, syserr.FromError(err)
 	}
 	return vfsfd, nil
@@ -93,7 +97,12 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
 	return ioctl(ctx, s.fd, uio, args)
 }
 
-// PRead implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.ENODEV
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
 func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	return 0, syserror.ESPIPE
 }
@@ -131,6 +140,16 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
 	return int64(n), err
 }
 
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
+
 type socketProviderVFS2 struct {
 	family int
 }
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 789bb94c8..a9f0604ae 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -41,19 +41,6 @@ const errorTargetName = "ERROR"
 // change the destination port/destination IP for packets.
 const redirectTargetName = "REDIRECT"
 
-// Metadata is used to verify that we are correctly serializing and
-// deserializing iptables into structs consumable by the iptables tool. We save
-// a metadata struct when the tables are written, and when they are read out we
-// verify that certain fields are the same.
-//
-// metadata is used by this serialization/deserializing code, not netstack.
-type metadata struct {
-	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
-	Underflow  [linux.NF_INET_NUMHOOKS]uint32
-	NumEntries uint32
-	Size       uint32
-}
-
 // enableLogging controls whether to log the (de)serialization of netfilter
 // structs between userspace and netstack. These logs are useful when
 // developing iptables, but can pollute sentry logs otherwise.
@@ -64,6 +51,8 @@ const enableLogging = false
 var emptyFilter = stack.IPHeaderFilter{
 	Dst:     "\x00\x00\x00\x00",
 	DstMask: "\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00",
 }
 
 // nflog logs messages related to the writing and reading of iptables.
@@ -77,33 +66,17 @@ func nflog(format string, args ...interface{}) {
 func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
 	var info linux.IPTGetinfo
-	if _, err := t.CopyIn(outPtr, &info); err != nil {
+	if _, err := info.CopyIn(t, outPtr); err != nil {
 		return linux.IPTGetinfo{}, syserr.FromError(err)
 	}
 
-	// Find the appropriate table.
-	table, err := findTable(stack, info.Name)
+	_, info, err := convertNetstackToBinary(stack, info.Name)
 	if err != nil {
-		nflog("%v", err)
+		nflog("couldn't convert iptables: %v", err)
 		return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
 	}
 
-	// Get the hooks that apply to this table.
-	info.ValidHooks = table.ValidHooks()
-
-	// Grab the metadata struct, which is used to store information (e.g.
-	// the number of entries) that applies to the user's encoding of
-	// iptables, but not netstack's.
-	metadata := table.Metadata().(metadata)
-
-	// Set values from metadata.
-	info.HookEntry = metadata.HookEntry
-	info.Underflow = metadata.Underflow
-	info.NumEntries = metadata.NumEntries
-	info.Size = metadata.Size
-
 	nflog("returning info: %+v", info)
-
 	return info, nil
 }
 
@@ -111,28 +84,18 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
-	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+	if _, err := userEntries.CopyIn(t, outPtr); err != nil {
 		nflog("couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
-	// Find the appropriate table.
-	table, err := findTable(stack, userEntries.Name)
-	if err != nil {
-		nflog("%v", err)
-		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
-	}
-
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
+	entries, _, err := convertNetstackToBinary(stack, userEntries.Name)
 	if err != nil {
 		nflog("couldn't read entries: %v", err)
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
-	if meta != table.Metadata().(metadata) {
-		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
-	}
 	if binary.Size(entries) > uintptr(outLen) {
 		nflog("insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -141,48 +104,26 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
-	ipt := stk.IPTables()
-	table, ok := ipt.Tables[tablename.String()]
-	if !ok {
-		return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
-	}
-	return table, nil
-}
-
-// FillDefaultIPTables sets stack's IPTables to the default tables and
-// populates them with metadata.
-func FillDefaultIPTables(stk *stack.Stack) {
-	ipt := stack.DefaultTables()
-
-	// In order to fill in the metadata, we have to translate ipt from its
-	// netstack format to Linux's giant-binary-blob format.
-	for name, table := range ipt.Tables {
-		_, metadata, err := convertNetstackToBinary(name, table)
-		if err != nil {
-			panic(fmt.Errorf("Unable to set default IP tables: %v", err))
-		}
-		table.SetMetadata(metadata)
-		ipt.Tables[name] = table
-	}
-
-	stk.SetIPTables(ipt)
-}
-
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) {
-	// Return values.
+func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
+	table, ok := stack.IPTables().GetTable(tablename.String())
+	if !ok {
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+	}
+
 	var entries linux.KernelIPTGetEntries
-	var meta metadata
+	var info linux.IPTGetinfo
+	info.ValidHooks = table.ValidHooks()
 
 	// The table name has to fit in the struct.
 	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("table name %q too long.", tablename)
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
 	}
-	copy(entries.Name[:], tablename)
+	copy(info.Name[:], tablename[:])
+	copy(entries.Name[:], tablename[:])
 
 	for ruleIdx, rule := range table.Rules {
 		nflog("convert to binary: current offset: %d", entries.Size)
@@ -191,20 +132,20 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
 				nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
-				meta.HookEntry[hook] = entries.Size
+				info.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
 				nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
-				meta.Underflow[underflow] = entries.Size
+				info.Underflow[underflow] = entries.Size
 			}
 		}
 
 		// Each rule corresponds to an entry.
 		entry := linux.KernelIPTEntry{
-			IPTEntry: linux.IPTEntry{
+			Entry: linux.IPTEntry{
 				IP: linux.IPTIP{
 					Protocol: uint16(rule.Filter.Protocol),
 				},
@@ -212,15 +153,20 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
 				TargetOffset: linux.SizeOfIPTEntry,
 			},
 		}
-		copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst)
-		copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask)
-		copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface)
-		copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+		copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
+		copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
+		copy(entry.Entry.IP.Src[:], rule.Filter.Src)
+		copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
+		copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
+		copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
 		if rule.Filter.DstInvert {
-			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+		}
+		if rule.Filter.SrcInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
 		}
 		if rule.Filter.OutputInterfaceInvert {
-			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
 		}
 
 		for _, matcher := range rule.Matchers {
@@ -232,8 +178,8 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
 				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
 			}
 			entry.Elems = append(entry.Elems, serialized...)
-			entry.NextOffset += uint16(len(serialized))
-			entry.TargetOffset += uint16(len(serialized))
+			entry.Entry.NextOffset += uint16(len(serialized))
+			entry.Entry.TargetOffset += uint16(len(serialized))
 		}
 
 		// Serialize and append the target.
@@ -242,18 +188,18 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
 			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
 		}
 		entry.Elems = append(entry.Elems, serialized...)
-		entry.NextOffset += uint16(len(serialized))
+		entry.Entry.NextOffset += uint16(len(serialized))
 
 		nflog("convert to binary: adding entry: %+v", entry)
 
-		entries.Size += uint32(entry.NextOffset)
+		entries.Size += uint32(entry.Entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
-		meta.NumEntries++
+		info.NumEntries++
 	}
 
-	nflog("convert to binary: finished with an marshalled size of %d", meta.Size)
-	meta.Size = entries.Size
-	return entries, meta, nil
+	info.Size = entries.Size // Record the size first so the log below reports it instead of zero.
+	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+	return entries, info, nil
 }
 
 func marshalTarget(target stack.Target) []byte {
@@ -396,10 +342,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// TODO(gvisor.dev/issue/170): Support other tables.
 	var table stack.Table
 	switch replace.Name.String() {
-	case stack.TablenameFilter:
+	case stack.FilterTable:
 		table = stack.EmptyFilterTable()
-	case stack.TablenameNat:
-		table = stack.EmptyNatTable()
+	case stack.NATTable:
+		table = stack.EmptyNATTable()
 	default:
 		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
@@ -485,6 +431,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, _ := range replace.HookEntry {
 		if table.ValidHooks()&(1<<hook) != 0 {
 			hk := hookFromLinux(hook)
+			table.BuiltinChains[hk] = stack.HookUnset
+			table.Underflows[hk] = stack.HookUnset
 			for offset, ruleIdx := range offsets {
 				if offset == replace.HookEntry[hook] {
 					table.BuiltinChains[hk] = ruleIdx
@@ -510,8 +458,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Add the user chains.
 	for ruleIdx, rule := range table.Rules {
-		target, ok := rule.Target.(stack.UserChainTarget)
-		if !ok {
+		if _, ok := rule.Target.(stack.UserChainTarget); !ok {
 			continue
 		}
 
@@ -527,7 +474,6 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 			nflog("user chain's first node must have no matchers")
 			return syserr.ErrInvalidArgument
 		}
-		table.UserChains[target.Name] = ruleIdx + 1
 	}
 
 	// Set each jump to point to the appropriate rule. Right now they hold byte
@@ -553,7 +499,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now,
 	// make sure all other chains point to ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
-		if hook == stack.Forward || hook == stack.Postrouting {
+		if hook := stack.Hook(hook); hook == stack.Forward || hook == stack.Postrouting {
+			if ruleIdx == stack.HookUnset {
+				continue
+			}
 			if !isUnconditionalAccept(table.Rules[ruleIdx]) {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
@@ -566,17 +515,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	ipt := stk.IPTables()
-	table.SetMetadata(metadata{
-		HookEntry:  replace.HookEntry,
-		Underflow:  replace.Underflow,
-		NumEntries: replace.NumEntries,
-		Size:       replace.Size,
-	})
-	ipt.Tables[replace.Name.String()] = table
-	stk.SetIPTables(ipt)
-
-	return nil
+	return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table))
 }
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
@@ -737,6 +676,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
 		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
 	}
+	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
 
 	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
 	if n == -1 {
@@ -755,6 +697,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 		Dst:                   tcpip.Address(iptip.Dst[:]),
 		DstMask:               tcpip.Address(iptip.DstMask[:]),
 		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
 		OutputInterface:       ifname,
 		OutputInterfaceMask:   ifnameMask,
 		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
@@ -765,15 +710,13 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 	// The following features are supported:
 	// - Protocol
 	// - Dst and DstMask
+	// - Src and SrcMask, and the inverse source IP check flag
 	// - The inverse destination IP check flag
 	// - OutputInterface, OutputInterfaceMask and its inverse.
-	var emptyInetAddr = linux.InetAddr{}
 	var emptyInterface = [linux.IFNAMSIZ]byte{}
 	// Disable any supported inverse flags.
-	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_VIA_OUT)
-	return iptip.Src != emptyInetAddr ||
-		iptip.SrcMask != emptyInetAddr ||
-		iptip.InputInterface != emptyInterface ||
+	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
 		iptip.InputInterfaceMask != emptyInterface ||
 		iptip.Flags != 0 ||
 		iptip.InverseFlags&^inverseMask != 0
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
index 3863293c7..1b4e0ad79 100644
--- a/pkg/sentry/socket/netfilter/owner_matcher.go
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -111,7 +111,7 @@ func (*OwnerMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	// Support only for OUTPUT chain.
 	// TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also.
 	if hook != stack.Output {
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 84abe8d29..b91ba3ab3 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -30,6 +30,6 @@ type JumpTarget struct {
 }
 
 // Action implements stack.Target.Action.
-func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
+func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
 	return stack.RuleJump, jt.RuleNum
 }
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 57a1e1c12..4f98ee2d5 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -96,7 +96,7 @@ func (*TCPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
@@ -111,36 +111,10 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		return false, false
 	}
 
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the stack.Check codepath as matchers are
-	// added.
-	var tcpHeader header.TCP
-	if pkt.TransportHeader != nil {
-		tcpHeader = header.TCP(pkt.TransportHeader)
-	} else {
-		var length int
-		if hook == stack.Prerouting {
-			// The network header hasn't been parsed yet. We have to do it here.
-			hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-			if !ok {
-				// There's no valid TCP header here, so we hotdrop the
-				// packet.
-				return false, true
-			}
-			h := header.IPv4(hdr)
-			pkt.NetworkHeader = hdr
-			length = int(h.HeaderLength())
-		}
-		// The TCP header hasn't been parsed yet. We have to do it here.
-		hdr, ok := pkt.Data.PullUp(length + header.TCPMinimumSize)
-		if !ok {
-			// There's no valid TCP header here, so we hotdrop the
-			// packet.
-			return false, true
-		}
-		tcpHeader = header.TCP(hdr[length:])
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if len(tcpHeader) < header.TCPMinimumSize {
+		// There's no valid TCP header here, so we drop the packet immediately.
+		return false, true
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index cfa9e621d..3f20fc891 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -93,7 +93,7 @@ func (*UDPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
@@ -110,36 +110,10 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		return false, false
 	}
 
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the stack.Check codepath as matchers are
-	// added.
-	var udpHeader header.UDP
-	if pkt.TransportHeader != nil {
-		udpHeader = header.UDP(pkt.TransportHeader)
-	} else {
-		var length int
-		if hook == stack.Prerouting {
-			// The network header hasn't been parsed yet. We have to do it here.
-			hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-			if !ok {
-				// There's no valid UDP header here, so we hotdrop the
-				// packet.
-				return false, true
-			}
-			h := header.IPv4(hdr)
-			pkt.NetworkHeader = hdr
-			length = int(h.HeaderLength())
-		}
-		// The UDP header hasn't been parsed yet. We have to do it here.
-		hdr, ok := pkt.Data.PullUp(length + header.UDPMinimumSize)
-		if !ok {
-			// There's no valid UDP header here, so we hotdrop the
-			// packet.
-			return false, true
-		}
-		udpHeader = header.UDP(hdr[length:])
+	udpHeader := header.UDP(pkt.TransportHeader)
+	if len(udpHeader) < header.UDPMinimumSize {
+		// There's no valid UDP header here, so we drop the packet immediately.
+		return false, true
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 7212d8644..0546801bf 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -20,6 +20,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -35,6 +36,8 @@ go_library(
         "//pkg/tcpip",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/primitive",
     ],
 )
 
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 81f34c5a2..98ca7add0 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -38,6 +38,8 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 const sizeOfInt32 int = 4
@@ -330,7 +332,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		switch name {
@@ -340,24 +342,26 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
 			}
 			s.mu.Lock()
 			defer s.mu.Unlock()
-			return int32(s.sendBufferSize), nil
+			sendBufferSizeP := primitive.Int32(s.sendBufferSize)
+			return &sendBufferSizeP, nil
 
 		case linux.SO_RCVBUF:
 			if outLen < sizeOfInt32 {
 				return nil, syserr.ErrInvalidArgument
 			}
 			// We don't have limit on receiving size.
-			return int32(math.MaxInt32), nil
+			recvBufferSizeP := primitive.Int32(math.MaxInt32)
+			return &recvBufferSizeP, nil
 
 		case linux.SO_PASSCRED:
 			if outLen < sizeOfInt32 {
 				return nil, syserr.ErrInvalidArgument
 			}
-			var passcred int32
+			var passcred primitive.Int32
 			if s.Passcred() {
 				passcred = 1
 			}
-			return passcred, nil
+			return &passcred, nil
 
 		default:
 			socket.GetSockOptEmitUnimplementedEvent(t, name)
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
index b854bf990..dbcd8b49a 100644
--- a/pkg/sentry/socket/netlink/socket_vfs2.go
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
@@ -40,6 +41,7 @@ type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	socketOpsCommon
 }
@@ -66,7 +68,7 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
 		return nil, err
 	}
 
-	return &SocketVFS2{
+	fd := &SocketVFS2{
 		socketOpsCommon: socketOpsCommon{
 			ports:          t.Kernel().NetlinkPorts(),
 			protocol:       protocol,
@@ -75,7 +77,9 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
 			connection:     connection,
 			sendBufferSize: defaultSendBufferSize,
 		},
-	}, nil
+	}
+	fd.LockFD.Init(&vfs.FileLocks{})
+	return fd, nil
 }
 
 // Readiness implements waiter.Waitable.Readiness.
@@ -136,3 +140,13 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
 	n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
 	return int64(n), err.ToError()
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX by forwarding every argument, together with this socket's vfsfd, to s.Locks().LockPOSIX and returning its error.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by forwarding every argument, together with this socket's vfsfd, to s.Locks().UnlockPOSIX and returning its error.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 333e0042e..1fb777a6c 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -28,6 +28,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
@@ -50,5 +51,8 @@ go_library(
         "//pkg/tcpip/transport/udp",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/primitive",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 60df51dae..31a168f7e 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -26,6 +26,7 @@ package netstack
 
 import (
 	"bytes"
+	"fmt"
 	"io"
 	"math"
 	"reflect"
@@ -33,6 +34,7 @@ import (
 	"syscall"
 	"time"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/binary"
@@ -60,6 +62,8 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 func mustCreateMetric(name, description string) *tcpip.StatCounter {
@@ -190,6 +194,8 @@ var Metrics = tcpip.Stats{
 		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
 		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
 		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
+		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
+		InvalidSourceAddress:     mustCreateMetric("/netstack/udp/invalid_source", "Number of UDP datagrams dropped due to invalid source address."),
 	},
 }
 
@@ -294,8 +300,9 @@ type socketOpsCommon struct {
 	readView buffer.View
 	// readCM holds control message information for the last packet read
 	// from Endpoint.
-	readCM tcpip.ControlMessages
-	sender tcpip.FullAddress
+	readCM         tcpip.ControlMessages
+	sender         tcpip.FullAddress
+	linkPacketInfo tcpip.LinkPacketInfo
 
 	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
 	// of returned messages can be returned via control messages. When
@@ -416,7 +423,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
 			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
 		}
 
-		// TODO(b/129292371): Return protocol too.
+		// TODO(gvisor.dev/issue/173): Return protocol too.
 		return tcpip.FullAddress{
 			NIC:  tcpip.NICID(a.InterfaceIndex),
 			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
@@ -444,8 +451,21 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
 	}
 	s.readView = nil
 	s.sender = tcpip.FullAddress{}
+	s.linkPacketInfo = tcpip.LinkPacketInfo{}
 
-	v, cms, err := s.Endpoint.Read(&s.sender)
+	var v buffer.View
+	var cms tcpip.ControlMessages
+	var err *tcpip.Error
+
+	switch e := s.Endpoint.(type) {
+	// The ordering of these interfaces matters. The most specific
+	// interfaces must be specified before the more generic Endpoint
+	// interface.
+	case tcpip.PacketEndpoint:
+		v, cms, err = e.ReadPacket(&s.sender, &s.linkPacketInfo)
+	case tcpip.Endpoint:
+		v, cms, err = e.Read(&s.sender)
+	}
 	if err != nil {
 		atomic.StoreUint32(&s.readViewHasData, 0)
 		return syserr.TranslateNetstackError(err)
@@ -719,6 +739,14 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
 	defer s.EventUnregister(&e)
 
 	if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
+		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
+			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
+			// find an available local ephemeral port.
+			if err == tcpip.ErrNoPortAvailable {
+				return syserr.ErrAddressNotAvailable
+			}
+		}
+
 		return syserr.TranslateNetstackError(err)
 	}
 
@@ -884,7 +912,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
 	// implemented specifically for netstack.SocketOperations rather than
 	// commonEndpoint. commonEndpoint should be extended to support socket
@@ -894,25 +922,25 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
-		val := int32(0)
+		val := primitive.Int32(0)
 		s.readMu.Lock()
 		defer s.readMu.Unlock()
 		if s.sockOptTimestamp {
 			val = 1
 		}
-		return val, nil
+		return &val, nil
 	}
 	if level == linux.SOL_TCP && name == linux.TCP_INQ {
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
-		val := int32(0)
+		val := primitive.Int32(0)
 		s.readMu.Lock()
 		defer s.readMu.Unlock()
 		if s.sockOptInq {
 			val = 1
 		}
-		return val, nil
+		return &val, nil
 	}
 
 	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
@@ -930,7 +958,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 			if err != nil {
 				return nil, err
 			}
-			return info, nil
+			return &info, nil
 
 		case linux.IPT_SO_GET_ENTRIES:
 			if outLen < linux.SizeOfIPTGetEntries {
@@ -945,7 +973,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 			if err != nil {
 				return nil, err
 			}
-			return entries, nil
+			return &entries, nil
 
 		}
 	}
@@ -955,7 +983,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 
 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
 // sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -988,7 +1016,7 @@ func boolToInt32(v bool) int32 {
 }
 
 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
 	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
 	switch name {
 	case linux.SO_ERROR:
@@ -999,9 +1027,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		// Get the last error and convert it.
 		err := ep.GetSockOpt(tcpip.ErrorOption{})
 		if err == nil {
-			return int32(0), nil
+			optP := primitive.Int32(0)
+			return &optP, nil
 		}
-		return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil
+
+		optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux().Number())
+		return &optP, nil
 
 	case linux.SO_PEERCRED:
 		if family != linux.AF_UNIX || outLen < syscall.SizeofUcred {
@@ -1009,11 +1040,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		}
 
 		tcred := t.Credentials()
-		return syscall.Ucred{
-			Pid: int32(t.ThreadGroup().ID()),
-			Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
-			Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
-		}, nil
+		creds := linux.ControlMessageCredentials{
+			PID: int32(t.ThreadGroup().ID()),
+			UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
+			GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
+		}
+		return &creds, nil
 
 	case linux.SO_PASSCRED:
 		if outLen < sizeOfInt32 {
@@ -1024,7 +1056,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.SO_SNDBUF:
 		if outLen < sizeOfInt32 {
@@ -1040,7 +1074,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			size = math.MaxInt32
 		}
 
-		return int32(size), nil
+		sizeP := primitive.Int32(size)
+		return &sizeP, nil
 
 	case linux.SO_RCVBUF:
 		if outLen < sizeOfInt32 {
@@ -1056,7 +1091,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			size = math.MaxInt32
 		}
 
-		return int32(size), nil
+		sizeP := primitive.Int32(size)
+		return &sizeP, nil
 
 	case linux.SO_REUSEADDR:
 		if outLen < sizeOfInt32 {
@@ -1067,7 +1103,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.SO_REUSEPORT:
 		if outLen < sizeOfInt32 {
@@ -1078,7 +1115,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.SO_BINDTODEVICE:
 		var v tcpip.BindToDeviceOption
@@ -1086,7 +1125,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.TranslateNetstackError(err)
 		}
 		if v == 0 {
-			return []byte{}, nil
+			var b primitive.ByteSlice
+			return &b, nil
 		}
 		if outLen < linux.IFNAMSIZ {
 			return nil, syserr.ErrInvalidArgument
@@ -1101,7 +1141,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			// interface was removed.
 			return nil, syserr.ErrUnknownDevice
 		}
-		return append([]byte(nic.Name), 0), nil
+
+		name := primitive.ByteSlice(append([]byte(nic.Name), 0))
+		return &name, nil
 
 	case linux.SO_BROADCAST:
 		if outLen < sizeOfInt32 {
@@ -1112,7 +1154,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.SO_KEEPALIVE:
 		if outLen < sizeOfInt32 {
@@ -1123,13 +1167,17 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.SO_LINGER:
 		if outLen < linux.SizeOfLinger {
 			return nil, syserr.ErrInvalidArgument
 		}
-		return linux.Linger{}, nil
+
+		linger := linux.Linger{}
+		return &linger, nil
 
 	case linux.SO_SNDTIMEO:
 		// TODO(igudger): Linux allows shorter lengths for partial results.
@@ -1137,7 +1185,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		return linux.NsecToTimeval(s.SendTimeout()), nil
+		sendTimeout := linux.NsecToTimeval(s.SendTimeout())
+		return &sendTimeout, nil
 
 	case linux.SO_RCVTIMEO:
 		// TODO(igudger): Linux allows shorter lengths for partial results.
@@ -1145,7 +1194,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		return linux.NsecToTimeval(s.RecvTimeout()), nil
+		recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
+		return &recvTimeout, nil
 
 	case linux.SO_OOBINLINE:
 		if outLen < sizeOfInt32 {
@@ -1157,7 +1207,20 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		return int32(v), nil
+		vP := primitive.Int32(v)
+		return &vP, nil
+
+	case linux.SO_NO_CHECK:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.NoChecksumOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	default:
 		socket.GetSockOptEmitUnimplementedEvent(t, name)
@@ -1166,7 +1229,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 }
 
 // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
-func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.TCP_NODELAY:
 		if outLen < sizeOfInt32 {
@@ -1177,7 +1240,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(!v), nil
+
+		vP := primitive.Int32(boolToInt32(!v))
+		return &vP, nil
 
 	case linux.TCP_CORK:
 		if outLen < sizeOfInt32 {
@@ -1188,7 +1253,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.TCP_QUICKACK:
 		if outLen < sizeOfInt32 {
@@ -1199,7 +1266,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.TCP_MAXSEG:
 		if outLen < sizeOfInt32 {
@@ -1210,8 +1279,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		vP := primitive.Int32(v)
+		return &vP, nil
 
 	case linux.TCP_KEEPIDLE:
 		if outLen < sizeOfInt32 {
@@ -1222,8 +1291,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(time.Duration(v) / time.Second), nil
+		keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
+		return &keepAliveIdle, nil
 
 	case linux.TCP_KEEPINTVL:
 		if outLen < sizeOfInt32 {
@@ -1234,8 +1303,20 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
+		keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
+		return &keepAliveInterval, nil
 
-		return int32(time.Duration(v) / time.Second), nil
+	case linux.TCP_KEEPCNT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		vP := primitive.Int32(v)
+		return &vP, nil
 
 	case linux.TCP_USER_TIMEOUT:
 		if outLen < sizeOfInt32 {
@@ -1246,8 +1327,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(time.Duration(v) / time.Millisecond), nil
+		tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
+		return &tcpUserTimeout, nil
 
 	case linux.TCP_INFO:
 		var v tcpip.TCPInfoOption
@@ -1260,12 +1341,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		info := linux.TCPInfo{}
 
 		// Linux truncates the output binary to outLen.
-		ib := binary.Marshal(nil, usermem.ByteOrder, &info)
-		if len(ib) > outLen {
-			ib = ib[:outLen]
+		buf := t.CopyScratchBuffer(info.SizeBytes())
+		info.MarshalUnsafe(buf)
+		if len(buf) > outLen {
+			buf = buf[:outLen]
 		}
-
-		return ib, nil
+		bufP := primitive.ByteSlice(buf)
+		return &bufP, nil
 
 	case linux.TCP_CC_INFO,
 		linux.TCP_NOTSENT_LOWAT,
@@ -1295,7 +1377,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		}
 		b := make([]byte, toCopy)
 		copy(b, v)
-		return b, nil
+
+		bP := primitive.ByteSlice(b)
+		return &bP, nil
 
 	case linux.TCP_LINGER2:
 		if outLen < sizeOfInt32 {
@@ -1307,7 +1391,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		return int32(time.Duration(v) / time.Second), nil
+		lingerTimeout := primitive.Int32(time.Duration(v) / time.Second)
+		return &lingerTimeout, nil
 
 	case linux.TCP_DEFER_ACCEPT:
 		if outLen < sizeOfInt32 {
@@ -1319,7 +1404,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		return int32(time.Duration(v) / time.Second), nil
+		tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
+		return &tcpDeferAccept, nil
 
 	case linux.TCP_SYNCNT:
 		if outLen < sizeOfInt32 {
@@ -1330,8 +1416,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		vP := primitive.Int32(v)
+		return &vP, nil
 
 	case linux.TCP_WINDOW_CLAMP:
 		if outLen < sizeOfInt32 {
@@ -1342,8 +1428,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		vP := primitive.Int32(v)
+		return &vP, nil
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1351,7 +1437,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 }
 
 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
-func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if outLen < sizeOfInt32 {
@@ -1362,7 +1448,9 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1370,21 +1458,24 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 	case linux.IPV6_TCLASS:
 		// Length handling for parity with Linux.
 		if outLen == 0 {
-			return make([]byte, 0), nil
+			var b primitive.ByteSlice
+			return &b, nil
 		}
 		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		uintv := uint32(v)
+		uintv := primitive.Uint32(v)
 		// Linux truncates the output binary to outLen.
-		ib := binary.Marshal(nil, usermem.ByteOrder, &uintv)
+		ib := t.CopyScratchBuffer(uintv.SizeBytes())
+		uintv.MarshalUnsafe(ib)
 		// Handle cases where outLen is lesser than sizeOfInt32.
 		if len(ib) > outLen {
 			ib = ib[:outLen]
 		}
-		return ib, nil
+		ibP := primitive.ByteSlice(ib)
+		return &ibP, nil
 
 	case linux.IPV6_RECVTCLASS:
 		if outLen < sizeOfInt32 {
@@ -1395,7 +1486,13 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
+
+	case linux.SO_ORIGINAL_DST:
+		// TODO(gvisor.dev/issue/170): ip6tables.
+		return nil, syserr.ErrInvalidArgument
 
 	default:
 		emitUnimplementedEventIPv6(t, name)
@@ -1404,7 +1501,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 }
 
 // getSockOptIP implements GetSockOpt when level is SOL_IP.
-func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) {
+func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.IP_TTL:
 		if outLen < sizeOfInt32 {
@@ -1417,11 +1514,12 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 
 		// Fill in the default value, if needed.
-		if v == 0 {
-			v = DefaultTTL
+		vP := primitive.Int32(v)
+		if vP == 0 {
+			vP = DefaultTTL
 		}
 
-		return int32(v), nil
+		return &vP, nil
 
 	case linux.IP_MULTICAST_TTL:
 		if outLen < sizeOfInt32 {
@@ -1433,7 +1531,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		return int32(v), nil
+		vP := primitive.Int32(v)
+		return &vP, nil
 
 	case linux.IP_MULTICAST_IF:
 		if outLen < len(linux.InetAddr{}) {
@@ -1447,7 +1546,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 
 		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
 
-		return a.(*linux.SockAddrInet).Addr, nil
+		return &a.(*linux.SockAddrInet).Addr, nil
 
 	case linux.IP_MULTICAST_LOOP:
 		if outLen < sizeOfInt32 {
@@ -1458,21 +1557,26 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.IP_TOS:
 		// Length handling for parity with Linux.
 		if outLen == 0 {
-			return []byte(nil), nil
+			var b primitive.ByteSlice
+			return &b, nil
 		}
 		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 		if outLen < sizeOfInt32 {
-			return uint8(v), nil
+			vP := primitive.Uint8(v)
+			return &vP, nil
 		}
-		return int32(v), nil
+		vP := primitive.Int32(v)
+		return &vP, nil
 
 	case linux.IP_RECVTOS:
 		if outLen < sizeOfInt32 {
@@ -1483,7 +1587,9 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
 
 	case linux.IP_PKTINFO:
 		if outLen < sizeOfInt32 {
@@ -1494,7 +1600,22 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		return boolToInt32(v), nil
+
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
+
+	case linux.SO_ORIGINAL_DST:
+		if outLen < int(binary.Size(linux.SockAddrInet{})) {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.OriginalDestinationOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
+		return a.(*linux.SockAddrInet), nil
 
 	default:
 		emitUnimplementedEventIP(t, name)
@@ -1698,6 +1819,14 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v)))
 
+	case linux.SO_NO_CHECK:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := usermem.ByteOrder.Uint32(optVal)
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0))
+
 	case linux.SO_LINGER:
 		if len(optVal) < linux.SizeOfLinger {
 			return syserr.ErrInvalidArgument
@@ -1712,6 +1841,11 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 
 		return nil
 
+	case linux.SO_DETACH_FILTER:
+		// optval is ignored.
+		var v tcpip.SocketDetachFilterOption
+		return syserr.TranslateNetstackError(ep.SetSockOpt(v))
+
 	default:
 		socket.SetSockOptEmitUnimplementedEvent(t, name)
 	}
@@ -1777,6 +1911,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_KEEPCNT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := usermem.ByteOrder.Uint32(optVal)
+		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
+			return syserr.ErrInvalidArgument
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
+
 	case linux.TCP_USER_TIMEOUT:
 		if len(optVal) < sizeOfInt32 {
 			return syserr.ErrInvalidArgument
@@ -2060,13 +2205,22 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
 
+	case linux.IP_HDRINCL:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
 		linux.IP_CHECKSUM,
 		linux.IP_DROP_SOURCE_MEMBERSHIP,
 		linux.IP_FREEBIND,
-		linux.IP_HDRINCL,
 		linux.IP_IPSEC_POLICY,
 		linux.IP_MINTTL,
 		linux.IP_MSFILTER,
@@ -2106,30 +2260,20 @@ func emitUnimplementedEventTCP(t *kernel.Task, name int) {
 	switch name {
 	case linux.TCP_CONGESTION,
 		linux.TCP_CORK,
-		linux.TCP_DEFER_ACCEPT,
 		linux.TCP_FASTOPEN,
 		linux.TCP_FASTOPEN_CONNECT,
 		linux.TCP_FASTOPEN_KEY,
 		linux.TCP_FASTOPEN_NO_COOKIE,
-		linux.TCP_KEEPCNT,
-		linux.TCP_KEEPIDLE,
-		linux.TCP_KEEPINTVL,
-		linux.TCP_LINGER2,
-		linux.TCP_MAXSEG,
 		linux.TCP_QUEUE_SEQ,
-		linux.TCP_QUICKACK,
 		linux.TCP_REPAIR,
 		linux.TCP_REPAIR_QUEUE,
 		linux.TCP_REPAIR_WINDOW,
 		linux.TCP_SAVED_SYN,
 		linux.TCP_SAVE_SYN,
-		linux.TCP_SYNCNT,
 		linux.TCP_THIN_DUPACK,
 		linux.TCP_THIN_LINEAR_TIMEOUTS,
 		linux.TCP_TIMESTAMP,
-		linux.TCP_ULP,
-		linux.TCP_USER_TIMEOUT,
-		linux.TCP_WINDOW_CLAMP:
+		linux.TCP_ULP:
 
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
@@ -2291,7 +2435,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 		return &out, uint32(sockAddrInet6Size)
 
 	case linux.AF_PACKET:
-		// TODO(b/129292371): Return protocol too.
+		// TODO(gvisor.dev/issue/173): Return protocol too.
 		var out linux.SockAddrLink
 		out.Family = linux.AF_PACKET
 		out.InterfaceIndex = int32(addr.NIC)
@@ -2397,6 +2541,23 @@ func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) {
 	cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
 }
 
+// toLinuxPacketType maps a tcpip packet type to Linux's PACKET_* value; panics if unknown.
+func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
+	switch pktType {
+	case tcpip.PacketHost:
+		return linux.PACKET_HOST
+	case tcpip.PacketOtherHost:
+		return linux.PACKET_OTHERHOST
+	case tcpip.PacketOutgoing:
+		return linux.PACKET_OUTGOING
+	case tcpip.PacketBroadcast:
+		return linux.PACKET_BROADCAST
+	case tcpip.PacketMulticast:
+		return linux.PACKET_MULTICAST
+	}
+	panic(fmt.Sprintf("unknown packet type: %d", pktType))
+}
+
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
@@ -2452,6 +2613,11 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq
 	var addrLen uint32
 	if isPacket && senderRequested {
 		addr, addrLen = ConvertAddress(s.family, s.sender)
+		switch v := addr.(type) {
+		case *linux.SockAddrLink:
+			v.Protocol = htons(uint16(s.linkPacketInfo.Protocol))
+			v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType)
+		}
 	}
 
 	if peek {
@@ -2686,11 +2852,16 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 }
 
 func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		panic("ioctl(2) may only be called from a task goroutine")
+	}
+
 	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
 	// sockets.
 	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
 	switch args[1].Int() {
-	case syscall.SIOCGSTAMP:
+	case linux.SIOCGSTAMP:
 		s.readMu.Lock()
 		defer s.readMu.Unlock()
 		if !s.timestampValid {
@@ -2698,9 +2869,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
 		}
 
 		tv := linux.NsecToTimeval(s.timestampNS)
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err := tv.CopyOut(t, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCINQ:
@@ -2719,9 +2888,8 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
 		}
 
 		// Copy result to userspace.
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		vP := primitive.Int32(v)
+		_, err := vP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	}
 
@@ -2730,52 +2898,49 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
 
 // Ioctl performs a socket ioctl.
 func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		panic("ioctl(2) may only be called from a task goroutine")
+	}
+
 	switch arg := int(args[1].Int()); arg {
-	case syscall.SIOCGIFFLAGS,
-		syscall.SIOCGIFADDR,
-		syscall.SIOCGIFBRDADDR,
-		syscall.SIOCGIFDSTADDR,
-		syscall.SIOCGIFHWADDR,
-		syscall.SIOCGIFINDEX,
-		syscall.SIOCGIFMAP,
-		syscall.SIOCGIFMETRIC,
-		syscall.SIOCGIFMTU,
-		syscall.SIOCGIFNAME,
-		syscall.SIOCGIFNETMASK,
-		syscall.SIOCGIFTXQLEN:
+	case linux.SIOCGIFFLAGS,
+		linux.SIOCGIFADDR,
+		linux.SIOCGIFBRDADDR,
+		linux.SIOCGIFDSTADDR,
+		linux.SIOCGIFHWADDR,
+		linux.SIOCGIFINDEX,
+		linux.SIOCGIFMAP,
+		linux.SIOCGIFMETRIC,
+		linux.SIOCGIFMTU,
+		linux.SIOCGIFNAME,
+		linux.SIOCGIFNETMASK,
+		linux.SIOCGIFTXQLEN,
+		linux.SIOCETHTOOL:
 
 		var ifr linux.IFReq
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
 			return 0, err.ToError()
 		}
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err := ifr.CopyOut(t, args[2].Pointer())
 		return 0, err
 
-	case syscall.SIOCGIFCONF:
+	case linux.SIOCGIFCONF:
 		// Return a list of interface addresses or the buffer size
 		// necessary to hold the list.
 		var ifc linux.IFConf
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 
-		if err := ifconfIoctl(ctx, io, &ifc); err != nil {
+		if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
 			return 0, err
 		}
 
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
-
+		_, err := ifc.CopyOut(t, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCINQ:
@@ -2788,9 +2953,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 			v = math.MaxInt32
 		}
 		// Copy result to userspace.
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		vP := primitive.Int32(v)
+		_, err := vP.CopyOut(t, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCOUTQ:
@@ -2804,9 +2968,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 		}
 
 		// Copy result to userspace.
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		vP := primitive.Int32(v)
+		_, err := vP.CopyOut(t, args[2].Pointer())
 		return 0, err
 
 	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
@@ -2832,7 +2995,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 
 	// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
 	// identify a device.
-	if arg == syscall.SIOCGIFNAME {
+	if arg == linux.SIOCGIFNAME {
 		// Gets the name of the interface given the interface index
 		// stored in ifr_ifindex.
 		index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4]))
@@ -2855,21 +3018,28 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 	}
 
 	switch arg {
-	case syscall.SIOCGIFINDEX:
+	case linux.SIOCGIFINDEX:
 		// Copy out the index to the data.
 		usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
 
-	case syscall.SIOCGIFHWADDR:
+	case linux.SIOCGIFHWADDR:
 		// Copy the hardware address out.
-		ifr.Data[0] = 6 // IEEE802.2 arp type.
-		ifr.Data[1] = 0
+		//
+		// Refer: https://linux.die.net/man/7/netdevice
+		// SIOCGIFHWADDR, SIOCSIFHWADDR
+		//
+		// Get or set the hardware address of a device using
+		// ifr_hwaddr. The hardware address is specified in a struct
+		// sockaddr. sa_family contains the ARPHRD_* device type,
+		// sa_data the L2 hardware address starting from byte 0. Setting
+		// the hardware address is a privileged operation.
+		usermem.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
 		n := copy(ifr.Data[2:], iface.Addr)
 		for i := 2 + n; i < len(ifr.Data); i++ {
 			ifr.Data[i] = 0 // Clear padding.
 		}
-		usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n))
 
-	case syscall.SIOCGIFFLAGS:
+	case linux.SIOCGIFFLAGS:
 		f, err := interfaceStatusFlags(stack, iface.Name)
 		if err != nil {
 			return err
@@ -2878,7 +3048,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 		// matches Linux behavior.
 		usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
 
-	case syscall.SIOCGIFADDR:
+	case linux.SIOCGIFADDR:
 		// Copy the IPv4 address out.
 		for _, addr := range stack.InterfaceAddrs()[index] {
 			// This ioctl is only compatible with AF_INET addresses.
@@ -2889,32 +3059,32 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 			break
 		}
 
-	case syscall.SIOCGIFMETRIC:
+	case linux.SIOCGIFMETRIC:
 		// Gets the metric of the device. As per netdevice(7), this
 		// always just sets ifr_metric to 0.
 		usermem.ByteOrder.PutUint32(ifr.Data[:4], 0)
 
-	case syscall.SIOCGIFMTU:
+	case linux.SIOCGIFMTU:
 		// Gets the MTU of the device.
 		usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
 
-	case syscall.SIOCGIFMAP:
+	case linux.SIOCGIFMAP:
 		// Gets the hardware parameters of the device.
 		// TODO(gvisor.dev/issue/505): Implement.
 
-	case syscall.SIOCGIFTXQLEN:
+	case linux.SIOCGIFTXQLEN:
 		// Gets the transmit queue length of the device.
 		// TODO(gvisor.dev/issue/505): Implement.
 
-	case syscall.SIOCGIFDSTADDR:
+	case linux.SIOCGIFDSTADDR:
 		// Gets the destination address of a point-to-point device.
 		// TODO(gvisor.dev/issue/505): Implement.
 
-	case syscall.SIOCGIFBRDADDR:
+	case linux.SIOCGIFBRDADDR:
 		// Gets the broadcast address of a device.
 		// TODO(gvisor.dev/issue/505): Implement.
 
-	case syscall.SIOCGIFNETMASK:
+	case linux.SIOCGIFNETMASK:
 		// Gets the network mask of a device.
 		for _, addr := range stack.InterfaceAddrs()[index] {
 			// This ioctl is only compatible with AF_INET addresses.
@@ -2931,6 +3101,14 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 			break
 		}
 
+	case linux.SIOCETHTOOL:
+		// Stubbed out for now. Ideally, we should implement the required
+		// sub-commands for ETHTOOL.
+		//
+		// See:
+		// https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
+		return syserr.ErrEndpointOperation
+
 	default:
 		// Not a valid call.
 		return syserr.ErrInvalidArgument
@@ -2940,7 +3118,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 }
 
 // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
-func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
+func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error {
 	// If Ptr is NULL, return the necessary buffer size via Len.
 	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
 	// structs.
@@ -2977,9 +3155,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
 			// Copy the ifr to userspace.
 			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
 			ifc.Len += int32(linux.SizeOfIFReq)
-			if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{
-				AddressSpaceActive: true,
-			}); err != nil {
+			if _, err := ifr.CopyOut(t, usermem.Addr(dst)); err != nil {
 				return err
 			}
 		}
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index fcd8013c0..a9025b0ec 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -19,6 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -30,6 +31,8 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // SocketVFS2 encapsulates all the state needed to represent a network stack
@@ -38,6 +41,7 @@ type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	socketOpsCommon
 }
@@ -64,6 +68,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
 			protocol: protocol,
 		},
 	}
+	s.LockFD.Init(&vfs.FileLocks{})
 	vfsfd := &s.vfsfd
 	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
@@ -197,7 +202,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
 
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
 	// implemented specifically for netstack.SocketVFS2 rather than
 	// commonEndpoint. commonEndpoint should be extended to support socket
@@ -207,25 +212,25 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
-		val := int32(0)
+		val := primitive.Int32(0)
 		s.readMu.Lock()
 		defer s.readMu.Unlock()
 		if s.sockOptTimestamp {
 			val = 1
 		}
-		return val, nil
+		return &val, nil
 	}
 	if level == linux.SOL_TCP && name == linux.TCP_INQ {
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
-		val := int32(0)
+		val := primitive.Int32(0)
 		s.readMu.Lock()
 		defer s.readMu.Unlock()
 		if s.sockOptInq {
 			val = 1
 		}
-		return val, nil
+		return &val, nil
 	}
 
 	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
@@ -243,7 +248,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 			if err != nil {
 				return nil, err
 			}
-			return info, nil
+			return &info, nil
 
 		case linux.IPT_SO_GET_ENTRIES:
 			if outLen < linux.SizeOfIPTGetEntries {
@@ -258,7 +263,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 			if err != nil {
 				return nil, err
 			}
-			return entries, nil
+			return &entries, nil
 
 		}
 	}
@@ -315,3 +320,13 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
 
 	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f5fa18136..67737ae87 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -15,10 +15,11 @@
 package netstack
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -41,19 +42,29 @@ func (s *Stack) SupportsIPv6() bool {
 	return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber)
 }
 
+// toLinuxARPHardwareType converts Netstack's ARPHardwareType to the equivalent Linux constant.
+func toLinuxARPHardwareType(t header.ARPHardwareType) uint16 {
+	switch t {
+	case header.ARPHardwareNone:
+		return linux.ARPHRD_NONE
+	case header.ARPHardwareLoopback:
+		return linux.ARPHRD_LOOPBACK
+	case header.ARPHardwareEther:
+		return linux.ARPHRD_ETHER
+	default:
+		panic(fmt.Sprintf("unknown ARPHRD type: %d", t))
+	}
+}
+
 // Interfaces implements inet.Stack.Interfaces.
 func (s *Stack) Interfaces() map[int32]inet.Interface {
 	is := make(map[int32]inet.Interface)
 	for id, ni := range s.Stack.NICInfo() {
-		var devType uint16
-		if ni.Flags.Loopback {
-			devType = linux.ARPHRD_LOOPBACK
-		}
 		is[int32(id)] = inet.Interface{
 			Name:       ni.Name,
 			Addr:       []byte(ni.LinkAddress),
 			Flags:      uint32(nicStateFlagsToLinux(ni.Flags)),
-			DeviceType: devType,
+			DeviceType: toLinuxARPHardwareType(ni.ARPHardwareType),
 			MTU:        ni.MTU,
 		}
 	}
@@ -314,7 +325,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			udp.PacketsSent.Value(),         // OutDatagrams.
 			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
 			0,                               // Udp/SndbufErrors.
-			0,                               // Udp/InCsumErrors.
+			udp.ChecksumErrors.Value(),      // Udp/InCsumErrors.
 			0,                               // Udp/IgnoredMulti.
 		}
 	default:
@@ -362,16 +373,10 @@ func (s *Stack) RouteTable() []inet.Route {
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (stack.IPTables, error) {
+func (s *Stack) IPTables() (*stack.IPTables, error) {
 	return s.Stack.IPTables(), nil
 }
 
-// FillDefaultIPTables sets the stack's iptables to the default tables, which
-// allow and do not modify all traffic.
-func (s *Stack) FillDefaultIPTables() {
-	netfilter.FillDefaultIPTables(s.Stack)
-}
-
 // Resume implements inet.Stack.Resume.
 func (s *Stack) Resume() {
 	s.Stack.Resume()
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 6580bd6e9..d112757fb 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // ControlMessages represents the union of unix control messages and tcpip
@@ -86,7 +87,7 @@ type SocketOps interface {
 	Shutdown(t *kernel.Task, how int) *syserr.Error
 
 	// GetSockOpt implements the getsockopt(2) linux syscall.
-	GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error)
+	GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error)
 
 	// SetSockOpt implements the setsockopt(2) linux syscall.
 	SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error
@@ -407,7 +408,6 @@ func emitUnimplementedEvent(t *kernel.Task, name int) {
 		linux.SO_MARK,
 		linux.SO_MAX_PACING_RATE,
 		linux.SO_NOFCS,
-		linux.SO_NO_CHECK,
 		linux.SO_OOBINLINE,
 		linux.SO_PASSCRED,
 		linux.SO_PASSSEC,
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index de2cc4bdf..061a689a9 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -21,6 +21,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
@@ -34,5 +35,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index ce5b94ee7..a1e49cc57 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -252,7 +252,7 @@ func (e *connectionedEndpoint) Close() {
 // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
 func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error {
 	if ce.Type() != e.stype {
-		return syserr.ErrConnectionRefused
+		return syserr.ErrWrongProtocolForSocket
 	}
 
 	// Check if ce is e to avoid a deadlock.
@@ -476,6 +476,9 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask
 
 // State implements socket.Socket.State.
 func (e *connectionedEndpoint) State() uint32 {
+	e.Lock()
+	defer e.Unlock()
+
 	if e.Connected() {
 		return linux.SS_CONNECTED
 	}
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 5b29e9d7f..0482d33cf 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -40,6 +40,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // SocketOperations is a Unix socket. It is similar to a netstack socket,
@@ -184,7 +185,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
 }
 
@@ -417,7 +418,18 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
 	defer ep.Release()
 
 	// Connect the server endpoint.
-	return s.ep.Connect(t, ep)
+	err = s.ep.Connect(t, ep)
+
+	if err == syserr.ErrWrongProtocolForSocket {
+		// For abstract sockets, Linux returns ErrConnectionRefused
+		// instead of ErrWrongProtocolForSocket.
+		path, _ := extractPath(sockaddr)
+		if len(path) > 0 && path[0] == 0 {
+			err = syserr.ErrConnectionRefused
+		}
+	}
+
+	return err
 }
 
 // Write implements fs.FileOperations.Write.
@@ -448,15 +460,25 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b
 		To:       nil,
 	}
 	if len(to) > 0 {
-		ep, err := extractEndpoint(t, to)
-		if err != nil {
-			return 0, err
-		}
-		defer ep.Release()
-		w.To = ep
+		switch s.stype {
+		case linux.SOCK_SEQPACKET:
+			to = nil
+		case linux.SOCK_STREAM:
+			if s.State() == linux.SS_CONNECTED {
+				return 0, syserr.ErrAlreadyConnected
+			}
+			return 0, syserr.ErrNotSupported
+		default:
+			ep, err := extractEndpoint(t, to)
+			if err != nil {
+				return 0, err
+			}
+			defer ep.Release()
+			w.To = ep
 
-		if ep.Passcred() && w.Control.Credentials == nil {
-			w.Control.Credentials = control.MakeCreds(t)
+			if ep.Passcred() && w.Control.Credentials == nil {
+				w.Control.Credentials = control.MakeCreds(t)
+			}
 		}
 	}
 
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 45e109361..05c16fcfe 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -19,6 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -31,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // SocketVFS2 implements socket.SocketVFS2 (and by extension,
@@ -39,6 +41,7 @@ type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	socketOpsCommon
 }
@@ -51,7 +54,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
 	mnt := t.Kernel().SocketMount()
 	d := sockfs.NewDentry(t.Credentials(), mnt)
 
-	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d)
+	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{})
 	if err != nil {
 		return nil, syserr.FromError(err)
 	}
@@ -60,7 +63,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
 
 // NewFileDescription creates and returns a socket file description
 // corresponding to the given mount and dentry.
-func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) {
+func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
 	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
 	// SOCK_DGRAM and don't require CAP_NET_RAW.
 	if stype == linux.SOCK_RAW {
@@ -73,6 +76,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
 			stype: stype,
 		},
 	}
+	sock.LockFD.Init(locks)
 	vfsfd := &sock.vfsfd
 	if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
@@ -86,7 +90,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
 
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
 }
 
@@ -297,6 +301,16 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
 	return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
 }
 
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
+
 // providerVFS2 is a unix domain socket provider for VFS2.
 type providerVFS2 struct{}
 
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
index a6e48b836..5d51a7792 100644
--- a/pkg/sentry/strace/epoll.go
+++ b/pkg/sentry/strace/epoll.go
@@ -50,10 +50,10 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui
 			sb.WriteString("...")
 			break
 		}
-		if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok {
-			fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr)
-			continue
-		}
+		// Allowing addr to overflow is consistent with Linux, and harmless; if
+		// this isn't the last iteration of the loop, the next call to CopyIn
+		// will just fail with EFAULT.
+		addr, _ = addr.AddLength(uint64(linux.SizeOfEpollEvent))
 	}
 	sb.WriteString("}")
 	return sb.String()
@@ -75,7 +75,7 @@ var epollEventEvents = abi.FlagSet{
 	{Flag: linux.EPOLLPRI, Name: "EPOLLPRI"},
 	{Flag: linux.EPOLLOUT, Name: "EPOLLOUT"},
 	{Flag: linux.EPOLLERR, Name: "EPOLLERR"},
-	{Flag: linux.EPOLLHUP, Name: "EPULLHUP"},
+	{Flag: linux.EPOLLHUP, Name: "EPOLLHUP"},
 	{Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"},
 	{Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"},
 	{Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"},
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index c0512de89..b51c4c941 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -521,6 +521,7 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.IP_ROUTER_ALERT:           "IP_ROUTER_ALERT",
 		linux.IP_PKTOPTIONS:             "IP_PKTOPTIONS",
 		linux.IP_MTU:                    "IP_MTU",
+		linux.SO_ORIGINAL_DST:           "SO_ORIGINAL_DST",
 	},
 	linux.SOL_SOCKET: {
 		linux.SO_ERROR:        "SO_ERROR",
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 217fcfef2..4a9b04fd0 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -99,5 +99,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/primitive",
     ],
 )
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index ea4f9b1a7..80c65164a 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{
 		270: syscalls.Supported("pselect", Pselect),
 		271: syscalls.Supported("ppoll", Ppoll),
 		272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
-		273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		273: syscalls.Supported("set_robust_list", SetRobustList),
+		274: syscalls.Supported("get_robust_list", GetRobustList),
 		275: syscalls.Supported("splice", Splice),
 		276: syscalls.Supported("tee", Tee),
 		277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index d781d6a04..ba2557c52 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -15,8 +15,8 @@
 package linux
 
 import (
-	"encoding/binary"
-
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -27,59 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// I/O commands.
-const (
-	_IOCB_CMD_PREAD   = 0
-	_IOCB_CMD_PWRITE  = 1
-	_IOCB_CMD_FSYNC   = 2
-	_IOCB_CMD_FDSYNC  = 3
-	_IOCB_CMD_NOOP    = 6
-	_IOCB_CMD_PREADV  = 7
-	_IOCB_CMD_PWRITEV = 8
-)
-
-// I/O flags.
-const (
-	_IOCB_FLAG_RESFD = 1
-)
-
-// ioCallback describes an I/O request.
-//
-// The priority field is currently ignored in the implementation below. Also
-// note that the IOCB_FLAG_RESFD feature is not supported.
-type ioCallback struct {
-	Data      uint64
-	Key       uint32
-	Reserved1 uint32
-
-	OpCode  uint16
-	ReqPrio int16
-	FD      int32
-
-	Buf    uint64
-	Bytes  uint64
-	Offset int64
-
-	Reserved2 uint64
-	Flags     uint32
-
-	// eventfd to signal if IOCB_FLAG_RESFD is set in flags.
-	ResFD int32
-}
-
-// ioEvent describes an I/O result.
-//
-// +stateify savable
-type ioEvent struct {
-	Data    uint64
-	Obj     uint64
-	Result  int64
-	Result2 int64
-}
-
-// ioEventSize is the size of an ioEvent encoded.
-var ioEventSize = binary.Size(ioEvent{})
-
 // IoSetup implements linux syscall io_setup(2).
 func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	nrEvents := args[0].Int()
@@ -192,7 +139,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 			}
 		}
 
-		ev := v.(*ioEvent)
+		ev := v.(*linux.IOEvent)
 
 		// Copy out the result.
 		if _, err := t.CopyOut(eventsAddr, ev); err != nil {
@@ -204,7 +151,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		}
 
 		// Keep rolling.
-		eventsAddr += usermem.Addr(ioEventSize)
+		eventsAddr += usermem.Addr(linux.IOEventSize)
 	}
 
 	// Everything finished.
@@ -231,7 +178,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl
 }
 
 // memoryFor returns appropriate memory for the given callback.
-func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
 	bytes := int(cb.Bytes)
 	if bytes < 0 {
 		// Linux also requires that this field fit in ssize_t.
@@ -242,17 +189,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
 	// we have no guarantee that t's AddressSpace will be active during the
 	// I/O.
 	switch cb.OpCode {
-	case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE:
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
 		return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
 			AddressSpaceActive: false,
 		})
 
-	case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV:
+	case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
 		return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
 			AddressSpaceActive: false,
 		})
 
-	case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP:
+	case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
 		return usermem.IOSequence{}, nil
 
 	default:
@@ -261,54 +208,62 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
 	}
 }
 
-func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
-	if ctx.Dead() {
-		ctx.CancelPendingRequest()
-		return
-	}
-	ev := &ioEvent{
-		Data: cb.Data,
-		Obj:  uint64(cbAddr),
-	}
+// IoCancel implements linux syscall io_cancel(2).
+//
+// It is not presently supported (ENOSYS indicates no support on this
+// architecture).
+func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ENOSYS
+}
 
-	// Construct a context.Context that will not be interrupted if t is
-	// interrupted.
-	c := t.AsyncContext()
+// LINT.IfChange
 
-	var err error
-	switch cb.OpCode {
-	case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV:
-		ev.Result, err = file.Preadv(c, ioseq, cb.Offset)
-	case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
-		ev.Result, err = file.Pwritev(c, ioseq, cb.Offset)
-	case _IOCB_CMD_FSYNC:
-		err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll)
-	case _IOCB_CMD_FDSYNC:
-		err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData)
-	}
+func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback {
+	return func(ctx context.Context) {
+		if actx.Dead() {
+			actx.CancelPendingRequest()
+			return
+		}
+		ev := &linux.IOEvent{
+			Data: cb.Data,
+			Obj:  uint64(cbAddr),
+		}
 
-	// Update the result.
-	if err != nil {
-		err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
-		ev.Result = -int64(kernel.ExtractErrno(err, 0))
-	}
+		var err error
+		switch cb.OpCode {
+		case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+			ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset)
+		case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+			ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset)
+		case linux.IOCB_CMD_FSYNC:
+			err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+		case linux.IOCB_CMD_FDSYNC:
+			err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData)
+		}
+
+		// Update the result.
+		if err != nil {
+			err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
+			ev.Result = -int64(kernel.ExtractErrno(err, 0))
+		}
 
-	file.DecRef()
+		file.DecRef()
 
-	// Queue the result for delivery.
-	ctx.FinishRequest(ev)
+		// Queue the result for delivery.
+		actx.FinishRequest(ev)
 
-	// Notify the event file if one was specified. This needs to happen
-	// *after* queueing the result to avoid racing with the thread we may
-	// wake up.
-	if eventFile != nil {
-		eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
-		eventFile.DecRef()
+		// Notify the event file if one was specified. This needs to happen
+		// *after* queueing the result to avoid racing with the thread we may
+		// wake up.
+		if eventFile != nil {
+			eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
+			eventFile.DecRef()
+		}
 	}
 }
 
 // submitCallback processes a single callback.
-func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error {
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
 	file := t.GetFile(cb.FD)
 	if file == nil {
 		// File not found.
@@ -318,7 +273,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
 
 	// Was there an eventFD? Extract it.
 	var eventFile *fs.File
-	if cb.Flags&_IOCB_FLAG_RESFD != 0 {
+	if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
 		eventFile = t.GetFile(cb.ResFD)
 		if eventFile == nil {
 			// Bad FD.
@@ -340,7 +295,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
 
 	// Check offset for reads/writes.
 	switch cb.OpCode {
-	case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
 		if cb.Offset < 0 {
 			return syserror.EINVAL
 		}
@@ -366,7 +321,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
 
 	// Perform the request asynchronously.
 	file.IncRef()
-	fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) })
+	t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile))
 
 	// All set.
 	return nil
@@ -395,7 +350,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		}
 
 		// Copy in this callback.
-		var cb ioCallback
+		var cb linux.IOCallback
 		cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
 		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
 
@@ -424,10 +379,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return uintptr(nrEvents), nil, nil
 }
 
-// IoCancel implements linux syscall io_cancel(2).
-//
-// It is not presently supported (ENOSYS indicates no support on this
-// architecture).
-func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	return 0, nil, syserror.ENOSYS
-}
+// LINT.ThenChange(vfs2/aio.go)
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 35a98212a..8cf6401e7 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -900,14 +900,20 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 {
 //
 // If who is positive, it represents a PID. If negative, it represents a PGID.
 // If the PID or PGID is invalid, the owner is silently unset.
-func fSetOwn(t *kernel.Task, file *fs.File, who int32) {
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) error {
 	a := file.Async(fasync.New).(*fasync.FileAsync)
 	if who < 0 {
+		// Check for overflow before flipping the sign.
+		if who-1 > who {
+			return syserror.EINVAL
+		}
 		pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
 		a.SetOwnerProcessGroup(t, pg)
+	} else {
+		tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
+		a.SetOwnerThreadGroup(t, tg)
 	}
-	tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
-	a.SetOwnerThreadGroup(t, tg)
+	return nil
 }
 
 // Fcntl implements linux syscall fcntl(2).
@@ -935,10 +941,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(flags.ToLinuxFDFlags()), nil, nil
 	case linux.F_SETFD:
 		flags := args[2].Uint()
-		t.FDTable().SetFlags(fd, kernel.FDFlags{
+		err := t.FDTable().SetFlags(fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
-		return 0, nil, nil
+		return 0, nil, err
 	case linux.F_GETFL:
 		return uintptr(file.Flags().ToLinux()), nil, nil
 	case linux.F_SETFL:
@@ -998,9 +1004,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			return 0, nil, err
 		}
 
-		// The lock uid is that of the Task's FDTable.
-		lockUniqueID := lock.UniqueID(t.FDTable().ID())
-
 		// These locks don't block; execute the non-blocking operation using the inode's lock
 		// context directly.
 		switch flock.Type {
@@ -1010,12 +1013,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 			if cmd == linux.F_SETLK {
 				// Non-blocking lock, provide a nil lock.Blocker.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
 					return 0, nil, syserror.EAGAIN
 				}
 			} else {
 				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
 					return 0, nil, syserror.EINTR
 				}
 			}
@@ -1026,18 +1029,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 			if cmd == linux.F_SETLK {
 				// Non-blocking lock, provide a nil lock.Blocker.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
 					return 0, nil, syserror.EAGAIN
 				}
 			} else {
 				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
 					return 0, nil, syserror.EINTR
 				}
 			}
 			return 0, nil, nil
 		case linux.F_UNLCK:
-			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
+			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
 			return 0, nil, nil
 		default:
 			return 0, nil, syserror.EINVAL
@@ -1045,8 +1048,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_GETOWN:
 		return uintptr(fGetOwn(t, file)), nil, nil
 	case linux.F_SETOWN:
-		fSetOwn(t, file, args[2].Int())
-		return 0, nil, nil
+		return 0, nil, fSetOwn(t, file, args[2].Int())
 	case linux.F_GETOWN_EX:
 		addr := args[2].Pointer()
 		owner := fGetOwnEx(t, file)
@@ -1055,7 +1057,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_SETOWN_EX:
 		addr := args[2].Pointer()
 		var owner linux.FOwnerEx
-		n, err := t.CopyIn(addr, &owner)
+		_, err := t.CopyIn(addr, &owner)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -1067,21 +1069,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 				return 0, nil, syserror.ESRCH
 			}
 			a.SetOwnerTask(t, task)
-			return uintptr(n), nil, nil
+			return 0, nil, nil
 		case linux.F_OWNER_PID:
 			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
 			if tg == nil {
 				return 0, nil, syserror.ESRCH
 			}
 			a.SetOwnerThreadGroup(t, tg)
-			return uintptr(n), nil, nil
+			return 0, nil, nil
 		case linux.F_OWNER_PGRP:
 			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
 			if pg == nil {
 				return 0, nil, syserror.ESRCH
 			}
 			a.SetOwnerProcessGroup(t, pg)
-			return uintptr(n), nil, nil
+			return 0, nil, nil
 		default:
 			return 0, nil, syserror.EINVAL
 		}
@@ -1114,17 +1116,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
-// LINT.ThenChange(vfs2/fd.go)
-
-const (
-	_FADV_NORMAL     = 0
-	_FADV_RANDOM     = 1
-	_FADV_SEQUENTIAL = 2
-	_FADV_WILLNEED   = 3
-	_FADV_DONTNEED   = 4
-	_FADV_NOREUSE    = 5
-)
-
 // Fadvise64 implements linux syscall fadvise64(2).
 // This implementation currently ignores the provided advice.
 func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
@@ -1149,12 +1140,12 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 
 	switch advice {
-	case _FADV_NORMAL:
-	case _FADV_RANDOM:
-	case _FADV_SEQUENTIAL:
-	case _FADV_WILLNEED:
-	case _FADV_DONTNEED:
-	case _FADV_NOREUSE:
+	case linux.POSIX_FADV_NORMAL:
+	case linux.POSIX_FADV_RANDOM:
+	case linux.POSIX_FADV_SEQUENTIAL:
+	case linux.POSIX_FADV_WILLNEED:
+	case linux.POSIX_FADV_DONTNEED:
+	case linux.POSIX_FADV_NOREUSE:
 	default:
 		return 0, nil, syserror.EINVAL
 	}
@@ -1163,8 +1154,6 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
-// LINT.IfChange
-
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -2157,22 +2146,6 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	nonblocking := operation&linux.LOCK_NB != 0
 	operation &^= linux.LOCK_NB
 
-	// flock(2):
-	// Locks created by flock() are associated with an open file table entry. This means that
-	// duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the
-	// same lock, and this lock may be modified or released using any of these descriptors. Furthermore,
-	// the lock is released either by an explicit LOCK_UN operation on any of these duplicate
-	// descriptors, or when all such descriptors have been closed.
-	//
-	// If a process uses open(2) (or similar) to obtain more than one descriptor for the same file,
-	// these descriptors are treated independently by flock(). An attempt to lock the file using
-	// one of these file descriptors may be denied by a lock that the calling process has already placed via
-	// another descriptor.
-	//
-	// We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2)
-	// and fork(2).
-	lockUniqueID := lock.UniqueID(file.UniqueID)
-
 	// A BSD style lock spans the entire file.
 	rng := lock.LockRange{
 		Start: 0,
@@ -2183,29 +2156,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.LOCK_EX:
 		if nonblocking {
 			// Since we're nonblocking we pass a nil lock.Blocker implementation.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
 				return 0, nil, syserror.EWOULDBLOCK
 			}
 		} else {
 			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
 				return 0, nil, syserror.EINTR
 			}
 		}
 	case linux.LOCK_SH:
 		if nonblocking {
 			// Since we're nonblocking we pass a nil lock.Blocker implementation.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
 				return 0, nil, syserror.EWOULDBLOCK
 			}
 		} else {
 			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
 				return 0, nil, syserror.EINTR
 			}
 		}
 	case linux.LOCK_UN:
-		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng)
+		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
 	default:
 		// flock(2): EINVAL operation is invalid.
 		return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b68261f72..f04d78856 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		switch cmd {
 		case linux.FUTEX_WAIT:
 			// WAIT uses a relative timeout.
-			mask = ^uint32(0)
+			mask = linux.FUTEX_BITSET_MATCH_ANY
 			var timeoutDur time.Duration
 			if !forever {
 				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
@@ -286,3 +286,49 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, syserror.ENOSYS
 	}
 }
+
+// SetRobustList implements linux syscall set_robust_list(2).
+func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// head is a userspace address; it is handed to the task as-is, and the
+	// only validation done here is the length check below.
+	head := args[0].Pointer()
+	length := args[1].SizeT()
+
+	if length != uint(linux.SizeOfRobustListHead) {
+		return 0, nil, syserror.EINVAL
+	}
+	t.SetRobustList(head)
+	return 0, nil, nil
+}
+
+// GetRobustList implements linux syscall get_robust_list(2).
+func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// The first argument is called 'pid' by the syscall, but it identifies a
+	// thread, not a process.
+	tid := args[0].Int()
+	headAddr := args[1].Pointer()
+	sizeAddr := args[2].Pointer()
+
+	if tid < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// A tid of zero selects the calling task.
+	target := t
+	if tid != 0 {
+		if target = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); target == nil {
+			return 0, nil, syserror.ESRCH
+		}
+	}
+
+	// Report the target's registered list head to the caller.
+	if _, err := t.CopyOut(headAddr, uint64(target.GetRobustList())); err != nil {
+		return 0, nil, err
+	}
+
+	// The reported size is always the fixed size of the list head.
+	if _, err := t.CopyOut(sizeAddr, uint64(linux.SizeOfRobustListHead)); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 0760af77b..414fce8e3 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -29,6 +29,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -474,7 +476,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	if v != nil {
-		if _, err := t.CopyOut(optValAddr, v); err != nil {
+		if _, err := v.CopyOut(t, optValAddr); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -484,7 +486,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 // getSockOpt tries to handle common socket options, or dispatches to a specific
 // socket implementation.
-func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) {
 	if level == linux.SOL_SOCKET {
 		switch name {
 		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -496,13 +498,16 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr use
 		switch name {
 		case linux.SO_TYPE:
 			_, skType, _ := s.Type()
-			return int32(skType), nil
+			v := primitive.Int32(skType)
+			return &v, nil
 		case linux.SO_DOMAIN:
 			family, _, _ := s.Type()
-			return int32(family), nil
+			v := primitive.Int32(family)
+			return &v, nil
 		case linux.SO_PROTOCOL:
 			_, _, protocol := s.Type()
-			return int32(protocol), nil
+			v := primitive.Int32(protocol)
+			return &v, nil
 		}
 	}
 
@@ -539,7 +544,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, syserror.EINVAL
 	}
 	buf := t.CopyScratchBuffer(int(optLen))
-	if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+	if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
 		return 0, nil, err
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 39f2b79ec..77c78889d 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -80,6 +80,12 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		}
 	}
 
+	if total > 0 {
+		// On Linux, inotify behavior is not very consistent with splice(2). We try
+		// our best to emulate Linux for very basic calls to splice, where for some
+		// reason, events are generated for output files, but not input files.
+		outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
 	return total, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 2de5e3422..c24946160 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -207,7 +207,11 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.SetXattr(t, d, name, value, flags)
+	if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
@@ -418,7 +422,11 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.RemoveXattr(t, d, name)
+	if err := d.Inode.RemoveXattr(t, d, name); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
 }
 
 // LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index f882ef840..64696b438 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_library(
     name = "vfs2",
     srcs = [
+        "aio.go",
         "epoll.go",
         "eventfd.go",
         "execve.go",
@@ -12,9 +13,12 @@ go_library(
         "filesystem.go",
         "fscontext.go",
         "getdents.go",
+        "inotify.go",
         "ioctl.go",
+        "lock.go",
         "memfd.go",
         "mmap.go",
+        "mount.go",
         "path.go",
         "pipe.go",
         "poll.go",
@@ -22,6 +26,7 @@ go_library(
         "setstat.go",
         "signal.go",
         "socket.go",
+        "splice.go",
         "stat.go",
         "stat_amd64.go",
         "stat_arm64.go",
@@ -36,9 +41,11 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/bits",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/eventfd",
         "//pkg/sentry/fsimpl/pipefs",
@@ -47,11 +54,13 @@ go_library(
         "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/fasync",
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/mm",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/unix/transport",
@@ -63,5 +72,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/primitive",
     ],
 )
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
new file mode 100644
index 000000000..e5cdefc50
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -0,0 +1,216 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSubmit implements linux syscall io_submit(2).
+//
+// args[2] is the address of a user array of nrEvents pointers, each to an
+// IOCallback. Callbacks are submitted in order; at the first failure the
+// number already submitted is returned, or the error if there were none.
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	nrEvents := args[1].Int()
+	addr := args[2].Pointer()
+
+	if nrEvents < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// stop produces the return values for a failure that occurs after done
+	// callbacks were submitted successfully.
+	stop := func(done int32, err error) (uintptr, *kernel.SyscallControl, error) {
+		if done > 0 {
+			// Partial success.
+			return uintptr(done), nil, nil
+		}
+		// Nothing done.
+		return 0, nil, err
+	}
+
+	for done := int32(0); done < nrEvents; done++ {
+		// Copy in the next pointer-sized array entry.
+		entry := t.Arch().Native(0)
+		if _, err := t.CopyIn(addr, entry); err != nil {
+			return stop(done, err)
+		}
+
+		// Copy in the callback that the entry points to.
+		var cb linux.IOCallback
+		cbAddr := usermem.Addr(t.Arch().Value(entry))
+		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+			return stop(done, err)
+		}
+
+		// Process this callback.
+		if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+			return stop(done, err)
+		}
+
+		// Advance to the next array entry.
+		addr += usermem.Addr(t.Arch().Width())
+	}
+
+	return uintptr(nrEvents), nil, nil
+}
+
+// submitCallback validates a single callback and queues it for asynchronous
+// execution on the AIO context identified by id.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+	if cb.Reserved2 != 0 {
+		return syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(cb.FD)
+	if file == nil {
+		return syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Resolve the eventfd to notify, if the callback names one.
+	var notify *vfs.FileDescription
+	if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+		notify = t.GetFileVFS2(cb.ResFD)
+		if notify == nil {
+			return syserror.EBADF
+		}
+		defer notify.DecRef()
+
+		// Only an eventfd is acceptable here.
+		if _, isEventFD := notify.Impl().(*eventfd.EventFileDescription); !isEventFD {
+			return syserror.EINVAL
+		}
+	}
+
+	ioseq, err := memoryFor(t, cb)
+	if err != nil {
+		return err
+	}
+
+	// Positional reads and writes require a non-negative offset.
+	switch cb.OpCode {
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+		if cb.Offset < 0 {
+			return syserror.EINVAL
+		}
+	}
+
+	// Look up the context and prepare it for one more request.
+	aioCtx, found := t.MemoryManager().LookupAIOContext(t, id)
+	if !found {
+		return syserror.EINVAL
+	}
+	if !aioCtx.Prepare() {
+		// Context is busy.
+		return syserror.EAGAIN
+	}
+
+	// The callback runs asynchronously, after the deferred DecRefs above
+	// may already have happened, so it is given references of its own. It is
+	// responsible for releasing them.
+	file.IncRef()
+	if notify != nil {
+		notify.IncRef()
+	}
+
+	// Perform the request asynchronously.
+	t.QueueAIO(getAIOCallback(t, file, notify, cbAddr, cb, ioseq, aioCtx))
+	return nil
+}
+
+// getAIOCallback returns the function that performs cb's operation on fd and
+// delivers the result to aioCtx, consuming one reference on fd and eventFD.
+func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
+	return func(ctx context.Context) {
+		// Drop the references taken at submission on every path out of here.
+		defer fd.DecRef()
+		if eventFD != nil {
+			defer eventFD.DecRef()
+		}
+		if aioCtx.Dead() {
+			aioCtx.CancelPendingRequest()
+			return
+		}
+
+		ev := &linux.IOEvent{
+			Data: cb.Data,
+			Obj:  uint64(cbAddr),
+		}
+		var err error
+		switch cb.OpCode {
+		case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+			ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
+		case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+			ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
+		case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
+			err = fd.Sync(ctx)
+		}
+		if err != nil {
+			// Report failures to userspace as a negated errno.
+			err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
+			ev.Result = -int64(kernel.ExtractErrno(err, 0))
+		}
+
+		// Queue the result before signalling the eventfd so that the thread
+		// we may wake up cannot race ahead of it.
+		aioCtx.FinishRequest(ev)
+		if eventFD != nil {
+			eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
+		}
+	}
+}
+
+// memoryFor builds the IOSequence describing the user memory that cb's
+// operation reads from or writes to.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+	length := int(cb.Bytes)
+	if length < 0 {
+		// Linux also requires that this field fit in ssize_t.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+
+	// The I/O runs asynchronously with respect to t's task goroutine, so
+	// t's AddressSpace cannot be assumed active while it executes.
+	opts := usermem.IOOpts{
+		AddressSpaceActive: false,
+	}
+
+	switch cb.OpCode {
+	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+		return t.SingleIOSequence(usermem.Addr(cb.Buf), length, opts)
+
+	case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+		return t.IovecsIOSequence(usermem.Addr(cb.Buf), length, opts)
+
+	case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+		return usermem.IOSequence{}, nil
+
+	default:
+		// Not a supported command.
+		return usermem.IOSequence{}, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index ca0f7fd1e..67f191551 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -17,10 +17,13 @@ package vfs2
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -134,10 +137,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(flags.ToLinuxFDFlags()), nil, nil
 	case linux.F_SETFD:
 		flags := args[2].Uint()
-		t.FDTable().SetFlags(fd, kernel.FDFlags{
+		err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
-		return 0, nil, nil
+		return 0, nil, err
 	case linux.F_GETFL:
 		return uintptr(file.StatusFlags()), nil, nil
 	case linux.F_SETFL:
@@ -152,6 +155,41 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			return 0, nil, err
 		}
 		return uintptr(n), nil, nil
+	case linux.F_GETOWN:
+		owner, hasOwner := getAsyncOwner(t, file)
+		if !hasOwner {
+			return 0, nil, nil
+		}
+		if owner.Type == linux.F_OWNER_PGRP {
+			return uintptr(-owner.PID), nil, nil
+		}
+		return uintptr(owner.PID), nil, nil
+	case linux.F_SETOWN:
+		who := args[2].Int()
+		ownerType := int32(linux.F_OWNER_PID)
+		if who < 0 {
+			// Check for overflow before flipping the sign.
+			if who-1 > who {
+				return 0, nil, syserror.EINVAL
+			}
+			ownerType = linux.F_OWNER_PGRP
+			who = -who
+		}
+		return 0, nil, setAsyncOwner(t, file, ownerType, who)
+	case linux.F_GETOWN_EX:
+		owner, hasOwner := getAsyncOwner(t, file)
+		if !hasOwner {
+			return 0, nil, nil
+		}
+		_, err := t.CopyOut(args[2].Pointer(), &owner)
+		return 0, nil, err
+	case linux.F_SETOWN_EX:
+		var owner linux.FOwnerEx
+		_, err := t.CopyIn(args[2].Pointer(), &owner)
+		if err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
 	case linux.F_GETPIPE_SZ:
 		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
 		if !ok {
@@ -167,8 +205,151 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		}
 		err := tmpfs.AddSeals(file, args[2].Uint())
 		return 0, nil, err
+	case linux.F_SETLK, linux.F_SETLKW:
+		return 0, nil, posixLock(t, args, file, cmd)
+	default:
+		// TODO(gvisor.dev/issue/2920): Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// getAsyncOwner returns the owner recorded by fd's async handler, with its
+// ID as seen from t's PID namespace. hasOwner is false only when fd has no
+// async handler at all.
+func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
+	handler := fd.AsyncHandler()
+	if handler == nil {
+		return linux.FOwnerEx{}, false
+	}
+
+	// The first non-nil owner, checked in this order, is the one reported.
+	task, tg, pg := handler.(*fasync.FileAsync).Owner()
+	switch {
+	case task != nil:
+		ownerEx.Type = linux.F_OWNER_TID
+		ownerEx.PID = int32(t.PIDNamespace().IDOfTask(task))
+	case tg != nil:
+		ownerEx.Type = linux.F_OWNER_PID
+		ownerEx.PID = int32(t.PIDNamespace().IDOfThreadGroup(tg))
+	case pg != nil:
+		ownerEx.Type = linux.F_OWNER_PGRP
+		ownerEx.PID = int32(t.PIDNamespace().IDOfProcessGroup(pg))
+	}
+
+	// When the handler has no owner, ownerEx is still its zero value here,
+	// which is returned with hasOwner set.
+	return ownerEx, true
+}
+
+// setAsyncOwner installs an async handler on fd and makes the task, thread
+// group or process group selected by ownerType and pid its owner. A pid of
+// zero clears the owner instead.
+func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error {
+	// Reject unknown owner types before touching fd.
+	validType := ownerType == linux.F_OWNER_TID || ownerType == linux.F_OWNER_PID || ownerType == linux.F_OWNER_PGRP
+	if !validType {
+		return syserror.EINVAL
+	}
+
+	handler := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync)
+	if pid == 0 {
+		handler.ClearOwner()
+		return nil
+	}
+
+	// IDs are resolved in t's PID namespace.
+	pidns := t.PIDNamespace()
+	switch ownerType {
+	case linux.F_OWNER_TID:
+		if task := pidns.TaskWithID(kernel.ThreadID(pid)); task != nil {
+			handler.SetOwnerTask(t, task)
+			return nil
+		}
+	case linux.F_OWNER_PID:
+		if tg := pidns.ThreadGroupWithID(kernel.ThreadID(pid)); tg != nil {
+			handler.SetOwnerThreadGroup(t, tg)
+			return nil
+		}
+	case linux.F_OWNER_PGRP:
+		if pg := pidns.ProcessGroupWithID(kernel.ProcessGroupID(pid)); pg != nil {
+			handler.SetOwnerProcessGroup(t, pg)
+			return nil
+		}
+	}
+
+	// ownerType was validated above, so getting here means the lookup for
+	// pid found nothing.
+	return syserror.ESRCH
+}
+
+// posixLock handles fcntl(F_SETLK) and fcntl(F_SETLKW): it applies the lock
+// request at args[2] to file, passing t as the blocker only for F_SETLKW.
+func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error {
+	var req linux.Flock
+	if _, err := t.CopyIn(args[2].Pointer(), &req); err != nil {
+		return err
+	}
+	start, length := uint64(req.Start), uint64(req.Len)
+
+	var blocker lock.Blocker
+	if cmd == linux.F_SETLKW {
+		blocker = t
+	}
+
+	switch req.Type {
+	case linux.F_RDLCK:
+		// A read lock needs a readable descriptor.
+		if !file.IsReadable() {
+			return syserror.EBADF
+		}
+		return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, start, length, req.Whence, blocker)
+	case linux.F_WRLCK:
+		// A write lock needs a writable descriptor.
+		if !file.IsWritable() {
+			return syserror.EBADF
+		}
+		return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, start, length, req.Whence, blocker)
+	case linux.F_UNLCK:
+		return file.UnlockPOSIX(t, t.FDTable(), start, length, req.Whence)
+	default:
+		return syserror.EINVAL
+	}
+}
+
+// Fadvise64 implements fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	length := args[2].Int64()
+	advice := args[3].Int()
+
+	// Note: offset is allowed to be negative.
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// If the FD refers to a pipe or FIFO, return error.
+	if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	switch advice {
+	case linux.POSIX_FADV_NORMAL:
+	case linux.POSIX_FADV_RANDOM:
+	case linux.POSIX_FADV_SEQUENTIAL:
+	case linux.POSIX_FADV_WILLNEED:
+	case linux.POSIX_FADV_DONTNEED:
+	case linux.POSIX_FADV_NOREUSE:
 	default:
-		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
 	}
+
+	// The advice value has been validated; nothing else is done with it.
+	return 0, nil, nil
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index 46d3e189c..b6d2ddd65 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -106,7 +106,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	addr := args[0].Pointer()
 	mode := args[1].ModeT()
 	dev := args[2].Uint()
-	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
 }
 
 // Mknodat implements Linux syscall mknodat(2).
@@ -115,10 +115,10 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	addr := args[1].Pointer()
 	mode := args[2].ModeT()
 	dev := args[3].Uint()
-	return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+	return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
 }
 
-func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error {
 	path, err := copyInPath(t, addr)
 	if err != nil {
 		return err
@@ -128,9 +128,14 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint
 		return err
 	}
 	defer tpop.Release()
+
+	// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+	if mode.FileType() == 0 {
+		mode |= linux.ModeRegular
+	}
 	major, minor := linux.DecodeDeviceID(dev)
 	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
-		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
+		Mode:     mode &^ linux.FileMode(t.FSContext().Umask()),
 		DevMajor: uint32(major),
 		DevMinor: minor,
 	})
@@ -313,6 +318,9 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath
 	if err != nil {
 		return err
 	}
+	if len(target) == 0 {
+		return syserror.ENOENT
+	}
 	linkpath, err := copyInPath(t, linkpathAddr)
 	if err != nil {
 		return err
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..5d98134a5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscall.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if unknown := flags &^ allFlags; unknown != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	inotifyFile, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+	if err != nil {
+		return 0, nil, err
+	}
+	// Drop this reference on return, whether or not installation succeeds.
+	defer inotifyFile.DecRef()
+
+	fdFlags := kernel.FDFlags{
+		CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+	}
+	fd, err := t.NewFDFromVFS2(0, inotifyFile, fdFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscall as InotifyInit1 with flags of zero.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify looks up fd in t's FD table and returns both the inotify
+// instance behind it and the file itself. On success the caller owns a
+// reference on the file and must drop it.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// Invalid fd.
+		return nil, nil, syserror.EBADF
+	}
+
+	if ino, isInotify := file.Impl().(*vfs.Inotify); isInotify {
+		return ino, file, nil
+	}
+
+	// Not an inotify fd: give back the reference taken by the lookup.
+	file.DecRef()
+	return nil, nil, syserror.EINVAL
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	mask := args[2].Uint()
+
+	// "EINVAL: The given event mask contains no valid events."
+	// -- inotify_add_watch(2)
+	if mask&linux.ALL_INOTIFY_BITS == 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+	//  -- inotify(7)
+	follow := followFinalSymlink
+	if mask&linux.IN_DONT_FOLLOW != 0 {
+		follow = nofollowFinalSymlink
+	}
+
+	ino, f, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer f.DecRef()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	if mask&linux.IN_ONLYDIR != 0 {
+		path.Dir = true
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+	d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer d.DecRef()
+
+	wd, err := ino.AddWatch(d.Dentry(), mask)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(wd), nil, nil
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall, removing the
+// watch given by the second argument from the inotify fd given by the first.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ino, file, err := fdToInotify(t, args[0].Int())
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	// args[1] is the watch descriptor to remove.
+	return 0, nil, ino.RmWatch(args[1].Int())
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 5a2418da9..fd6ab94b2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -15,6 +15,7 @@
 package vfs2
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -30,6 +31,77 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 	defer file.DecRef()
 
+	// Handle ioctls that apply to all FDs.
+	switch args[1].Int() {
+	case linux.FIONCLEX:
+		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+			CloseOnExec: false,
+		})
+		return 0, nil, nil
+
+	case linux.FIOCLEX:
+		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+			CloseOnExec: true,
+		})
+		return 0, nil, nil
+
+	case linux.FIONBIO:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.StatusFlags()
+		if set != 0 {
+			flags |= linux.O_NONBLOCK
+		} else {
+			flags &^= linux.O_NONBLOCK
+		}
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
+
+	case linux.FIOASYNC:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.StatusFlags()
+		if set != 0 {
+			flags |= linux.O_ASYNC
+		} else {
+			flags &^= linux.O_ASYNC
+		}
+		file.SetStatusFlags(t, t.Credentials(), flags)
+		return 0, nil, nil
+
+	case linux.FIOGETOWN, linux.SIOCGPGRP:
+		var who int32
+		owner, hasOwner := getAsyncOwner(t, file)
+		if hasOwner {
+			if owner.Type == linux.F_OWNER_PGRP {
+				who = -owner.PID
+			} else {
+				who = owner.PID
+			}
+		}
+		_, err := t.CopyOut(args[2].Pointer(), &who)
+		return 0, nil, err
+
+	case linux.FIOSETOWN, linux.SIOCSPGRP:
+		var who int32
+		if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil {
+			return 0, nil, err
+		}
+		ownerType := int32(linux.F_OWNER_PID)
+		if who < 0 {
+			// Check for overflow before flipping the sign.
+			if who-1 > who {
+				return 0, nil, syserror.EINVAL
+			}
+			ownerType = linux.F_OWNER_PGRP
+			who = -who
+		}
+		return 0, nil, setAsyncOwner(t, file, ownerType, who)
+	}
+
 	ret, err := file.Ioctl(t, t.MemoryManager(), args)
 	return ret, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go
new file mode 100644
index 000000000..bf19028c4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/lock.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Flock implements Linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	operation := args[1].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// flock(2): EBADF fd is not an open file descriptor.
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	nonblocking := operation&linux.LOCK_NB != 0
+	operation &^= linux.LOCK_NB // strip the modifier so the switch sees only the lock type
+
+	var blocker lock.Blocker // left nil when LOCK_NB was requested
+	if !nonblocking {
+		blocker = t
+	}
+
+	switch operation {
+	case linux.LOCK_EX:
+		if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil {
+			return 0, nil, err
+		}
+	case linux.LOCK_SH:
+		if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil {
+			return 0, nil, err
+		}
+	case linux.LOCK_UN:
+		if err := file.UnlockBSD(t); err != nil {
+			return 0, nil, err
+		}
+	default:
+		// flock(2): EINVAL operation is invalid.
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..ea337de7c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,150 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sourceAddr := args[0].Pointer()
+	targetAddr := args[1].Pointer()
+	typeAddr := args[2].Pointer()
+	flags := args[3].Uint64()
+	dataAddr := args[4].Pointer()
+
+	// For null-terminated strings related to mount(2), Linux copies in at most
+	// a page worth of data. See fs/namespace.c:copy_mount_string().
+	fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	targetPath, err := copyInPath(t, targetAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	data := ""
+	if dataAddr != 0 {
+		// In Linux, a full page is always copied in regardless of null
+		// character placement, and the address is passed to each file system.
+		// Most file systems always treat this data as a string, though, and so
+		// do all of the ones we implement.
+		data, err = t.CopyInString(dataAddr, usermem.PageSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Ignore magic value that was required before Linux 2.4.
+	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+		flags = flags &^ linux.MS_MGC_MSK
+	}
+
+	// Must have CAP_SYS_ADMIN in the current mount namespace's associated user
+	// namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+		linux.MS_UNBINDABLE | linux.MS_MOVE
+
+	// Unsupported atime-related flags. MS_NOSUID is not rejected here; it
+	// is recorded in opts.Flags.NoSUID below.
+	const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+	// Linux just allows passing any flags to mount(2) - it won't fail when
+	// unknown or unsupported flags are passed. Since we don't implement
+	// everything, we fail explicitly on flags that are unimplemented.
+	if flags&(unsupportedOps|unsupportedFlags) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var opts vfs.MountOptions
+	if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+		opts.Flags.NoATime = true
+	}
+	if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+		opts.Flags.NoExec = true
+	}
+	if flags&linux.MS_NODEV == linux.MS_NODEV {
+		opts.Flags.NoDev = true
+	}
+	if flags&linux.MS_NOSUID == linux.MS_NOSUID {
+		opts.Flags.NoSUID = true
+	}
+	if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+		opts.ReadOnly = true
+	}
+	opts.GetFilesystemOptions.Data = data
+
+	target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer target.Release()
+
+	return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	//
+	// Currently, this is always the init task's user namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	opts := vfs.UmountOptions{
+		Flags: uint32(flags), // MNT_FORCE and MNT_EXPIRE were already rejected above
+	}
+
+	return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 3a7ef24f5..cd25597a7 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	n, err := file.Read(t, dst, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	n, err := file.PRead(t, dst, offset, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	n, err := file.Write(t, src, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	n, err := file.PWrite(t, src, offset, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -570,3 +606,36 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	newoff, err := file.Seek(t, offset, whence)
 	return uintptr(newoff), nil, err
 }
+
+// Readahead implements Linux syscall readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.IsReadable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is valid.
+	if int(size) < 0 { // size does not fit in a non-negative int
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Return EINVAL; if the underlying file type does not support readahead,
+	// then Linux will return EINVAL to indicate as much. In the future, we
+	// may extend this function to actually support readahead hints.
+	return 0, nil, syserror.EINVAL
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 09ecfed26..25cdb7a55 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -178,6 +179,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			Mask: linux.STATX_SIZE,
 			Size: uint64(length),
 		},
+		NeedWritePerm: true,
 	})
 	return 0, nil, handleSetSizeError(t, err)
 }
@@ -197,6 +199,10 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 	defer file.DecRef()
 
+	if !file.IsWritable() {
+		return 0, nil, syserror.EINVAL
+	}
+
 	err := file.SetStat(t, vfs.SetStatOptions{
 		Stat: linux.Statx{
 			Mask: linux.STATX_SIZE,
@@ -206,6 +212,56 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, handleSetSizeError(t, err)
 }
 
+// Fallocate implements Linux syscall fallocate(2).
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].Uint64()
+	offset := args[2].Int64()
+	length := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	if !file.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	if mode != 0 { // only mode 0 is accepted
+		return 0, nil, syserror.ENOTSUP
+	}
+
+	if offset < 0 || length <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	size := offset + length // wraps negative on int64 overflow; checked next
+
+	if size < 0 {
+		return 0, nil, syserror.EFBIG
+	}
+
+	limit := limits.FromContext(t).Get(limits.FileSize).Cur
+
+	if uint64(size) >= limit { // file size limit reached: signal SIGXFSZ and fail
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(linux.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil {
+		return 0, nil, err
+	}
+
+	file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return 0, nil, nil
+}
+
 // Utime implements Linux syscall utime(2).
 func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index 10b668477..8096a8f9c 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -30,6 +30,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // minListenBacklog is the minimum reasonable backlog for listening sockets.
@@ -477,7 +479,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	if v != nil {
-		if _, err := t.CopyOut(optValAddr, v); err != nil {
+		if _, err := v.CopyOut(t, optValAddr); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -487,7 +489,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 // getSockOpt tries to handle common socket options, or dispatches to a specific
 // socket implementation.
-func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) {
 	if level == linux.SOL_SOCKET {
 		switch name {
 		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -499,13 +501,16 @@ func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr
 		switch name {
 		case linux.SO_TYPE:
 			_, skType, _ := s.Type()
-			return int32(skType), nil
+			v := primitive.Int32(skType)
+			return &v, nil
 		case linux.SO_DOMAIN:
 			family, _, _ := s.Type()
-			return int32(family), nil
+			v := primitive.Int32(family)
+			return &v, nil
 		case linux.SO_PROTOCOL:
 			_, _, protocol := s.Type()
-			return int32(protocol), nil
+			v := primitive.Int32(protocol)
+			return &v, nil
 		}
 	}
 
@@ -542,7 +547,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, syserror.EINVAL
 	}
 	buf := t.CopyScratchBuffer(int(optLen))
-	if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+	if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
 		return 0, nil, err
 	}
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..63ab11f8c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,486 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	inOffsetPtr := args[1].Pointer()
+	outFD := args[2].Int()
+	outOffsetPtr := args[3].Pointer()
+	count := int64(args[4].SizeT())
+	flags := args[5].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// At least one file description must represent a pipe.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe && !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in offsets.
+	inOffset := int64(-1) // -1 means "no explicit offset; use the file's own position"
+	if inOffsetPtr != 0 {
+		if inIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if inFile.Options().DenyPRead {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+		if inOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+	outOffset := int64(-1) // same -1 convention as inOffset
+	if outOffsetPtr != 0 {
+		if outIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if outFile.Options().DenyPWrite {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+		if outOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	// Move data.
+	var (
+		n   int64
+		err error
+	)
+	dw := dualWaiter{
+		inFile:  inFile,
+		outFile: outFile,
+	}
+	defer dw.destroy()
+	for {
+		// If both input and output are pipes, delegate to the pipe
+		// implementation. Otherwise, exactly one end is a pipe, which
+		// we ensure is consistently ordered after the non-pipe FD's
+		// locks by passing the pipe FD as usermem.IO to the non-pipe
+		// end.
+		switch {
+		case inIsPipe && outIsPipe:
+			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+		case inIsPipe:
+			if outOffset != -1 {
+				n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+				outOffset += n
+			} else {
+				n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+			}
+		case outIsPipe:
+			if inOffset != -1 {
+				n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+				inOffset += n
+			} else {
+				n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+			}
+		default:
+			panic("not possible") // the "neither end is a pipe" case returned EINVAL above
+		}
+
+		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+			break
+		}
+		if err = dw.waitForBoth(t); err != nil {
+			break
+		}
+	}
+
+	// Copy updated offsets out.
+	if inOffsetPtr != 0 {
+		if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+	if outOffsetPtr != 0 {
+		if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if n == 0 {
+		return 0, nil, err
+	}
+
+	// On Linux, inotify behavior is not very consistent with splice(2). We try
+	// our best to emulate Linux for very basic calls to splice, where for some
+	// reason, events are generated for output files, but not input files.
+	outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return uintptr(n), nil, nil
+}
+
+// Tee implements Linux syscall tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	outFD := args[1].Int()
+	count := int64(args[2].SizeT())
+	flags := args[3].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// Both file descriptions must represent pipes.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe || !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy data.
+	var (
+		n   int64
+		err error
+	)
+	dw := dualWaiter{
+		inFile:  inFile,
+		outFile: outFile,
+	}
+	defer dw.destroy()
+	for {
+		n, err = pipe.Tee(t, outPipeFD, inPipeFD, count)
+		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+			break // made progress, hit a non-retryable result, or must not block
+		}
+		if err = dw.waitForBoth(t); err != nil {
+			break
+		}
+	}
+	if n == 0 {
+		return 0, nil, err
+	}
+	outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return uintptr(n), nil, nil
+}
+
+// Sendfile implements Linux syscall sendfile(2).
+func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	outFD := args[0].Int()
+	inFD := args[1].Int()
+	offsetAddr := args[2].Pointer()
+	count := int64(args[3].SizeT())
+
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	if !inFile.IsReadable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+	if !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Verify that the outFile Append flag is not set.
+	if outFile.StatusFlags()&linux.O_APPEND != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Verify that inFile is a regular file or block device. This is a
+	// requirement; the same check appears in Linux
+	// (fs/splice.c:splice_direct_to_actor).
+	if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil {
+		return 0, nil, err
+	} else if stat.Mask&linux.STATX_TYPE == 0 ||
+		(stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy offset if it exists.
+	offset := int64(-1) // -1 means "no explicit offset; use inFile's own position"
+	if offsetAddr != 0 {
+		if inFile.Options().DenyPRead {
+			return 0, nil, syserror.ESPIPE
+		}
+		if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+			return 0, nil, err
+		}
+		if offset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if offset+count < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	// Validate count. This must come after offset checks.
+	if count < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Copy data.
+	var (
+		n   int64
+		err error
+	)
+	dw := dualWaiter{
+		inFile:  inFile,
+		outFile: outFile,
+	}
+	defer dw.destroy()
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	// Reading from input file should never block, since it is regular or
+	// block device. We only need to check if writing to the output file
+	// can block.
+	nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
+	if outIsPipe {
+		for n < count {
+			var spliceN int64
+			if offset != -1 {
+				spliceN, err = inFile.PRead(t, outPipeFD.IOSequence(count-n), offset, vfs.ReadOptions{})
+				offset += spliceN
+			} else {
+				spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count-n), vfs.ReadOptions{})
+			}
+			n += spliceN
+			if err == syserror.ErrWouldBlock && !nonBlock {
+				err = dw.waitForBoth(t)
+			}
+			if err != nil {
+				break
+			}
+		}
+	} else {
+		// Read inFile to buffer, then write the contents to outFile.
+		buf := make([]byte, count)
+		for n < count {
+			var readN int64
+			if offset != -1 {
+				readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf[:count-n]), offset, vfs.ReadOptions{})
+				offset += readN
+			} else {
+				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf[:count-n]), vfs.ReadOptions{})
+			}
+			if readN == 0 && err == io.EOF {
+				// We reached the end of the file. Eat the
+				// error and exit the loop.
+				err = nil
+				break
+			}
+			n += readN
+			if err != nil {
+				break
+			}
+
+			// Write only the bytes read in this iteration (n is the
+			// cumulative total). This may need multiple write calls.
+			wbuf := buf[:readN]
+			for len(wbuf) > 0 {
+				var writeN int64
+				writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
+				wbuf = wbuf[writeN:]
+				if err == syserror.ErrWouldBlock && !nonBlock {
+					err = dw.waitForOut(t)
+				}
+				if err != nil {
+					// We didn't complete the write. Only
+					// report the bytes that were actually
+					// written, and rewind the offset.
+					notWritten := int64(len(wbuf))
+					n -= notWritten
+					if offset != -1 {
+						offset -= notWritten
+					}
+					break
+				}
+			}
+			if err == syserror.ErrWouldBlock && !nonBlock {
+				err = dw.waitForBoth(t)
+			}
+			if err != nil {
+				break
+			}
+		}
+	}
+
+	if offsetAddr != 0 {
+		// Copy out the new offset.
+		if _, err := t.CopyOut(offsetAddr, offset); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if n == 0 {
+		return 0, nil, err
+	}
+
+	inFile.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return uintptr(n), nil, nil
+}
+
+// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
+// thread-safe, and does not take a reference on the vfs.FileDescriptions.
+//
+// Users must call destroy() when finished.
+type dualWaiter struct {
+	inFile  *vfs.FileDescription // polled and waited on for eventMaskRead
+	outFile *vfs.FileDescription // polled and waited on for eventMaskWrite
+
+	inW   waiter.Entry  // registered on inFile by waitForBoth
+	inCh  chan struct{} // nil until inW has been registered
+	outW  waiter.Entry  // registered on outFile by waitForOut
+	outCh chan struct{} // nil until outW has been registered
+}
+
+// waitForBoth waits for both dw.inFile and dw.outFile to be ready.
+func (dw *dualWaiter) waitForBoth(t *kernel.Task) error {
+	if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+		if dw.inCh == nil {
+			dw.inW, dw.inCh = waiter.NewChannelEntry(nil)
+			dw.inFile.EventRegister(&dw.inW, eventMaskRead)
+			// Just registered; we might be ready now. Let the caller retry before blocking.
+			return nil
+		}
+		if err := t.Block(dw.inCh); err != nil {
+			return err
+		}
+	}
+	return dw.waitForOut(t) // input side handled; now wait on the output side
+}
+
+// waitForOut waits for dw.outFile to be ready for writing.
+func (dw *dualWaiter) waitForOut(t *kernel.Task) error {
+	if dw.outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+		if dw.outCh == nil {
+			dw.outW, dw.outCh = waiter.NewChannelEntry(nil)
+			dw.outFile.EventRegister(&dw.outW, eventMaskWrite)
+			// Just registered; we might be ready now. Let the caller retry before blocking.
+			return nil
+		}
+		if err := t.Block(dw.outCh); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// destroy cleans up resources held by dw. No more calls to wait* can occur
+// after destroy is called.
+func (dw *dualWaiter) destroy() {
+	if dw.inCh != nil { // non-nil only if inW was registered on inFile
+		dw.inFile.EventUnregister(&dw.inW)
+		dw.inCh = nil
+	}
+	if dw.outCh != nil { // non-nil only if outW was registered on outFile
+		dw.outFile.EventUnregister(&dw.outW)
+		dw.outCh = nil
+	}
+	dw.inFile = nil
+	dw.outFile = nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
index 365250b0b..0d0ebf46a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/sync.go
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -65,10 +65,8 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 	nbytes := args[2].Int64()
 	flags := args[3].Uint()
 
-	if offset < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-	if nbytes < 0 {
+	// Check for negative values and overflow.
+	if offset < 0 || offset+nbytes < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 	if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
@@ -81,7 +79,37 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 	}
 	defer file.DecRef()
 
-	// TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
-	// [offset, offset+nbytes).
-	return 0, nil, file.Sync(t)
+	// TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support
+	// is a full-file sync, i.e. fsync(2). As a result, there are severe
+	// limitations on how much we support sync_file_range:
+	// - In Linux, sync_file_range(2) doesn't write out the file's metadata, even
+	//   if the file size is changed. We do.
+	// - We always sync the entire file instead of [offset, offset+nbytes).
+	// - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For
+	//   correctness, we would have to perform a write-out every time WAIT_BEFORE
+	//   was used, but this would be much more expensive than expected if there
+	//   were no write-out operations in progress.
+	// - Whenever WAIT_AFTER is used, we sync the file.
+	// - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will
+	//   be synced anyway. If this flag is used without WAIT_AFTER, then it is
+	//   safe (and less expensive) to do nothing, because the syscall will not
+	//   wait for the write-out to complete--we only need to make sure that the
+	//   next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes.
+	// - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O
+	//   errors or ENOSPC conditions and will return those to the caller, after
+	//   clearing the EIO and ENOSPC flags in the address_space." We don't do
+	//   this.
+
+	if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 &&
+		flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+
+	if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
+		if err := file.Sync(t); err != nil {
+			return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+		}
+	}
+	return 0, nil, nil
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index a332d01bd..c576d9475 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -44,7 +44,7 @@ func Override() {
 	s.Table[23] = syscalls.Supported("select", Select)
 	s.Table[32] = syscalls.Supported("dup", Dup)
 	s.Table[33] = syscalls.Supported("dup2", Dup2)
-	delete(s.Table, 40) // sendfile
+	s.Table[40] = syscalls.Supported("sendfile", Sendfile)
 	s.Table[41] = syscalls.Supported("socket", Socket)
 	s.Table[42] = syscalls.Supported("connect", Connect)
 	s.Table[43] = syscalls.Supported("accept", Accept)
@@ -62,7 +62,7 @@ func Override() {
 	s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
 	s.Table[59] = syscalls.Supported("execve", Execve)
 	s.Table[72] = syscalls.Supported("fcntl", Fcntl)
-	delete(s.Table, 73) // flock
+	s.Table[73] = syscalls.Supported("flock", Flock)
 	s.Table[74] = syscalls.Supported("fsync", Fsync)
 	s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
 	s.Table[76] = syscalls.Supported("truncate", Truncate)
@@ -90,9 +90,9 @@ func Override() {
 	s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
 	s.Table[161] = syscalls.Supported("chroot", Chroot)
 	s.Table[162] = syscalls.Supported("sync", Sync)
-	delete(s.Table, 165) // mount
-	delete(s.Table, 166) // umount2
-	delete(s.Table, 187) // readahead
+	s.Table[165] = syscalls.Supported("mount", Mount)
+	s.Table[166] = syscalls.Supported("umount2", Umount2)
+	s.Table[187] = syscalls.Supported("readahead", Readahead)
 	s.Table[188] = syscalls.Supported("setxattr", Setxattr)
 	s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
 	s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
@@ -105,20 +105,16 @@ func Override() {
 	s.Table[197] = syscalls.Supported("removexattr", Removexattr)
 	s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
 	s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
-	delete(s.Table, 206) // io_setup
-	delete(s.Table, 207) // io_destroy
-	delete(s.Table, 208) // io_getevents
-	delete(s.Table, 209) // io_submit
-	delete(s.Table, 210) // io_cancel
+	s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
 	s.Table[213] = syscalls.Supported("epoll_create", EpollCreate)
 	s.Table[217] = syscalls.Supported("getdents64", Getdents64)
-	delete(s.Table, 221) // fdavise64
+	s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil)
 	s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
 	s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
 	s.Table[235] = syscalls.Supported("utimes", Utimes)
-	delete(s.Table, 253) // inotify_init
-	delete(s.Table, 254) // inotify_add_watch
-	delete(s.Table, 255) // inotify_rm_watch
+	s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+	s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
 	s.Table[257] = syscalls.Supported("openat", Openat)
 	s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
 	s.Table[259] = syscalls.Supported("mknodat", Mknodat)
@@ -134,15 +130,15 @@ func Override() {
 	s.Table[269] = syscalls.Supported("faccessat", Faccessat)
 	s.Table[270] = syscalls.Supported("pselect", Pselect)
 	s.Table[271] = syscalls.Supported("ppoll", Ppoll)
-	delete(s.Table, 275) // splice
-	delete(s.Table, 276) // tee
+	s.Table[275] = syscalls.Supported("splice", Splice)
+	s.Table[276] = syscalls.Supported("tee", Tee)
 	s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
 	s.Table[280] = syscalls.Supported("utimensat", Utimensat)
 	s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
 	s.Table[282] = syscalls.Supported("signalfd", Signalfd)
 	s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
 	s.Table[284] = syscalls.Supported("eventfd", Eventfd)
-	delete(s.Table, 285) // fallocate
+	s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
 	s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
 	s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
 	s.Table[288] = syscalls.Supported("accept4", Accept4)
@@ -151,7 +147,7 @@ func Override() {
 	s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
 	s.Table[292] = syscalls.Supported("dup3", Dup3)
 	s.Table[293] = syscalls.Supported("pipe2", Pipe2)
-	delete(s.Table, 294) // inotify_init1
+	s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
 	s.Table[295] = syscalls.Supported("preadv", Preadv)
 	s.Table[296] = syscalls.Supported("pwritev", Pwritev)
 	s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
@@ -167,6 +163,106 @@ func Override() {
 
 	// Override ARM64.
 	s = linux.ARM64
+	s.Table[5] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr)
+	s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr)
+	s.Table[8] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr)
+	s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr)
+	s.Table[11] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[12] = syscalls.Supported("llistxattr", Llistxattr)
+	s.Table[13] = syscalls.Supported("flistxattr", Flistxattr)
+	s.Table[14] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr)
+	s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr)
+	s.Table[17] = syscalls.Supported("getcwd", Getcwd)
+	s.Table[19] = syscalls.Supported("eventfd2", Eventfd2)
+	s.Table[20] = syscalls.Supported("epoll_create1", EpollCreate1)
+	s.Table[21] = syscalls.Supported("epoll_ctl", EpollCtl)
+	s.Table[22] = syscalls.Supported("epoll_pwait", EpollPwait)
+	s.Table[23] = syscalls.Supported("dup", Dup)
+	s.Table[24] = syscalls.Supported("dup3", Dup3)
+	s.Table[25] = syscalls.Supported("fcntl", Fcntl)
+	s.Table[26] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
+	s.Table[27] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[28] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[29] = syscalls.Supported("ioctl", Ioctl)
+	s.Table[32] = syscalls.Supported("flock", Flock)
+	s.Table[33] = syscalls.Supported("mknodat", Mknodat)
+	s.Table[34] = syscalls.Supported("mkdirat", Mkdirat)
+	s.Table[35] = syscalls.Supported("unlinkat", Unlinkat)
+	s.Table[36] = syscalls.Supported("symlinkat", Symlinkat)
+	s.Table[37] = syscalls.Supported("linkat", Linkat)
+	s.Table[38] = syscalls.Supported("renameat", Renameat)
+	s.Table[39] = syscalls.Supported("umount2", Umount2)
+	s.Table[40] = syscalls.Supported("mount", Mount)
+	s.Table[43] = syscalls.Supported("statfs", Statfs)
+	s.Table[44] = syscalls.Supported("fstatfs", Fstatfs)
+	s.Table[45] = syscalls.Supported("truncate", Truncate)
+	s.Table[46] = syscalls.Supported("ftruncate", Ftruncate)
+	s.Table[48] = syscalls.Supported("faccessat", Faccessat)
+	s.Table[49] = syscalls.Supported("chdir", Chdir)
+	s.Table[50] = syscalls.Supported("fchdir", Fchdir)
+	s.Table[51] = syscalls.Supported("chroot", Chroot)
+	s.Table[52] = syscalls.Supported("fchmod", Fchmod)
+	s.Table[53] = syscalls.Supported("fchmodat", Fchmodat)
+	s.Table[54] = syscalls.Supported("fchownat", Fchownat)
+	s.Table[55] = syscalls.Supported("fchown", Fchown)
+	s.Table[56] = syscalls.Supported("openat", Openat)
+	s.Table[57] = syscalls.Supported("close", Close)
+	s.Table[59] = syscalls.Supported("pipe2", Pipe2)
+	s.Table[61] = syscalls.Supported("getdents64", Getdents64)
+	s.Table[62] = syscalls.Supported("lseek", Lseek)
 	s.Table[63] = syscalls.Supported("read", Read)
+	s.Table[64] = syscalls.Supported("write", Write)
+	s.Table[65] = syscalls.Supported("readv", Readv)
+	s.Table[66] = syscalls.Supported("writev", Writev)
+	s.Table[67] = syscalls.Supported("pread64", Pread64)
+	s.Table[68] = syscalls.Supported("pwrite64", Pwrite64)
+	s.Table[69] = syscalls.Supported("preadv", Preadv)
+	s.Table[70] = syscalls.Supported("pwritev", Pwritev)
+	s.Table[72] = syscalls.Supported("pselect", Pselect)
+	s.Table[73] = syscalls.Supported("ppoll", Ppoll)
+	s.Table[74] = syscalls.Supported("signalfd4", Signalfd4)
+	s.Table[76] = syscalls.Supported("splice", Splice)
+	s.Table[77] = syscalls.Supported("tee", Tee)
+	s.Table[78] = syscalls.Supported("readlinkat", Readlinkat)
+	s.Table[80] = syscalls.Supported("fstat", Fstat)
+	s.Table[81] = syscalls.Supported("sync", Sync)
+	s.Table[82] = syscalls.Supported("fsync", Fsync)
+	s.Table[83] = syscalls.Supported("fdatasync", Fdatasync)
+	s.Table[84] = syscalls.Supported("sync_file_range", SyncFileRange)
+	s.Table[85] = syscalls.Supported("timerfd_create", TimerfdCreate)
+	s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+	s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+	s.Table[88] = syscalls.Supported("utimensat", Utimensat)
+	s.Table[198] = syscalls.Supported("socket", Socket)
+	s.Table[199] = syscalls.Supported("socketpair", SocketPair)
+	s.Table[200] = syscalls.Supported("bind", Bind)
+	s.Table[201] = syscalls.Supported("listen", Listen)
+	s.Table[202] = syscalls.Supported("accept", Accept)
+	s.Table[203] = syscalls.Supported("connect", Connect)
+	s.Table[204] = syscalls.Supported("getsockname", GetSockName)
+	s.Table[205] = syscalls.Supported("getpeername", GetPeerName)
+	s.Table[206] = syscalls.Supported("sendto", SendTo)
+	s.Table[207] = syscalls.Supported("recvfrom", RecvFrom)
+	s.Table[208] = syscalls.Supported("setsockopt", SetSockOpt)
+	s.Table[209] = syscalls.Supported("getsockopt", GetSockOpt)
+	s.Table[210] = syscalls.Supported("shutdown", Shutdown)
+	s.Table[211] = syscalls.Supported("sendmsg", SendMsg)
+	s.Table[212] = syscalls.Supported("recvmsg", RecvMsg)
+	s.Table[221] = syscalls.Supported("execve", Execve)
+	s.Table[222] = syscalls.Supported("mmap", Mmap)
+	s.Table[242] = syscalls.Supported("accept4", Accept4)
+	s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg)
+	s.Table[267] = syscalls.Supported("syncfs", Syncfs)
+	s.Table[269] = syscalls.Supported("sendmmsg", SendMMsg)
+	s.Table[276] = syscalls.Supported("renameat2", Renameat2)
+	s.Table[279] = syscalls.Supported("memfd_create", MemfdCreate)
+	s.Table[281] = syscalls.Supported("execveat", Execveat)
+	s.Table[286] = syscalls.Supported("preadv2", Preadv2)
+	s.Table[287] = syscalls.Supported("pwritev2", Pwritev2)
+	s.Table[291] = syscalls.Supported("statx", Statx)
+
 	s.Init()
 }
diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s
index 5ad57a8a3..8afc62d53 100644
--- a/pkg/sentry/time/muldiv_arm64.s
+++ b/pkg/sentry/time/muldiv_arm64.s
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "funcdata.h"
 #include "textflag.h"
 
 // Documentation is available in parameters.go.
 //
 // func muldiv64(value, multiplier, divisor uint64) (uint64, bool)
 TEXT ·muldiv64(SB),NOSPLIT,$40-33
+    GO_ARGS
+    NO_LOCAL_POINTERS
     MOVD    value+0(FP), R0
     MOVD    multiplier+8(FP), R1
     MOVD    divisor+16(FP), R2
diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go
index 65868cb26..cd1b95117 100644
--- a/pkg/sentry/time/parameters.go
+++ b/pkg/sentry/time/parameters.go
@@ -228,11 +228,15 @@ func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Par
 //
 // The log level is determined by the error severity.
 func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) {
-	fn := log.Debugf
-	if int64(errorNS.Magnitude()) > time.Millisecond.Nanoseconds() {
+	magNS := int64(errorNS.Magnitude())
+	if magNS <= 10*time.Microsecond.Nanoseconds() {
+		// Don't log small errors.
+		return
+	}
+	fn := log.Infof
+	if magNS > time.Millisecond.Nanoseconds() {
+		// Upgrade large errors to warning.
 		fn = log.Warningf
-	} else if int64(errorNS.Magnitude()) > 10*time.Microsecond.Nanoseconds() {
-		fn = log.Infof
 	}
 
 	fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency)
diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go
index e1b9084ac..0ce1257f6 100644
--- a/pkg/sentry/time/parameters_test.go
+++ b/pkg/sentry/time/parameters_test.go
@@ -484,3 +484,18 @@ func TestMulDivOverflow(t *testing.T) {
 		})
 	}
 }
+
+// BenchmarkMuldiv64 times b.N muldiv64 calls, checking each against v / i.
+func BenchmarkMuldiv64(b *testing.B) {
+	const v, mult = uint64(math.MaxUint64), uint64(1000000000)
+	for n := 0; n < b.N; n++ {
+		i := uint64(n%1000000) + 1 // Cycle i through [1, 1000000].
+		res, ok := muldiv64(v, mult, i*mult)
+		if !ok {
+			b.Errorf("Result of %v * %v / %v ok got false want true", v, mult, i*mult)
+		}
+		if want := v / i; res != want {
+			b.Errorf("Bad result of %v * %v / %v: got %v, want %v", v, mult, i*mult, res, want)
+		}
+	}
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 94d69c1cc..642769e7c 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "event_list",
+    out = "event_list.go",
+    package = "vfs",
+    prefix = "event",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Event",
+        "Linker": "*Event",
+    },
+)
+
 go_library(
     name = "vfs",
     srcs = [
@@ -25,11 +37,14 @@ go_library(
         "device.go",
         "epoll.go",
         "epoll_interest_list.go",
+        "event_list.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
         "filesystem_impl_util.go",
         "filesystem_type.go",
+        "inotify.go",
+        "lock.go",
         "mount.go",
         "mount_unsafe.go",
         "options.go",
@@ -57,6 +72,7 @@ go_library(
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 9aa133bcb..4b9faf2ea 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
 -   Mount: Each referenced Mount holds a reference on its parent, which is the
     mount containing its mount point.
 
--   VirtualFilesystem: A reference is held on each Mount that has not been
-    umounted.
+-   VirtualFilesystem: A reference is held on each Mount that has been connected
+    to a mount point, but not yet umounted.
 
 MountNamespace and FileDescription references are held by users of VFS. The
 expectation is that each `kernel.Task` holds a reference on its corresponding
@@ -169,8 +169,6 @@ This construction, which is essentially a type-safe analogue to Linux's
 
     -   binder, which is similarly far too incomplete to use.
 
-    -   whitelistfs, which we are already actively attempting to remove.
-
 -   Save/restore. For instance, it is unclear if the current implementation of
     the `state` package supports the inheritance pattern described above.
 
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index caf770fd5..641e3e502 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -297,3 +297,18 @@ func (d *anonDentry) TryIncRef() bool {
 func (d *anonDentry) DecRef() {
 	// no-op
 }
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches. It always returns nil.
+func (d *anonDentry) Watches() *Watches {
+	return nil
+}
+
+// OnZeroWatches implements DentryImpl.OnZeroWatches. It is a no-op.
+func (d *anonDentry) OnZeroWatches() {}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 8624dbd5d..cea3e6955 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -103,6 +103,39 @@ type DentryImpl interface {
 
 	// DecRef decrements the Dentry's reference count.
 	DecRef()
+
+	// InotifyWithParent notifies all watches on the targets represented by this
+	// dentry and its parent. The parent's watches are notified first, followed
+	// by this dentry's.
+	//
+	// InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+	// representing directories.
+	//
+	// Note that the events may not actually propagate up to the user, depending
+	// on the event masks.
+	InotifyWithParent(events, cookie uint32, et EventType)
+
+	// Watches returns the set of inotify watches for the file corresponding to
+	// the Dentry. Dentries that are hard links to the same underlying file
+	// share the same watches.
+	//
+	// Watches may return nil if the dentry belongs to a FilesystemImpl that
+	// does not support inotify. If an implementation returns a non-nil watch
+	// set, it must always return a non-nil watch set. Likewise, if an
+	// implementation returns a nil watch set, it must always return a nil watch
+	// set.
+	//
+	// The caller does not need to hold a reference on the dentry.
+	Watches() *Watches
+
+	// OnZeroWatches is called whenever the number of watches on a dentry drops
+	// to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage
+	// dentry lifetime.
+	//
+	// The caller does not need to hold a reference on the dentry. OnZeroWatches
+	// may acquire inotify locks, so to prevent deadlock, no inotify locks should
+	// be held by the caller.
+	OnZeroWatches()
 }
 
 // IncRef increments d's reference count.
@@ -133,6 +166,26 @@ func (d *Dentry) isMounted() bool {
 	return atomic.LoadUint32(&d.mounts) != 0
 }
 
+// InotifyWithParent notifies all watches on the targets represented by d and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) {
+	d.impl.InotifyWithParent(events, cookie, et)
+}
+
+// Watches returns the set of inotify watches associated with d.
+//
+// Watches will return nil if d belongs to a FilesystemImpl that does not
+// support inotify.
+func (d *Dentry) Watches() *Watches {
+	return d.impl.Watches()
+}
+
+// OnZeroWatches performs cleanup tasks whenever the number of watches on a
+// dentry drops to zero.
+func (d *Dentry) OnZeroWatches() {
+	d.impl.OnZeroWatches()
+}
+
 // The following functions are exported so that filesystem implementations can
 // use them. The vfs package, and users of VFS, should not call these
 // functions.
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 8297f964b..5b009b928 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -31,6 +31,7 @@ type EpollInstance struct {
 	vfsfd FileDescription
 	FileDescriptionDefaultImpl
 	DentryMetadataFileDescriptionImpl
+	NoLockFD
 
 	// q holds waiters on this EpollInstance.
 	q waiter.Queue
@@ -185,7 +186,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
 	}
 
 	// Register interest in file.
-	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
 	epi := &epollInterest{
 		epoll:    ep,
 		key:      key,
@@ -256,7 +257,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event
 	}
 
 	// Update epi for the next call to ep.ReadEvents().
-	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
 	ep.mu.Lock()
 	epi.mask = mask
 	epi.userData = event.Data
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index cfabd936c..93861fb4a 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -42,11 +42,20 @@ type FileDescription struct {
 	// operations.
 	refs int64
 
+	// flagsMu protects statusFlags and asyncHandler below.
+	flagsMu sync.Mutex
+
 	// statusFlags contains status flags, "initialized by open(2) and possibly
-	// modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
-	// memory operations.
+	// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
+	// memory operations when it does not need to be synchronized with an
+	// access to asyncHandler.
 	statusFlags uint32
 
+	// asyncHandler handles O_ASYNC signal generation. It is set with the
+	// F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
+	// also be set by fcntl(2).
+	asyncHandler FileAsync
+
 	// epolls is the set of epollInterests registered for this FileDescription.
 	// epolls is protected by epollMu.
 	epollMu sync.Mutex
@@ -73,6 +82,8 @@ type FileDescription struct {
 	// writable is analogous to Linux's FMODE_WRITE.
 	writable bool
 
+	usedLockBSD uint32
+
 	// impl is the FileDescriptionImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in FileDescription.
 	impl FileDescriptionImpl
@@ -80,8 +91,7 @@ type FileDescription struct {
 
 // FileDescriptionOptions contains options to FileDescription.Init().
 type FileDescriptionOptions struct {
-	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
-	// usually only the case if O_DIRECT would actually have an effect.
+	// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
 	AllowDirectIO bool
 
 	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
@@ -106,6 +116,10 @@ type FileDescriptionOptions struct {
 	UseDentryMetadata bool
 }
 
+// FileCreationFlags are the set of flags passed to FileDescription.Init() but
+// omitted from FileDescription.StatusFlags().
+const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
+
 // Init must be called before first use of fd. If it succeeds, it takes
 // references on mnt and d. flags is the initial file description flags, which
 // is usually the full set of flags passed to open(2).
@@ -120,8 +134,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
 	fd.refs = 1
 
 	// Remove "file creation flags" to mirror the behavior from file.f_flags in
-	// fs/open.c:do_dentry_open
-	fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
+	// fs/open.c:do_dentry_open.
+	fd.statusFlags = flags &^ FileCreationFlags
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
@@ -175,12 +189,25 @@ func (fd *FileDescription) DecRef() {
 			}
 			ep.interestMu.Unlock()
 		}
+
+		// If BSD locks were used, release any lock that it may have acquired.
+		if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
+			fd.impl.UnlockBSD(context.Background(), fd)
+		}
+
 		// Release implementation resources.
 		fd.impl.Release()
 		if fd.writable {
 			fd.vd.mount.EndWrite()
 		}
 		fd.vd.DecRef()
+		fd.flagsMu.Lock()
+		// TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1.
+		if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+			fd.asyncHandler.Unregister(fd)
+		}
+		fd.asyncHandler = nil
+		fd.flagsMu.Unlock()
 	} else if refs < 0 {
 		panic("FileDescription.DecRef() called without holding a reference")
 	}
@@ -210,6 +237,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry {
 	return fd.vd
 }
 
+// Options returns the options passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+	return fd.opts
+}
+
 // StatusFlags returns file description status flags, as for fcntl(F_GETFL).
 func (fd *FileDescription) StatusFlags() uint32 {
 	return atomic.LoadUint32(&fd.statusFlags)
@@ -259,7 +291,18 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
 	}
 	// TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
 	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
-	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+	fd.flagsMu.Lock()
+	if fd.asyncHandler != nil {
+		// Compare against fd.statusFlags, not the possibly stale oldFlags, so we
+		// never double (un)register. The store below stays atomic for StatusFlags.
+		if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 {
+			fd.asyncHandler.Register(fd)
+		} else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 {
+			fd.asyncHandler.Unregister(fd)
+		}
+	}
+	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+	fd.flagsMu.Unlock()
 	return nil
 }
 
@@ -311,6 +354,10 @@ type FileDescriptionImpl interface {
 	// represented by the FileDescription.
 	StatFS(ctx context.Context) (linux.Statfs, error)
 
+	// Allocate grows the file to offset + length bytes.
+	// Only mode == 0 is supported currently.
+	Allocate(ctx context.Context, mode, offset, length uint64) error
+
 	// waiter.Waitable methods may be used to poll for I/O events.
 	waiter.Waitable
 
@@ -415,24 +462,16 @@ type FileDescriptionImpl interface {
 	Removexattr(ctx context.Context, name string) error
 
 	// LockBSD tries to acquire a BSD-style advisory file lock.
-	//
-	// TODO(gvisor.dev/issue/1480): BSD-style file locking
 	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
 
-	// LockBSD releases a BSD-style advisory file lock.
-	//
-	// TODO(gvisor.dev/issue/1480): BSD-style file locking
+	// UnlockBSD releases a BSD-style advisory file lock.
 	UnlockBSD(ctx context.Context, uid lock.UniqueID) error
 
 	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
-	//
-	// TODO(gvisor.dev/issue/1480): POSIX-style file locking
-	LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+	LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error
 
 	// UnlockPOSIX releases a POSIX-style advisory file lock.
-	//
-	// TODO(gvisor.dev/issue/1480): POSIX-style file locking
-	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
+	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
@@ -462,6 +501,15 @@ type IterDirentsCallback interface {
 	Handle(dirent Dirent) error
 }
 
+// IterDirentsCallbackFunc implements IterDirentsCallback for a function with
+// the semantics of IterDirentsCallback.Handle.
+type IterDirentsCallbackFunc func(dirent Dirent) error
+
+// Handle implements IterDirentsCallback.Handle.
+func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
+	return f(dirent)
+}
+
 // OnClose is called when a file descriptor representing the FileDescription is
 // closed. Returning a non-nil error should not prevent the file descriptor
 // from being closed.
@@ -515,17 +563,28 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
 	return fd.impl.StatFS(ctx)
 }
 
-// Readiness returns fd's I/O readiness.
+// Allocate grows the file represented by fd to offset + length bytes.
+func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return fd.impl.Allocate(ctx, mode, offset, length)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// It returns fd's I/O readiness.
 func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return fd.impl.Readiness(mask)
 }
 
-// EventRegister registers e for I/O readiness events in mask.
+// EventRegister implements waiter.Waitable.EventRegister.
+//
+// It registers e for I/O readiness events in mask.
 func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	fd.impl.EventRegister(e, mask)
 }
 
-// EventUnregister unregisters e for I/O readiness events.
+// EventUnregister implements waiter.Waitable.EventUnregister.
+//
+// It unregisters e for I/O readiness events.
 func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
 	fd.impl.EventUnregister(e)
 }
@@ -731,3 +790,53 @@ func (fd *FileDescription) InodeID() uint64 {
 func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
 	return fd.Sync(ctx)
 }
+
+// LockBSD tries to acquire a BSD-style advisory file lock.
+func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error {
+	atomic.StoreUint32(&fd.usedLockBSD, 1)
+	return fd.impl.LockBSD(ctx, fd, lockType, blocker)
+}
+
+// UnlockBSD releases a BSD-style advisory file lock.
+func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
+	return fd.impl.UnlockBSD(ctx, fd)
+}
+
+// LockPOSIX locks a POSIX-style file range lock given by start/length/whence.
+func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error {
+	return fd.impl.LockPOSIX(ctx, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX unlocks a POSIX-style file range lock given by start/length/whence.
+func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error {
+	return fd.impl.UnlockPOSIX(ctx, uid, start, length, whence)
+}
+
+// A FileAsync sends signals to its owner when w is ready for IO. This is only
+// implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this
+// interface to avoid circular dependencies.
+type FileAsync interface {
+	Register(w waiter.Waitable)
+	Unregister(w waiter.Waitable)
+}
+
+// AsyncHandler returns the FileAsync for fd.
+func (fd *FileDescription) AsyncHandler() FileAsync {
+	fd.flagsMu.Lock()
+	defer fd.flagsMu.Unlock()
+	return fd.asyncHandler
+}
+
+// SetAsyncHandler sets fd.asyncHandler if it has not been set before and
+// returns it.
+func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync {
+	fd.flagsMu.Lock()
+	defer fd.flagsMu.Unlock()
+	if fd.asyncHandler == nil {
+		fd.asyncHandler = newHandler()
+		if fd.statusFlags&linux.O_ASYNC != 0 {
+			fd.asyncHandler.Register(fd)
+		}
+	}
+	return fd.asyncHandler
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index f4c111926..6b8b4ad49 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -21,7 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -56,6 +56,12 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
+// Allocate implements FileDescriptionImpl.Allocate analogously to fallocate
+// called on a file that is not a regular file, directory or FIFO in Linux.
+func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.ENODEV
+}
+
 // Readiness implements waiter.Waitable.Readiness analogously to
 // file_operations::poll == NULL in Linux.
 func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask {
@@ -153,31 +159,16 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
 	return syserror.ENOTSUP
 }
 
-// LockBSD implements FileDescriptionImpl.LockBSD.
-func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
-	return syserror.EBADF
-}
-
-// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
-func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
-	return syserror.EBADF
-}
-
-// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
-func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
-	return syserror.EBADF
-}
-
-// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
-func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
-	return syserror.EBADF
-}
-
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
 type DirectoryFileDescriptionDefaultImpl struct{}
 
+// Allocate implements FileDescriptionImpl.Allocate.
+func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.EISDIR
+}
+
 // PRead implements FileDescriptionImpl.PRead.
 func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
 	return 0, syserror.EISDIR
@@ -347,7 +338,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src
 
 	writable, ok := fd.data.(WritableDynamicBytesSource)
 	if !ok {
-		return 0, syserror.EINVAL
+		return 0, syserror.EIO
 	}
 	n, err := writable.Write(ctx, src, offset)
 	if err != nil {
@@ -384,3 +375,54 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
 	fd.IncRef()
 	return nil
 }
+
+// LockFD may be used by most implementations of FileDescriptionImpl.Lock*
+// functions. Caller must call Init().
+type LockFD struct {
+	locks *FileLocks
+}
+
+// Init initializes fd with FileLocks to use.
+func (fd *LockFD) Init(locks *FileLocks) {
+	fd.locks = locks
+}
+
+// Locks returns the locks associated with this file.
+func (fd *LockFD) Locks() *FileLocks {
+	return fd.locks
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	return fd.locks.LockBSD(uid, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+	fd.locks.UnlockBSD(uid)
+	return nil
+}
+
+// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface
+// returning ENOLCK.
+type NoLockFD struct{}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	return syserror.ENOLCK
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+	return syserror.ENOLCK
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return syserror.ENOLCK
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return syserror.ENOLCK
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 3a75d4d62..3b7e1c273 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -33,6 +33,7 @@ import (
 type fileDescription struct {
 	vfsfd FileDescription
 	FileDescriptionDefaultImpl
+	NoLockFD
 }
 
 // genCount contains the number of times its DynamicBytesSource.Generate()
@@ -154,11 +155,11 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// Write and PWrite fails.
-	if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
-		t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+	if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO {
+		t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
 	}
-	if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
-		t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+	if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO {
+		t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
 	}
 }
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 1edd584c9..6bb9ca180 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -524,8 +524,6 @@ type FilesystemImpl interface {
 	//
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
-
-	// TODO(gvisor.dev/issue/1479): inotify_add_watch()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md
new file mode 100644
index 000000000..e7da49faa
--- /dev/null
+++ b/pkg/sentry/vfs/g3doc/inotify.md
@@ -0,0 +1,210 @@
+# Inotify
+
+Inotify is a mechanism for monitoring filesystem events in Linux--see
+inotify(7). An inotify instance can be used to monitor files and directories for
+modifications, creation/deletion, etc. The inotify API consists of system calls
+that create inotify instances (inotify_init/inotify_init1) and add/remove
+watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are
+generated from various places in the sentry, including the syscall layer, the
+vfs layer, the process fd table, and within each filesystem implementation. This
+document outlines the implementation details of inotify in VFS2.
+
+## Inotify Objects
+
+Inotify data structures are implemented in the vfs package.
+
+### vfs.Inotify
+
+Inotify instances are represented by vfs.Inotify objects, which implement
+vfs.FileDescriptionImpl. As in Linux, inotify fds are backed by a
+pseudo-filesystem (anonfs). Each inotify instance receives events from a set of
+vfs.Watch objects, which can be modified with inotify_add_watch(2) and
+inotify_rm_watch(2). An application can retrieve events by reading the inotify
+fd.
+
+### vfs.Watches
+
+The set of all watches held on a single file (i.e., the watch target) is stored
+in vfs.Watches. Each watch will belong to a different inotify instance (an
+instance can only have one watch on any watch target). The watches are stored in
+a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions
+to a single file will all share the same vfs.Watches. Activity on the target
+causes its vfs.Watches to generate notifications on its watches’ inotify
+instances.
+
+### vfs.Watch
+
+A single watch, owned by one inotify instance and applied to one watch target.
+Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch,
+which leads to some complicated locking behavior (see Lock Ordering). Whenever a
+watch is notified of an event on its target, it will queue events to its inotify
+instance for delivery to the user.
+
+### vfs.Event
+
+vfs.Event is a simple struct encapsulating all the fields for an inotify event.
+It is generated by vfs.Watches and forwarded to the watches' owners. It is
+serialized to the user during read(2) syscalls on the associated vfs.Inotify's
+fd.
+
+## Lock Ordering
+
+There are three locks related to the inotify implementation:
+
+*   Inotify.mu: the inotify instance lock.
+*   Inotify.evMu: the inotify event queue lock.
+*   Watches.mu: the watch set lock, which protects the watches on a target.
+
+The correct lock ordering for inotify code is:
+
+Inotify.mu -> Watches.mu -> Inotify.evMu.
+
+Note that we use a distinct lock to protect the inotify event queue. If we
+simply used Inotify.mu, we could simultaneously have locks being acquired in the
+order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would
+cause deadlocks. For instance, adding a watch to an inotify instance would
+require locking Inotify.mu, and then adding the same watch to the target would
+cause Watches.mu to be held. At the same time, generating an event on the target
+would require Watches.mu to be held before iterating through each watch, and
+then notifying the owner of each watch would cause Inotify.mu to be held.
+
+See the vfs package comment to understand how inotify locks fit into the overall
+ordering of filesystem locks.
+
+## Watch Targets in Different Filesystem Implementations
+
+In Linux, watches reside on inodes at the virtual filesystem layer. As a result,
+all hard links and file descriptions on a single file will all share the same
+watch set. In VFS2, there is no common inode structure across filesystem types
+(some may not even have inodes), so we have to plumb inotify support through
+each specific filesystem implementation. Some of the technical considerations
+are outlined below.
+
+### Tmpfs
+
+For filesystems with inodes, like tmpfs, the design is quite similar to that of
+Linux, where watches reside on the inode.
+
+### Pseudo-filesystems
+
+Technically, because inotify is implemented at the vfs layer in Linux,
+pseudo-filesystems on top of kernfs support inotify passively. However, watches
+can only track explicit filesystem operations like read/write, open/close,
+mknod, etc., so watches on a target like /proc/self/fd will not generate events
+every time a new fd is added or removed. As of this writing, we leave inotify
+unimplemented in kernfs and anonfs; it does not seem particularly useful.
+
+### Gofer Filesystem (fsimpl/gofer)
+
+The gofer filesystem has several traits that make it difficult to support
+inotify:
+
+*   **There are no inodes.** A file is represented as a dentry that holds an
+    unopened p9 file (and possibly an open FID), through which the Sentry
+    interacts with the gofer.
+    *   *Solution:* Because there is no inode structure stored in the sandbox,
+        inotify watches must be held on the dentry. This would be an issue in
+        the presence of hard links, where multiple dentries would need to share
+        the same set of watches, but in VFS2, we do not support the internal
+        creation of hard links on gofer fs. As a result, we make the assumption
+        that every dentry corresponds to a unique inode. However, the next point
+        raises an issue with this assumption:
+*   **The Sentry cannot always be aware of hard links on the remote
+    filesystem.** There is no way for us to confirm whether two files on the
+    remote filesystem are actually links to the same inode. QIDs and inodes are
+    not always 1:1. The assumption that dentries and inodes are 1:1 is
+    inevitably broken if there are remote hard links that we cannot detect.
+    *   *Solution:* this is an issue with gofer fs in general, not only inotify,
+        and we will have to live with it.
+*   **Dentries can be cached, and then evicted.** Dentry lifetime does not
+    correspond to file lifetime. Because gofer fs is not entirely in-memory, the
+    absence of a dentry does not mean that the corresponding file does not
+    exist, nor does a dentry reaching zero references mean that the
+    corresponding file no longer exists. When a dentry reaches zero references,
+    it will be cached, in case the file at that path is needed again in the
+    future. However, the dentry may be evicted from the cache, which will cause
+    a new dentry to be created next time the same file path is used. The
+    existing watches will be lost.
+    *   *Solution:* When a dentry reaches zero references, do not cache it if it
+        has any watches, so we can avoid eviction/destruction. Note that if the
+        dentry was deleted or invalidated (d.vfsd.IsDead()), we should still
+        destroy it along with its watches. Additionally, when a dentry’s last
+        watch is removed, we cache it if it also has zero references. This way,
+        the dentry can eventually be evicted from memory if it is no longer
+        needed.
+*   **Dentries can be invalidated.** Another issue with dentry lifetime is that
+    the remote file at the file path represented may change from underneath the
+    dentry. In this case, the next time that the dentry is used, it will be
+    invalidated and a new dentry will replace it. In this case, it is not clear
+    what should be done with the watches on the old dentry.
+    *   *Solution:* Silently destroy the watches when invalidation occurs. We
+        have no way of knowing exactly what happened, when it happens. Inotify
+        instances on NFS files in Linux probably behave in a similar fashion,
+        since inotify is implemented at the vfs layer and is not aware of the
+        complexities of remote file systems.
+    *   An alternative would be to issue some kind of event upon invalidation,
+        e.g. a delete event, but this has several issues:
+    *   We cannot discern whether the remote file was invalidated because it was
+        moved, deleted, etc. This information is crucial, because these cases
+        should result in different events. Furthermore, the watches should only
+        be destroyed if the file has been deleted.
+    *   Moreover, the mechanism for detecting whether the underlying file has
+        changed is to check whether a new QID is given by the gofer. This may
+        result in false positives, e.g. suppose that the server closed and
+        re-opened the same file, which may result in a new QID.
+    *   Finally, the time of the event may be completely different from the time
+        of the file modification, since a dentry is not immediately notified
+        when the underlying file has changed. It would be quite unexpected to
+        receive the notification when invalidation was triggered, i.e. the next
+        time the file was accessed within the sandbox, because then the
+        read/write/etc. operation on the file would not result in the expected
+        event.
+    *   Another point in favor of the first solution: inotify in Linux can
+        already be lossy on local filesystems (one of the sacrifices made so
+        that filesystem performance isn’t killed), and it is lossy on NFS for
+        similar reasons to gofer fs. Therefore, it is better for inotify to be
+        silent than to emit incorrect notifications.
+*   **There may be external users of the remote filesystem.** We can only track
+    operations performed on the file within the sandbox. This is sufficient
+    under InteropModeExclusive, but whenever there are external users, the set
+    of actions we are aware of is incomplete.
+    *   *Solution:* We could either return an error or just issue a warning when
+        inotify is used without InteropModeExclusive. Although faulty, VFS1
+        allows it when the filesystem is shared, and Linux does the same for
+        remote filesystems (as mentioned above, inotify sits at the vfs level).
+
+## Dentry Interface
+
+For events that must be generated above the vfs layer, we provide the following
+DentryImpl methods to allow interactions with targets on any FilesystemImpl:
+
+*   **InotifyWithParent()** generates events on the dentry’s watches as well as
+    its parent’s.
+*   **Watches()** retrieves the watch set of the target represented by the
+    dentry. This is used to access and modify watches on a target.
+*   **OnZeroWatches()** performs cleanup tasks after the last watch is removed
+    from a dentry. This is needed by gofer fs, which must allow a watched dentry
+    to be cached once it has no more watches. Most implementations can just do
+    nothing. Note that OnZeroWatches() must be called after all inotify locks
+    are released to preserve lock ordering, since it may acquire
+    FilesystemImpl-specific locks.
+
+## IN_EXCL_UNLINK
+
+There are several options that can be set for a watch, specified as part of the
+mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some
+additional support in each filesystem.
+
+A watch with IN_EXCL_UNLINK will not generate events for its target if it
+corresponds to a path that was unlinked. For instance, if an fd is opened on
+“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the
+fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This
+requires each DentryImpl to keep track of whether it has been unlinked, in order
+to determine whether events should be sent to watches with IN_EXCL_UNLINK.
+
+## IN_ONESHOT
+
+One-shot watches expire after generating a single event. When an event occurs,
+all one-shot watches on the target that successfully generated an event are
+removed. Lock ordering can cause the management of one-shot watches to be quite
+expensive; see Watches.Notify() for more information.
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 286510195..8882fa84a 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -43,7 +43,7 @@ type Dentry struct {
 // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
 // either d2's parent or an ancestor of d2's parent.
 func IsAncestorDentry(d, d2 *Dentry) bool {
-	for {
+	for d2 != nil { // Stop at root, where d2.parent == nil.
 		if d2.parent == d {
 			return true
 		}
@@ -52,6 +52,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool {
 		}
 		d2 = d2.parent
 	}
+	return false
 }
 
 // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..167b731ac
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,774 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power of 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotify events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+	PathEvent  EventType = iota
+	InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// +stateify savable
+type Inotify struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+	NoLockFD
+
+	// Unique identifier for this inotify instance. We don't just reuse the
+	// inotify fd because fds can be duped. These should not be exposed to the
+	// user, since we may aggressively reuse an id on S/R.
+	id uint64
+
+	// queue is used to notify interested parties when the inotify instance
+	// becomes readable or writable.
+	queue waiter.Queue `state:"nosave"`
+
+	// evMu *only* protects the events list. We need a separate lock while
+	// queuing events: using mu may violate lock ordering, since at that point
+	// the calling goroutine may already hold Watches.mu.
+	evMu sync.Mutex `state:"nosave"`
+
+	// A list of pending events for this inotify instance. Protected by evMu.
+	events eventList
+
+	// A scratch buffer, used to serialize inotify events. Allocate this
+	// ahead of time for the sake of performance. Protected by evMu.
+	scratch []byte
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+	// instance. Note that Linux starts numbering watch descriptors from 1.
+	nextWatchMinusOne int32
+
+	// Map from watch descriptors to watch objects.
+	watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+	// O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+	flags &^= linux.O_CLOEXEC
+	if flags&^linux.O_NONBLOCK != 0 {
+		return nil, syserror.EINVAL
+	}
+
+	id := uniqueid.GlobalFromContext(ctx)
+	vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+	defer vd.DecRef()
+	fd := &Inotify{
+		id:      id,
+		scratch: make([]byte, inotifyEventBaseSize),
+		watches: make(map[int32]*Watch),
+	}
+	if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+	var ds []*Dentry
+
+	// We need to hold i.mu to avoid a race with concurrent calls to
+	// Inotify.handleDeletion from Watches. There's no risk of Watches
+	// accessing this Inotify after the destructor ends, because we remove all
+	// references to it below.
+	i.mu.Lock()
+	for _, w := range i.watches {
+		// Remove references to the watch from the watches set on the target. We
+		// don't need to worry about the references from i.watches, since this
+		// file description is about to be destroyed.
+		d := w.target
+		ws := d.Watches()
+		// Watchable dentries should never return a nil watch set.
+		if ws == nil {
+			panic("Cannot remove watch from an unwatchable dentry")
+		}
+		ws.Remove(i.id)
+		if ws.Size() == 0 {
+			ds = append(ds, d)
+		}
+	}
+	i.mu.Unlock()
+
+	for _, d := range ds {
+		d.OnZeroWatches()
+	}
+}
+
+// Allocate implements FileDescriptionImpl.Allocate.
+func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	panic("Allocate should not be called on read-only inotify fds")
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+	i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+	ready := waiter.EventMask(0)
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if !i.events.Empty() {
+		ready |= waiter.EventIn
+	}
+
+	return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.PRead.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.PWrite.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	if dst.NumBytes() < inotifyEventBaseSize {
+		return 0, syserror.EINVAL
+	}
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if i.events.Empty() {
+		// Nothing to read yet, tell caller to block.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var writeLen int64
+	for it := i.events.Front(); it != nil; {
+		// Advance `it` before the element is removed from the list, or else
+		// it.Next() will always be nil.
+		event := it
+		it = it.Next()
+
+		// Does the buffer have enough remaining space to hold the event we're
+		// about to write out?
+		if dst.NumBytes() < int64(event.sizeOf()) {
+			if writeLen > 0 {
+				// Buffer wasn't big enough for all pending events, but we did
+				// write some events out.
+				return writeLen, nil
+			}
+			return 0, syserror.EINVAL
+		}
+
+		// Linux always dequeues an available event as long as there's enough
+		// buffer space to copy it out, even if the copy below fails. Emulate
+		// this behaviour.
+		i.events.Remove(event)
+
+		// Buffer has enough space, copy event to the read buffer.
+		n, err := event.CopyTo(ctx, i.scratch, dst)
+		if err != nil {
+			return 0, err
+		}
+
+		writeLen += n
+		dst = dst.DropFirst64(n)
+	}
+	return writeLen, nil
+}
+
+// Ioctl implements FileDescriptionImpl.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch args[1].Int() {
+	case linux.FIONREAD:
+		i.evMu.Lock()
+		defer i.evMu.Unlock()
+		var n uint32
+		for e := i.events.Front(); e != nil; e = e.Next() {
+			n += uint32(e.sizeOf())
+		}
+		var buf [4]byte
+		usermem.ByteOrder.PutUint32(buf[:], n)
+		_, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+	i.evMu.Lock()
+
+	// Check if we should coalesce the event we're about to queue with the last
+	// one currently in the queue. Events are coalesced if they are identical.
+	if last := i.events.Back(); last != nil {
+		if ev.equals(last) {
+			// "Coalesce" the two events by simply not queuing the new one. We
+			// don't need to raise a waiter.EventIn notification because no new
+			// data is available for reading.
+			i.evMu.Unlock()
+			return
+		}
+	}
+
+	i.events.PushBack(ev)
+
+	// Release mutex before notifying waiters because we don't control what they
+	// can do.
+	i.evMu.Unlock()
+
+	i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked. ws must be the watch set for target d.
+func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch {
+	w := &Watch{
+		owner:  i,
+		wd:     i.nextWatchIDLocked(),
+		target: d,
+		mask:   mask,
+	}
+
+	// Hold the watch in this inotify instance as well as the watch set on the
+	// target.
+	i.watches[w.wd] = w
+	ws.Add(w)
+	return w
+}
+
+// nextWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+	i.nextWatchMinusOne++
+	return i.nextWatchMinusOne
+}
+
+// AddWatch installs a watch from this inotify instance on target, or updates
+// the existing one. It returns the watch descriptor, as inotify_add_watch(2).
+//
+// The caller must hold a reference on target.
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) {
+	// Holding i.mu keeps the result of Lookup() below from going stale: only
+	// this instance can add or remove its own watch on target, and it cannot
+	// do so while we hold its lock.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	ws := target.Watches()
+	if ws == nil {
+		// While Linux supports inotify watches on all filesystem types, watches on
+		// filesystems like kernfs are not generally useful, so we do not.
+		return 0, syserror.EPERM
+	}
+
+	existing := ws.Lookup(i.id)
+	if existing == nil {
+		// First watch on target from this instance.
+		return i.newWatchLocked(target, ws, mask).wd, nil
+	}
+
+	// Update the mask of the watch this instance already has on target.
+	updated := mask
+	if mask&linux.IN_MASK_ADD != 0 {
+		// "Add (OR) events to watch mask for this pathname if it already
+		// exists (instead of replacing mask)." -- inotify(7)
+		updated |= atomic.LoadUint32(&existing.mask)
+	}
+	atomic.StoreUint32(&existing.mask, updated)
+	return existing.wd, nil
+}
+
+// RmWatch removes the watch with descriptor 'wd' from this inotify instance and
+// from its target, then queues IN_IGNORED. It returns EINVAL for an unknown wd.
+func (i *Inotify) RmWatch(wd int32) error {
+	i.mu.Lock()
+
+	// Find the watch we were asked to remove.
+	w, ok := i.watches[wd]
+	if !ok {
+		i.mu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// Remove the watch from this instance.
+	delete(i.watches, wd)
+
+	// Remove the watch from the watch target.
+	ws := w.target.Watches()
+	// AddWatch ensures that w.target has a non-nil watch set.
+	if ws == nil {
+		panic("Watched dentry cannot have nil watch set")
+	}
+	ws.Remove(w.OwnerID())
+	remaining := ws.Size()
+	i.mu.Unlock()
+
+	if remaining == 0 {
+		w.target.OnZeroWatches()
+	}
+
+	// Generate the event for the removal.
+	i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+	return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+	// mu protects the fields below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// ws maps the inotify instance id of each owner to its active watch. It is
+	// nil until the first Add and is reset to nil by HandleDeletion.
+	ws map[uint64]*Watch
+}
+
+// Size returns the number of watches currently held by w.
+func (w *Watches) Size() int {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return len(w.ws)
+}
+
+// Lookup returns the watch in w owned by the inotify instance with the given
+// id, or nil if that instance has no watch in w.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.ws[id]
+}
+
+// Add inserts watch into this set of watches, keyed by its owner's id.
+//
+// Precondition: the inotify instance that owns watch must be locked.
+func (w *Watches) Add(watch *Watch) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	id := watch.OwnerID()
+	if w.ws == nil {
+		w.ws = make(map[uint64]*Watch)
+	}
+	// Sanity check, we should never have two watches for one owner on the
+	// same target.
+	if _, dup := w.ws[id]; dup {
+		panic(fmt.Sprintf("Watch collision with ID %+v", id))
+	}
+	w.ws[id] = watch
+}
+
+// Remove deletes the watch owned by the inotify instance with the given id
+// from this set of watches, if there is one. The caller is responsible for
+// generating any watch removal event, as appropriate. Removing an id with no
+// watch in this collection is a no-op.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if w.ws == nil {
+		// This watch set is being destroyed. The thread executing the
+		// destructor is already in the process of deleting all our watches. We
+		// got here with no references on the target because we raced with the
+		// destructor notifying all the watch owners of destruction. See the
+		// comment in Watches.HandleDeletion for why this race exists.
+		return
+	}
+
+	// Remove() may run more than once for the same watch (see the treatment
+	// of one-shot watches in Watches.Notify()), so a missing entry is not an
+	// error. delete() is already a no-op for an id that is absent from the
+	// map, so no existence check is needed.
+	delete(w.ws, id)
+}
+
+// Notify queues a new event on every watch in this set and then removes any
+// one-shot watches that expired as a result. Watches with IN_EXCL_UNLINK are
+// skipped for path events coming from a child that has been unlinked.
+func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) {
+	var hasExpired bool
+	w.mu.RLock()
+	for _, watch := range w.ws {
+		if unlinked && watch.ExcludeUnlinked() && et == PathEvent {
+			continue
+		}
+		if watch.Notify(name, events, cookie) {
+			hasExpired = true
+		}
+	}
+	w.mu.RUnlock()
+
+	if hasExpired {
+		w.cleanupExpiredWatches()
+	}
+}
+
+// cleanupExpiredWatches removes every expired watch in w from its owner. It is
+// relatively expensive and should only be called when there are expired watches.
+func (w *Watches) cleanupExpiredWatches() {
+	// Because of lock ordering, we cannot acquire Inotify.mu for each watch
+	// owner while holding w.mu, so collect the expired watches first. An error
+	// from RmWatch means the watch is already gone and is deliberately ignored.
+	var toRemove []*Watch
+	w.mu.RLock()
+	for _, watch := range w.ws {
+		if atomic.LoadInt32(&watch.expired) == 1 {
+			toRemove = append(toRemove, watch)
+		}
+	}
+	w.mu.RUnlock()
+	for _, watch := range toRemove {
+		watch.owner.RmWatch(watch.wd)
+	}
+}
+
+// HandleDeletion is called when the watch target is destroyed. It sends
+// IN_DELETE_SELF, clears the watch set, detaches each watch from the inotify
+// instance that owns it, and queues IN_IGNORED for every watch it detached.
+func (w *Watches) HandleDeletion() {
+	w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */)
+
+	// As in cleanupExpiredWatches, we can't hold w.mu while acquiring
+	// Inotify.mu for the owner of each watch being deleted. Instead, atomically
+	// move the watches map into a local variable and set w.ws to nil so we can
+	// iterate over it with the assurance that there will be no concurrent accesses.
+	var ws map[uint64]*Watch
+	w.mu.Lock()
+	ws = w.ws
+	w.ws = nil
+	w.mu.Unlock()
+
+	// Remove each watch from its owner's watch set, and generate a corresponding
+	// watch removal event.
+	for _, watch := range ws {
+		i := watch.owner
+		i.mu.Lock()
+		_, found := i.watches[watch.wd]
+		delete(i.watches, watch.wd)
+
+		// Release mutex before notifying waiters because we don't control what
+		// they can do.
+		i.mu.Unlock()
+
+		// If watch was not found, it was removed from the inotify instance before
+		// we could get to it, in which case we should not generate an event.
+		if found {
+			i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))
+		}
+	}
+}
+
+// Watch represents a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+	// Inotify instance which owns this watch.
+	//
+	// This field is immutable after creation.
+	owner *Inotify
+
+	// Descriptor for this watch. This is unique across an inotify instance.
+	//
+	// This field is immutable after creation.
+	wd int32
+
+	// target is a dentry representing the watch target. Its watch set contains this watch.
+	//
+	// This field is immutable after creation.
+	target *Dentry
+
+	// Events being monitored via this watch. Must be accessed with atomic
+	// memory operations.
+	mask uint32
+
+	// expired is set to 1 to indicate that this watch is a one-shot that has
+	// already sent a notification and therefore can be removed. Must be accessed
+	// with atomic memory operations.
+	expired int32
+}
+
+// OwnerID returns the id of the owning inotify instance (the key in Watches).
+func (w *Watch) OwnerID() uint64 {
+	return w.owner.id
+}
+
+// ExcludeUnlinked reports whether IN_EXCL_UNLINK is set on this watch, i.e.
+// whether it opts out of events originating from a path that has been unlinked.
+//
+// For example, if "foo/bar" is opened and then unlinked, operations on the
+// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK.
+func (w *Watch) ExcludeUnlinked() bool {
+	return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on the owner of this watch if the watch's mask
+// selects any of events. It returns true if this is a one-shot watch that
+// should be deleted now that the event has been queued.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) bool {
+	// An expired watch is a one-shot watch that is already being removed; a
+	// further event may still reach the target before the removal completes.
+	if atomic.LoadInt32(&w.expired) == 1 {
+		return false
+	}
+
+	watched := atomic.LoadUint32(&w.mask)
+	if watched&events == 0 {
+		// We weren't watching for this event.
+		return false
+	}
+
+	// Report the bits matched from the watch plus all control event bits,
+	// i.e. every bit outside IN_ALL_EVENTS.
+	controlBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+	w.owner.queueEvent(newEvent(w.wd, name, (controlBits|watched)&events, cookie))
+
+	oneShot := watched&linux.IN_ONESHOT != 0
+	if oneShot {
+		// The single event of this one-shot watch has now been delivered.
+		atomic.StoreInt32(&w.expired, 1)
+	}
+	return oneShot
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+	eventEntry
+
+	wd     int32
+	mask   uint32
+	cookie uint32
+
+	// len is computed from the name field and is set automatically by
+	// Event.setName. It should be 0 when no name is set; otherwise it is the
+	// length of the name slice.
+	len uint32
+
+	// The name field has special padding requirements and should only be set by
+	// calling Event.setName.
+	name []byte
+}
+
+// newEvent returns an event for watch descriptor wd carrying the given event
+// mask and cookie. A non-empty name is attached through Event.setName.
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+	ev := new(Event)
+	ev.wd, ev.mask, ev.cookie = wd, events, cookie
+	// Nameless events keep a zero len and a nil name.
+	if len(name) > 0 {
+		ev.setName(name)
+	}
+	return ev
+}
+
+// paddedBytes returns s as a null-terminated c-string occupying exactly 'l'
+// bytes, padded with null bytes. It panics unless 'l' leaves room for all the
+// bytes in 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+	if uint32(len(s)+1) > l {
+		panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+	}
+
+	// make() zero-fills the buffer, so everything after the copied string is
+	// already null padding, including the terminator. Nothing else needs to
+	// be written.
+	padded := make([]byte, l)
+	copy(padded, s)
+	return padded
+}
+
+// setName stores name in this event, null-terminated and padded, and sets len.
+func (e *Event) setName(name string) {
+	// The name (plus its terminating null byte) is padded up to the nearest
+	// multiple of inotifyEventBaseSize.
+	alignMask := inotifyEventBaseSize - 1
+	needed := len(name) + 1
+	e.len = uint32((needed + alignMask) &^ alignMask)
+	// Make sure we haven't overflowed and wrapped around when rounding.
+	if int(e.len) < needed {
+		panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+	}
+	e.name = paddedBytes(name, e.len)
+}
+
+// sizeOf returns the serialized size of this event: header plus padded name.
+func (e *Event) sizeOf() int {
+	if total := inotifyEventBaseSize + int(e.len); total >= inotifyEventBaseSize {
+		return total
+	}
+	panic("Overflowed event size")
+}
+
+// CopyTo serializes this event to dst and returns the number of bytes written.
+// buf is a caller-allocated scratch buffer for the fixed-size header; all of it
+// is copied out, so it should be exactly inotifyEventBaseSize bytes long.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+	usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+	usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+	usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+	usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+	writeLen := 0
+
+	n, err := dst.CopyOut(ctx, buf)
+	if err != nil {
+		return 0, err
+	}
+	writeLen += n
+	dst = dst.DropFirst(n)
+
+	if e.len > 0 {
+		n, err = dst.CopyOut(ctx, e.name)
+		if err != nil {
+			return 0, err
+		}
+		writeLen += n
+	}
+
+	// Sanity check.
+	if writeLen != e.sizeOf() {
+		panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+	}
+
+	return int64(writeLen), nil
+}
+
+// equals reports whether e and other are identical in every serialized field.
+func (e *Event) equals(other *Event) bool {
+	if e.wd != other.wd || e.mask != other.mask || e.cookie != other.cookie {
+		return false
+	}
+	return e.len == other.len && bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask returns the inotify events (IN_ATTRIB, IN_MODIFY,
+// IN_ACCESS) implied by an operation that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+	var ev uint32
+	if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+		ev |= linux.IN_ATTRIB
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		ev |= linux.IN_MODIFY
+	}
+	// Timestamp changes: at most one of the three branches below applies.
+	if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+		// Both times indicates a utime(s) call.
+		ev |= linux.IN_ATTRIB
+	} else if mask&linux.STATX_ATIME != 0 {
+		ev |= linux.IN_ACCESS
+	} else if mask&linux.STATX_MTIME != 0 {
+		ev |= linux.IN_MODIFY
+	}
+	return ev
+}
+
+// InotifyRemoveChild sends the appropriate notifications to the watch sets of
+// the child being removed and its parent. Note that unlike most pairs of
+// parent/child notifications, the child is notified first in this case.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+	if self != nil {
+		self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */)
+	}
+	if parent != nil {
+		parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */)
+	}
+}
+
+// InotifyRename sends the appropriate notifications to the watch sets of the
+// file being renamed and its old/new parents. Any of the sets may be nil.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+	var dirEv uint32
+	if isDir {
+		dirEv = linux.IN_ISDIR
+	}
+	cookie := uniqueid.InotifyCookie(ctx)
+	if oldParent != nil {
+		oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */)
+	}
+	if newParent != nil {
+		newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */)
+	}
+	// Somewhat surprisingly, self move events do not have a cookie.
+	if renamed != nil {
+		renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */)
+	}
+}
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock.go
index 724dfe743..6c7583a81 100644
--- a/pkg/sentry/vfs/lock/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -17,9 +17,11 @@
 //
 // The actual implementations can be found in the lock package under
 // sentry/fs/lock.
-package lock
+package vfs
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -56,7 +58,11 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
 }
 
 // LockPOSIX tries to acquire a POSIX-style lock on a file region.
-func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	rng, err := computeRange(ctx, fd, start, length, whence)
+	if err != nil {
+		return err
+	}
 	if fl.posix.LockRegion(uid, t, rng, block) {
 		return nil
 	}
@@ -67,6 +73,37 @@ func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fsloc
 //
 // This operation is always successful, even if there did not exist a lock on
 // the requested region held by uid in the first place.
-func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	rng, err := computeRange(ctx, fd, start, length, whence)
+	if err != nil {
+		return err
+	}
 	fl.posix.UnlockRegion(uid, rng)
+	return nil
+}
+
+// computeRange converts (start, length) relative to whence into a lock range.
+func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) {
+	var base int64
+	switch whence {
+	case linux.SEEK_SET:
+		// Offsets are relative to the start of the file; base stays 0.
+	case linux.SEEK_CUR:
+		// Note that Linux does not hold any mutexes while retrieving the file
+		// offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+		cur, err := fd.Seek(ctx, 0, linux.SEEK_CUR)
+		if err != nil {
+			return fslock.LockRange{}, err
+		}
+		base = cur
+	case linux.SEEK_END:
+		st, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE})
+		if err != nil {
+			return fslock.LockRange{}, err
+		}
+		base = int64(st.Size)
+	default:
+		return fslock.LockRange{}, syserror.EINVAL
+	}
+	return fslock.ComputeRange(int64(start), int64(length), base)
+}
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
deleted file mode 100644
index d9ab063b7..000000000
--- a/pkg/sentry/vfs/lock/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "lock",
-    srcs = ["lock.go"],
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/sentry/fs/lock",
-        "//pkg/syserror",
-    ],
-)
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 02850b65c..32f901bd8 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -28,9 +28,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// lastMountID is used to allocate mount ids. Must be accessed atomically.
-var lastMountID uint64
-
 // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
 // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
 // (Mount.fs), which applies to path resolution in the context of a particular
@@ -58,6 +55,10 @@ type Mount struct {
 	// ID is the immutable mount ID.
 	ID uint64
 
+	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+	// for MS_RDONLY which is tracked in "writers". Immutable.
+	Flags MountFlags
+
 	// key is protected by VirtualFilesystem.mountMu and
 	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
 	// key.parent and key.point if they are not nil.
@@ -84,10 +85,6 @@ type Mount struct {
 	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
 	umounted bool
 
-	// flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
-	// for MS_RDONLY which is tracked in "writers".
-	flags MountFlags
-
 	// The lower 63 bits of writers is the number of calls to
 	// Mount.CheckBeginWrite() that have not yet been paired with a call to
 	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
@@ -97,11 +94,11 @@ type Mount struct {
 
 func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
 	mnt := &Mount{
-		ID:    atomic.AddUint64(&lastMountID, 1),
+		ID:    atomic.AddUint64(&vfs.lastMountID, 1),
+		Flags: opts.Flags,
 		vfs:   vfs,
 		fs:    fs,
 		root:  root,
-		flags: opts.Flags,
 		ns:    mntns,
 		refs:  1,
 	}
@@ -111,8 +108,17 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
 	return mnt
 }
 
-// A MountNamespace is a collection of Mounts.
-//
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+	var opts MountOptions
+	mnt.vfs.mountMu.Lock()
+	defer mnt.vfs.mountMu.Unlock()
+	// Snapshot both settings while holding mountMu.
+	opts.Flags, opts.ReadOnly = mnt.Flags, mnt.readOnly()
+	return opts
+}
+
+// A MountNamespace is a collection of Mounts.
 // MountNamespaces are reference-counted. Unless otherwise specified, all
 // MountNamespace methods require that a reference is held.
 //
@@ -120,6 +126,9 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
 //
 // +stateify savable
 type MountNamespace struct {
+	// Owner is the usernamespace that owns this mount namespace.
+	Owner *auth.UserNamespace
+
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
@@ -148,7 +157,7 @@ type MountNamespace struct {
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
-		ctx.Warningf("Unknown filesystem: %s", fsTypeName)
+		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -156,6 +165,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 		return nil, err
 	}
 	mntns := &MountNamespace{
+		Owner:       creds.UserNamespace,
 		refs:        1,
 		mountpoints: make(map[*Dentry]uint32),
 	}
@@ -175,26 +185,34 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry,
 	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
 }
 
-// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
-		return syserror.ENODEV
+		return nil, syserror.ENODEV
 	}
 	if !opts.InternalMount && !rft.opts.AllowUserMount {
-		return syserror.ENODEV
+		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
-		return err
+		return nil, err
 	}
+	defer root.DecRef()
+	defer fs.DecRef()
+	return vfs.NewDisconnectedMount(fs, root, opts)
+}
 
+// ConnectMountAt connects mnt at the path represented by target.
+//
+// Preconditions: mnt must be disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
 	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
 	// lock ordering.
 	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
 	if err != nil {
-		root.DecRef()
-		fs.DecRef()
 		return err
 	}
 	vfs.mountMu.Lock()
@@ -204,8 +222,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
-			root.DecRef()
-			fs.DecRef()
 			return syserror.ENOENT
 		}
 		// vd might have been mounted over between vfs.GetDentryAt() and
@@ -238,7 +254,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	// point and the mount root are directories, or neither are, and returns
 	// ENOTDIR if this is not the case.
 	mntns := vd.mount.ns
-	mnt := newMount(vfs, fs, root, mntns, opts)
 	vfs.mounts.seq.BeginWrite()
 	vfs.connectLocked(mnt, vd, mntns)
 	vfs.mounts.seq.EndWrite()
@@ -247,6 +262,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	return nil
 }
 
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+	if err != nil {
+		return err
+	}
+	// MountDisconnected returned a reference on mnt that this function owns.
+	// connectLocked takes a separate reference when the mount is connected, so
+	// ours is dropped whether or not ConnectMountAt succeeds.
+	defer mnt.DecRef()
+	return vfs.ConnectMountAt(ctx, creds, mnt, target)
+}
+
 // UmountAt removes the Mount at the given path.
 func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
 	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
@@ -254,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 	}
 
 	// MNT_FORCE is currently unimplemented except for the permission check.
+	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+	// namespace, and not in the owner user namespace for the target mount. See
+	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
 	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
 		return syserror.EPERM
 	}
@@ -369,14 +400,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
 // references held by vd.
 //
 // Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil.
+// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
+// must not already be connected.
 func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+	if checkInvariants {
+		if mnt.parent() != nil {
+			panic("VFS.connectLocked called on connected mount")
+		}
+	}
+	mnt.IncRef() // dropped by callers of umountRecursiveLocked
 	mnt.storeKey(vd)
 	if vd.mount.children == nil {
 		vd.mount.children = make(map[*Mount]struct{})
 	}
 	vd.mount.children[mnt] = struct{}{}
 	atomic.AddUint32(&vd.dentry.mounts, 1)
+	mnt.ns = mntns
 	mntns.mountpoints[vd.dentry]++
 	vfs.mounts.insertSeqed(mnt)
 	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
@@ -394,6 +433,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
 // writer critical section. mnt.parent() != nil.
 func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
 	vd := mnt.loadKey()
+	// A connected mount has a non-nil key mount; a nil one means mnt is
+	// already disconnected, which violates this function's precondition.
+	if checkInvariants && vd.mount == nil {
+		panic("VFS.disconnectLocked called on disconnected mount")
+	}
 	mnt.storeKey(VirtualDentry{})
 	delete(vd.mount.children, mnt)
 	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
@@ -715,7 +759,10 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
 		if mnt.readOnly() {
 			opts = "ro"
 		}
-		if mnt.flags.NoExec {
+		if mnt.Flags.NoATime {
+			opts = ",noatime"
+		}
+		if mnt.Flags.NoExec {
 			opts += ",noexec"
 		}
 
@@ -800,11 +847,12 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 		if mnt.readOnly() {
 			opts = "ro"
 		}
-		if mnt.flags.NoExec {
+		if mnt.Flags.NoATime {
+			opts += ",noatime"
+		}
+		if mnt.Flags.NoExec {
 			opts += ",noexec"
 		}
-		// TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
-		// set.
 		fmt.Fprintf(buf, "%s ", opts)
 
 		// (7) Optional fields: zero or more fields of the form "tag[:value]".
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bc7581698..70f850ca4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 53d364c5c..dfc8573fd 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -75,6 +75,21 @@ type MknodOptions struct {
 type MountFlags struct {
 	// NoExec is equivalent to MS_NOEXEC.
 	NoExec bool
+
+	// NoATime is equivalent to MS_NOATIME and indicates that the
+	// filesystem should not update access time in-place.
+	NoATime bool
+
+	// NoDev is equivalent to MS_NODEV and indicates that the
+	// filesystem should not allow access to devices (special files).
+	// TODO(gVisor.dev/issue/3186): respect this flag in non FUSE
+	// filesystems.
+	NoDev bool
+
+	// NoSUID is equivalent to MS_NOSUID and indicates that the
+	// filesystem should not honor set-user-ID and set-group-ID bits or
+	// file capabilities when executing programs.
+	NoSUID bool
 }
 
 // MountOptions contains options to VirtualFilesystem.MountAt().
@@ -149,6 +164,12 @@ type SetStatOptions struct {
 	// == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
 	// instead).
 	Stat linux.Statx
+
+	// NeedWritePerm indicates that write permission on the file is needed for
+	// this operation. This is needed for truncate(2) (note that ftruncate(2)
+	// does not require the same check--instead, it checks that the fd is
+	// writable).
+	NeedWritePerm bool
 }
 
 // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f9647f90e..33389c1df 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -94,6 +94,37 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu
 	return syserror.EACCES
 }
 
+// MayLink reports whether creds may create a hard link to a file with the
+// given mode, kuid, and kgid. It returns nil if the link is permitted and
+// EPERM otherwise.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+	// Source inode owner can hardlink all they like; otherwise, it must be a
+	// safe source.
+	if CanActAsOwner(creds, kuid) {
+		return nil
+	}
+
+	// The checks below run in order and stop at the first failure. Executable
+	// setgid files should not get pinned to the filesystem either, but we
+	// don't support S_IXGRP anyway, so there is no case for them.
+	switch {
+	case mode.FileType() != linux.S_IFREG:
+		// Only regular files can be hard linked.
+		return syserror.EPERM
+	case mode&linux.S_ISUID != 0:
+		// Setuid files should not get pinned to the filesystem.
+		return syserror.EPERM
+	case GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid) != nil:
+		// Hardlinking to unreadable or unwritable sources is dangerous. The
+		// underlying permission error is deliberately reported as EPERM.
+		return syserror.EPERM
+	}
+
+	return nil
+}
+
 // AccessTypesForOpenFlags returns the access types required to open a file
 // with the given OpenOptions.Flags. Note that this is NOT the same thing as
 // the set of accesses permitted for the opened file:
@@ -152,7 +183,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 // CheckSetStat checks that creds has permission to change the metadata of a
 // file with the given permissions, UID, and GID as specified by stat, subject
 // to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+	stat := &opts.Stat
 	if stat.Mask&linux.STATX_SIZE != 0 {
 		limit, err := CheckLimit(ctx, 0, int64(stat.Size))
 		if err != nil {
@@ -184,6 +216,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
 			return syserror.EPERM
 		}
 	}
+	if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
+		if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
+			return err
+		}
+	}
 	if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
 		if !CanActAsOwner(creds, kuid) {
 			if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
@@ -199,6 +236,20 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
 	return nil
 }
 
+// CheckDeleteSticky checks whether the sticky bit is set on a directory with
+// the given file mode, and if so, checks whether creds has permission to
+// remove a file owned by childKUID from that directory. CheckDeleteSticky is
+// consistent with include/linux/fs.h:check_sticky().
+func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error {
+	sticky := parentMode&linux.ModeSticky != 0
+	// In a sticky directory, only callers that can act as the owner of the
+	// child may remove it; without the sticky bit there is no restriction.
+	if sticky && !CanActAsOwner(creds, childKUID) {
+		return syserror.EPERM
+	}
+	return nil
+}
+
 // CanActAsOwner returns true if creds can act as the owner of a file with the
 // given owning UID, consistent with Linux's
 // fs/inode.c:inode_owner_or_capable().
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8d7f8f8af..522e27475 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -24,6 +24,9 @@
 //           Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
 //         VirtualFilesystem.filesystemsMu
 //       EpollInstance.mu
+//       Inotify.mu
+//         Watches.mu
+//           Inotify.evMu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
@@ -82,6 +85,10 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// lastMountID is the last allocated mount ID. lastMountID is accessed
+	// using atomic memory operations.
+	lastMountID uint64
+
 	// anonMount is a Mount, not included in mounts or mountpoints,
 	// representing an anonFilesystem. anonMount is used to back
 	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -116,6 +123,9 @@ type VirtualFilesystem struct {
 
 // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
 func (vfs *VirtualFilesystem) Init() error {
+	if vfs.mountpoints != nil {
+		panic("VFS already initialized")
+	}
 	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
 	vfs.devices = make(map[devTuple]*registeredDevice)
 	vfs.anonBlockDevMinorNext = 1
@@ -401,7 +411,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 			vfs.putResolvingPath(rp)
 
 			if opts.FileExec {
-				if fd.Mount().flags.NoExec {
+				if fd.Mount().Flags.NoExec {
 					fd.DecRef()
 					return nil, syserror.EACCES
 				}
@@ -418,6 +428,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 				}
 			}
 
+			fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
 			return fd, nil
 		}
 		if !rp.handleError(err) {
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 101497ed6..748273366 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -77,7 +77,10 @@ var DefaultOpts = Opts{
 // trigger it.
 const descheduleThreshold = 1 * time.Second
 
-var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+var (
+	stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+	stuckTasks   = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
 
 // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
 var stackDumpSameTaskPeriod = time.Minute
@@ -220,6 +223,9 @@ func (w *Watchdog) waitForStart() {
 		// We are fine.
 		return
 	}
+
+	stuckStartup.Increment()
+
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
 	w.doAction(w.StartupTimeoutAction, false, &buf)
@@ -323,13 +329,13 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog goroutine is stuck:")
+	buf.WriteString("Watchdog goroutine is stuck")
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
 // doAction will take the given action. If the action is LogWarning, the stack
-// is not always dumpped to the log to prevent log flooding. "forceStack"
-// guarantees that the stack will be dumped regarless.
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
 func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
 	switch action {
 	case LogWarning: