77 files changed, 6324 insertions, 622 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 585764223..93512c9b6 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -18,6 +18,7 @@ go_library(
         "//pkg/context",
         "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index c03c65445..e6fda2b4f 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -116,6 +116,8 @@ type rootInode struct {
 	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
 
+	locks vfs.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -183,7 +185,7 @@ func (i *rootInode) masterClose(t *Terminal) {
 
 // Open implements kernfs.Inode.Open.
 func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 7a7ce5d81..1081fff52 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
@@ -34,6 +35,8 @@ type masterInode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks vfs.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -55,6 +58,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf
 		inode: mi,
 		t:     t,
 	}
+	fd.LockFD.Init(&mi.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		mi.DecRef()
 		return nil, err
@@ -63,8 +67,8 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf
 }
 
 // Stat implements kernfs.Inode.Stat.
-func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := mi.InodeAttrs.Stat(vfsfs, opts)
+func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
@@ -85,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 type masterFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	inode *masterInode
 	t     *Terminal
@@ -181,7 +186,17 @@ func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatO
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return mfd.inode.Stat(fs, opts)
+	return mfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence)
 }
 
 // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index 526cd406c..a91cae3ef 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -33,6 +34,8 @@ type slaveInode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks vfs.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -51,6 +54,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs
 	fd := &slaveFileDescription{
 		inode: si,
 	}
+	fd.LockFD.Init(&si.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		si.DecRef()
 		return nil, err
@@ -69,8 +73,8 @@ func (si *slaveInode) Valid(context.Context) bool {
 }
 
 // Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := si.InodeAttrs.Stat(vfsfs, opts)
+func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
@@ -91,6 +95,7 @@ func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 type slaveFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	inode *slaveInode
 }
@@ -127,7 +132,7 @@ func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequen
 	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
 }
 
-// Ioctl implements vfs.FileDescripionImpl.Ioctl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
@@ -178,5 +183,15 @@ func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOp
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.Stat(fs, opts)
+	return sfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 142ee53b0..d0e06cdc0 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -136,6 +136,8 @@ func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation {
 // CreateDeviceFile creates a device special file at the given pathname in the
 // devtmpfs instance accessed by the Accessor.
 func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error {
+	actx := a.wrapContext(ctx)
+
 	mode := (linux.FileMode)(perms)
 	switch kind {
 	case vfs.BlockDevice:
@@ -145,12 +147,24 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 	default:
 		panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind))
 	}
+
+	// Create any parent directories. See
+	// devtmpfs.c:handle_create()=>path_create().
+	for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
+		pop := a.pathOperationAt(it.String())
+		if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
+			Mode: 0755,
+		}); err != nil {
+			return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
+		}
+	}
+
 	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
 	// create, which it recognizes by storing a pointer to the kdevtmpfs struct
 	// thread in struct inode::i_private. Accessor doesn't yet support deletion
 	// of files at all, and probably won't as long as we don't need to support
 	// kernel modules, so this is moot for now.
-	return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
+	return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
 		Mode:     mode,
 		DevMajor: major,
 		DevMinor: minor,
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index c573d7935..d12d78b84 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -37,6 +37,7 @@ type EventFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// queue is used to notify interested parties when the event object
 	// becomes readable or writable.
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index ff861d0fe..abc610ef3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -54,6 +54,7 @@ go_library(
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
@@ -95,7 +96,7 @@ go_test(
         "//pkg/syserror",
         "//pkg/test/testutil",
         "//pkg/usermem",
-        "@com_github_google_go-cmp//cmp:go_default_library",
-        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index a2d8c3ad6..8bb104ff0 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -58,15 +58,16 @@ var _ io.ReaderAt = (*blockMapFile)(nil)
 
 // newBlockMapFile is the blockMapFile constructor. It initializes the file to
 // physical blocks map with (at most) the first 12 (direct) blocks.
-func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
-	file := &blockMapFile{regFile: regFile}
+func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
+	file := &blockMapFile{}
 	file.regFile.impl = file
+	file.regFile.inode.init(args, &file.regFile)
 
 	for i := uint(0); i < 4; i++ {
-		file.coverage[i] = getCoverage(regFile.inode.blkSize, i)
+		file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i)
 	}
 
-	blkMap := regFile.inode.diskInode.Data()
+	blkMap := file.regFile.inode.diskInode.Data()
 	binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
 	binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
 	binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 181727ef7..6fa84e7aa 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -85,20 +85,6 @@ func (n *blkNumGen) next() uint32 {
 // the inode covers and that is written to disk.
 func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
-	regFile := regularFile{
-		inode: inode{
-			fs: &filesystem{
-				dev: bytes.NewReader(mockDisk),
-			},
-			diskInode: &disklayout.InodeNew{
-				InodeOld: disklayout.InodeOld{
-					SizeLo: getMockBMFileFize(),
-				},
-			},
-			blkSize: uint64(mockBMBlkSize),
-		},
-	}
-
 	var fileData []byte
 	blkNums := newBlkNumGen()
 	var data []byte
@@ -125,9 +111,20 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
 	fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
 
-	copy(regFile.inode.diskInode.Data(), data)
+	args := inodeArgs{
+		fs: &filesystem{
+			dev: bytes.NewReader(mockDisk),
+		},
+		diskInode: &disklayout.InodeNew{
+			InodeOld: disklayout.InodeOld{
+				SizeLo: getMockBMFileFize(),
+			},
+		},
+		blkSize: uint64(mockBMBlkSize),
+	}
+	copy(args.diskInode.Data(), data)
 
-	mockFile, err := newBlockMapFile(regFile)
+	mockFile, err := newBlockMapFile(args)
 	if err != nil {
 		t.Fatalf("newBlockMapFile failed: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index bfbd7c3d4..55902322a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -60,3 +60,20 @@ func (d *dentry) DecRef() {
 	// inode.decRef().
 	d.inode.decRef()
 }
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) OnZeroWatches() {}
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 12b875c8f..357512c7e 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -54,16 +55,15 @@ type directory struct {
 }
 
 // newDirectory is the directory constructor.
-func newDirectory(inode inode, newDirent bool) (*directory, error) {
+func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
 	file := &directory{
-		inode:      inode,
 		childCache: make(map[string]*dentry),
 		childMap:   make(map[string]*dirent),
 	}
-	file.inode.impl = file
+	file.inode.init(args, file)
 
 	// Initialize childList by reading dirents from the underlying file.
-	if inode.diskInode.Flags().Index {
+	if args.diskInode.Flags().Index {
 		// TODO(b/134676337): Support hash tree directories. Currently only the '.'
 		// and '..' entries are read in.
 
@@ -74,7 +74,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
 
 	// The dirents are organized in a linear array in the file data.
 	// Extract the file data and decode the dirents.
-	regFile, err := newRegularFile(inode)
+	regFile, err := newRegularFile(args)
 	if err != nil {
 		return nil, err
 	}
@@ -82,7 +82,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
 	// buf is used as scratch space for reading in dirents from disk and
 	// unmarshalling them into dirent structs.
 	buf := make([]byte, disklayout.DirentSize)
-	size := inode.diskInode.Size()
+	size := args.diskInode.Size()
 	for off, inc := uint64(0), uint64(0); off < size; off += inc {
 		toRead := size - off
 		if toRead > disklayout.DirentSize {
@@ -306,3 +306,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 	fd.off = offset
 	return offset, nil
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 11dcc0346..c36225a7c 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -38,9 +38,10 @@ var _ io.ReaderAt = (*extentFile)(nil)
 // newExtentFile is the extent file constructor. It reads the entire extent
 // tree into memory.
 // TODO(b/134676337): Build extent tree on demand to reduce memory usage.
-func newExtentFile(regFile regularFile) (*extentFile, error) {
-	file := &extentFile{regFile: regFile}
+func newExtentFile(args inodeArgs) (*extentFile, error) {
+	file := &extentFile{}
 	file.regFile.impl = file
+	file.regFile.inode.init(args, &file.regFile)
 	err := file.buildExtTree()
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index a2382daa3..cd10d46ee 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -177,21 +177,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 	t.Helper()
 
 	mockDisk := make([]byte, mockExtentBlkSize*10)
-	mockExtentFile := &extentFile{
-		regFile: regularFile{
-			inode: inode{
-				fs: &filesystem{
-					dev: bytes.NewReader(mockDisk),
-				},
-				diskInode: &disklayout.InodeNew{
-					InodeOld: disklayout.InodeOld{
-						SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
-					},
-				},
-				blkSize: mockExtentBlkSize,
+	mockExtentFile := &extentFile{}
+	args := inodeArgs{
+		fs: &filesystem{
+			dev: bytes.NewReader(mockDisk),
+		},
+		diskInode: &disklayout.InodeNew{
+			InodeOld: disklayout.InodeOld{
+				SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
 			},
 		},
+		blkSize: mockExtentBlkSize,
 	}
+	mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile)
 
 	fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
 
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 92f7da40d..90b086468 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,6 +26,7 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 485f86f4b..30636cf66 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -54,6 +54,8 @@ type inode struct {
 	// diskInode gives us access to the inode struct on disk. Immutable.
 	diskInode disklayout.Inode
 
+	locks vfs.FileLocks
+
 	// This is immutable. The first field of the implementations must have inode
 	// as the first field to ensure temporality.
 	impl interface{}
@@ -115,7 +117,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	}
 
 	// Build the inode based on its type.
-	inode := inode{
+	args := inodeArgs{
 		fs:        fs,
 		inodeNum:  inodeNum,
 		blkSize:   blkSize,
@@ -124,19 +126,19 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 
 	switch diskInode.Mode().FileType() {
 	case linux.ModeSymlink:
-		f, err := newSymlink(inode)
+		f, err := newSymlink(args)
 		if err != nil {
 			return nil, err
 		}
 		return &f.inode, nil
 	case linux.ModeRegular:
-		f, err := newRegularFile(inode)
+		f, err := newRegularFile(args)
 		if err != nil {
 			return nil, err
 		}
 		return &f.inode, nil
 	case linux.ModeDirectory:
-		f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType)
 		if err != nil {
 			return nil, err
 		}
@@ -147,6 +149,21 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	}
 }
 
+type inodeArgs struct {
+	fs        *filesystem
+	inodeNum  uint32
+	blkSize   uint64
+	diskInode disklayout.Inode
+}
+
+func (in *inode) init(args inodeArgs, impl interface{}) {
+	in.fs = args.fs
+	in.inodeNum = args.inodeNum
+	in.blkSize = args.blkSize
+	in.diskInode = args.diskInode
+	in.impl = impl
+}
+
 // open creates and returns a file description for the dentry passed in.
 func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
@@ -157,6 +174,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
+		fd.LockFD.Init(&in.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -168,6 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
+		fd.LockFD.Init(&in.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -178,6 +197,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
+		fd.LockFD.Init(&in.locks)
 		fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index 30135ddb0..66d14bb95 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -43,28 +44,19 @@ type regularFile struct {
 
 // newRegularFile is the regularFile constructor. It figures out what kind of
 // file this is and initializes the fileReader.
-func newRegularFile(inode inode) (*regularFile, error) {
-	regFile := regularFile{
-		inode: inode,
-	}
-
-	inodeFlags := inode.diskInode.Flags()
-
-	if inodeFlags.Extents {
-		file, err := newExtentFile(regFile)
+func newRegularFile(args inodeArgs) (*regularFile, error) {
+	if args.diskInode.Flags().Extents {
+		file, err := newExtentFile(args)
 		if err != nil {
 			return nil, err
 		}
-
-		file.regFile.inode.impl = &file.regFile
 		return &file.regFile, nil
 	}
 
-	file, err := newBlockMapFile(regFile)
+	file, err := newBlockMapFile(args)
 	if err != nil {
 		return nil, err
 	}
-	file.regFile.inode.impl = &file.regFile
 	return &file.regFile, nil
 }
 
@@ -77,6 +69,7 @@ func (in *inode) isRegular() bool {
 // vfs.FileDescriptionImpl.
 type regularFileFD struct {
 	fileDescription
+	vfs.LockFD
 
 	// off is the file offset. off is accessed using atomic memory operations.
 	off int64
@@ -157,3 +150,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
 	// TODO(b/134676337): Implement mmap(2).
 	return syserror.ENODEV
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 1447a4dc1..62efd4095 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -30,18 +30,17 @@ type symlink struct {
 
 // newSymlink is the symlink constructor. It reads out the symlink target from
 // the inode (however it might have been stored).
-func newSymlink(inode inode) (*symlink, error) {
-	var file *symlink
+func newSymlink(args inodeArgs) (*symlink, error) {
 	var link []byte
 
 	// If the symlink target is lesser than 60 bytes, its stores in inode.Data().
 	// Otherwise either extents or block maps will be used to store the link.
-	size := inode.diskInode.Size()
+	size := args.diskInode.Size()
 	if size < 60 {
-		link = inode.diskInode.Data()[:size]
+		link = args.diskInode.Data()[:size]
 	} else {
 		// Create a regular file out of this inode and read out the target.
-		regFile, err := newRegularFile(inode)
+		regFile, err := newRegularFile(args)
 		if err != nil {
 			return nil, err
 		}
@@ -52,8 +51,8 @@ func newSymlink(inode inode) (*symlink, error) {
 		}
 	}
 
-	file = &symlink{inode: inode, target: string(link)}
-	file.inode.impl = file
+	file := &symlink{target: string(link)}
+	file.inode.init(args, file)
 	return file, nil
 }
 
@@ -67,6 +66,7 @@ func (in *inode) isSymlink() bool {
 // O_PATH. For this reason most of the functions return EBADF.
 type symlinkFD struct {
 	fileDescription
+	vfs.NoLockFD
 }
 
 // Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
new file mode 100644
index 000000000..999111deb
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -0,0 +1,63 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "request_list",
+    out = "request_list.go",
+    package = "fuse",
+    prefix = "request",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Request",
+        "Linker": "*Request",
+    },
+)
+
+go_library(
+    name = "fuse",
+    srcs = [
+        "connection.go",
+        "dev.go",
+        "fusefs.go",
+        "init.go",
+        "register.go",
+        "request_list.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "fuse_test",
+    size = "small",
+    srcs = ["dev_test.go"],
+    library = ":fuse",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
new file mode 100644
index 000000000..6df2728ab
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -0,0 +1,437 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
+// reqIDStep is the increment applied to the unique ID of each new FUSE request.
+// Ordinary requests have even IDs, while interrupt IDs are odd.
+var reqIDStep uint64 = 2
+
+const (
+	// fuseDefaultMaxBackground is the default value for MaxBackground.
+	fuseDefaultMaxBackground = 12
+
+	// fuseDefaultCongestionThreshold is the default value for CongestionThreshold,
+	// and is 75% of fuseDefaultMaxBackground.
+	fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4)
+
+	// fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq.
+	fuseDefaultMaxPagesPerReq = 32
+)
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+	requestEntry
+
+	id   linux.FUSEOpID      // id mirrors hdr.Unique.
+	hdr  *linux.FUSEHeaderIn // hdr is the request header.
+	data []byte              // data is the marshalled header followed by the payload.
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+	opcode linux.FUSEOpcode    // opcode of the request being answered.
+	hdr    linux.FUSEHeaderOut // hdr is the reply header.
+	data   []byte              // data is read from offset hdr.SizeBytes() by UnmarshalPayload.
+}
+
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
+type connection struct {
+	fd *DeviceFD // fd is the /dev/fuse FD that requests are queued on.
+
+	// The following FUSE_INIT flags are currently unsupported by this implementation:
+	// - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
+	// - FUSE_EXPORT_SUPPORT
+	// - FUSE_HANDLE_KILLPRIV
+	// - FUSE_POSIX_LOCKS: requires POSIX locks
+	// - FUSE_FLOCK_LOCKS: requires POSIX locks
+	// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
+	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
+	// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
+	// - FUSE_ASYNC_DIO
+	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+
+	// initialized is set after receiving the FUSE_INIT reply.
+	// Until it's set, sending FUSE requests is suspended.
+	// Use SetInitialized() and Initialized() for atomic access.
+	initialized int32
+
+	// initializedChan is used to block requests before initialization.
+	initializedChan chan struct{}
+
+	// blocked when there are too many outstanding background requests (numBackground == maxBackground).
+	// TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
+	blocked bool
+
+	// connected (connection established) when a new FUSE file system is created.
+	// Set to false when:
+	//   umount,
+	//   connection abort,
+	//   device release.
+	connected bool
+
+	// aborted via sysfs.
+	// TODO(gvisor.dev/issue/3185): abort all queued requests.
+	aborted bool
+
+	// connInitError if FUSE_INIT encountered error (major version mismatch).
+	// Only set in INIT.
+	connInitError bool
+
+	// connInitSuccess if FUSE_INIT is successful.
+	// Only set in INIT.
+	// Used for destroy.
+	connInitSuccess bool
+
+	// TODO(gvisor.dev/issue/3185): All the queue logic is a work in progress.
+
+	// numBackground is the number of requests in the background.
+	numBackground uint16
+
+	// congestionThreshold for numBackground.
+	// Negotiated in FUSE_INIT.
+	congestionThreshold uint16
+
+	// maxBackground is the maximum value of numBackground.
+	// Block connection when it is reached.
+	// Negotiated in FUSE_INIT.
+	maxBackground uint16
+
+	// numActiveBackground is the number of background requests that have been marked as active.
+	numActiveBackground uint16
+
+	// numWaiting is the number of requests waiting for completion.
+	numWaiting uint32
+
+	// TODO(gvisor.dev/issue/3185): BgQueue
+	// some queue for background queued requests.
+
+	// bgLock protects:
+	// maxBackground, congestionThreshold, numBackground,
+	// numActiveBackground, blocked and the (future) BgQueue.
+	bgLock sync.Mutex
+
+	// maxRead is the maximum size of a read buffer in bytes.
+	maxRead uint32
+
+	// maxWrite is the maximum size of a write buffer in bytes.
+	// Negotiated in FUSE_INIT.
+	maxWrite uint32
+
+	// maxPages is the maximum number of pages for a single request to use.
+	// Negotiated in FUSE_INIT.
+	maxPages uint16
+
+	// minor version of the FUSE protocol.
+	// Negotiated and only set in INIT.
+	minor uint32
+
+	// asyncRead if read pages asynchronously.
+	// Negotiated and only set in INIT.
+	asyncRead bool
+
+	// abortErr is true if the kernel needs to return a unique read error after abort.
+	// Negotiated and only set in INIT.
+	abortErr bool
+
+	// writebackCache is true for write-back cache policy,
+	// false for write-through policy.
+	// Negotiated and only set in INIT.
+	writebackCache bool
+
+	// cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
+	// Negotiated and only set in INIT.
+	cacheSymlinks bool
+
+	// bigWrites if doing multi-page cached writes.
+	// Negotiated and only set in INIT.
+	bigWrites bool
+
+	// dontMask if the filesystem does not apply umask to creation modes.
+	// Negotiated in INIT.
+	dontMask bool
+}
+
+// newFUSEConnection builds the connection that talks to the server through fd.
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+	// A /dev/fuse FD only becomes usable once it has been used to mount a FUSE
+	// filesystem, so flag the device as mounted here.
+	dev := fd.Impl().(*DeviceFD)
+	dev.mounted = true
+
+	// Reset the write state; writeBuf is sized to hold one reply header.
+	dev.writeCursor = 0
+	dev.writeBuf = make([]byte, uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()))
+	dev.completions = make(map[linux.FUSEOpID]*futureResponse)
+	dev.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+
+	conn := &connection{
+		fd:                  dev,
+		connected:           true,
+		initializedChan:     make(chan struct{}),
+		maxBackground:       fuseDefaultMaxBackground,
+		congestionThreshold: fuseDefaultCongestionThreshold,
+		maxPages:            fuseDefaultMaxPagesPerReq,
+	}
+	return conn, nil
+}
+
+// SetInitialized marks the connection as initialized and releases every task
+// that is blocked on initializedChan.
+func (conn *connection) SetInitialized() {
+	// Order matters: the channel is closed before the flag is stored. That
+	// way nobody can observe initialized != 0 while tasks are still parked
+	// on the channel, and tasks arriving later do not get an unnecessarily
+	// higher chance of being issued before the ones already blocked.
+
+	// Unblock the requests sent before INIT.
+	close(conn.initializedChan)
+
+	atomic.StoreInt32(&conn.initialized, 1)
+}
+
+// Initialized atomically reports whether the connection has been initialized.
+// It pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+	return 0 != atomic.LoadInt32(&conn.initialized)
+}
+
+// NewRequest builds a request for opcode, addressed to inode ino, that can be
+// sent to the FUSE server. The request is assigned the next unique ID.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+	dev := conn.fd
+	dev.mu.Lock()
+	defer dev.mu.Unlock()
+
+	dev.nextOpID += linux.FUSEOpID(reqIDStep)
+	unique := dev.nextOpID
+
+	inHdrSize := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+	hdr := &linux.FUSEHeaderIn{
+		Len:    uint32(inHdrSize + payload.SizeBytes()),
+		Opcode: opcode,
+		Unique: unique,
+		NodeID: ino,
+		UID:    uint32(creds.EffectiveKUID),
+		GID:    uint32(creds.EffectiveKGID),
+		PID:    pid,
+	}
+
+	// The request bytes are the header immediately followed by the payload.
+	wire := make([]byte, hdr.Len)
+	hdr.MarshalUnsafe(wire[:inHdrSize])
+	payload.MarshalUnsafe(wire[inHdrSize:])
+	return &Request{id: unique, hdr: hdr, data: wire}, nil
+}
+
+// Call sends r to the server and blocks the invoking task t until the server
+// replies. t must never be nil. Nothing is sent before the connection has
+// been initialized; until then the task waits on initializedChan.
+// Use CallAsync() for async tasks.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
+	if conn.Initialized() {
+		return conn.call(t, r)
+	}
+	// Not initialized yet: block until initializedChan is closed.
+	if err := t.Block(conn.initializedChan); err != nil {
+		return nil, err
+	}
+	return conn.call(t, r)
+}
+
+// CallAsync issues r as an async (aka background) request. Such requests
+// either expect no response (e.g. release) or have their response handled by
+// someone else (e.g. init). Examples: init, release, forget, aio, interrupt.
+// It returns immediately unless the connection is still uninitialized, in
+// which case the task blocks first. A FUSE_INIT request is never blocked
+// before initialization.
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+	mustWait := !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT
+	if mustWait {
+		if err := t.Block(conn.initializedChan); err != nil {
+			return err
+		}
+	}
+
+	// This must remain the only place that invokes call() with a nil task.
+	_, err := conn.call(nil, r)
+	return err
+}
+
+// call performs the request without the initialization wait that Call and
+// CallAsync apply; it only rejects connections that cannot be used.
+func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
+	switch {
+	case !conn.connected:
+		return nil, syserror.ENOTCONN
+	case conn.connInitError:
+		return nil, syserror.ECONNREFUSED
+	}
+
+	// Queue the request, then resolve its future on behalf of t.
+	pending, err := conn.callFuture(t, r)
+	if err != nil {
+		return nil, err
+	}
+	return pending.resolve(t)
+}
+
+// Error converts the reply header's Error field into a Go error. Negative
+// values are negated into a syscall.Errno; anything else means success.
+func (r *Response) Error() error {
+	if code := r.hdr.Error; code < 0 {
+		// Flip the sign back to obtain the errno.
+		return syscall.Errno(-code)
+	}
+
+	return nil
+}
+
+// UnmarshalPayload unmarshals the response data into m; a reply shorter than its header has no payload.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+	hdrLen := r.hdr.SizeBytes()
+	haveDataLen := uint32(0)
+	if r.hdr.Len > uint32(hdrLen) { // hdr.Len is server-supplied; don't let the subtraction wrap.
+		haveDataLen = r.hdr.Len - uint32(hdrLen)
+	}
+	if wantDataLen := uint32(m.SizeBytes()); haveDataLen < wantDataLen {
+		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
+	}
+	m.UnmarshalUnsafe(r.data[hdrLen:])
+	return nil
+}
+
+// callFuture queues r for the server and returns a future for its response.
+// Call resolve() on the future when the response needs to be fulfilled.
+func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+
+	// Is the queue full?
+	//
+	// If so, wait for room and then re-check. fd.mu is dropped while the task
+	// blocks on fd.fullQueueCh, so between being signalled and re-acquiring
+	// the lock a barging task may enter and queue a request of its own. That
+	// is why the condition is evaluated in a loop rather than once: room
+	// must be confirmed again every time the lock is re-acquired.
+	//
+	// This can potentially starve a request forever but this can only happen
+	// if there are always too many ongoing requests all the time. The
+	// supported maxActiveRequests setting should be really high to avoid this.
+	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+		if t == nil {
+			// There is no task that could block on the channel, so error out.
+			return nil, errors.New("FUSE request queue full")
+		}
+
+		log.Infof("Blocking request %v from being queued. Too many active requests: %v",
+			r.id, conn.fd.numActiveRequests)
+		conn.fd.mu.Unlock()
+		err := t.Block(conn.fd.fullQueueCh)
+		conn.fd.mu.Lock()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return conn.callFutureLocked(t, r)
+}
+
+// callFutureLocked queues r and registers its future; conn.fd.mu must be held.
+func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+	dev := conn.fd
+	dev.queue.PushBack(r)
+	dev.numActiveRequests++
+	pending := newFutureResponse(r.hdr.Opcode)
+	dev.completions[r.id] = pending
+
+	// Let readers of the device know that a request is available.
+	dev.waitQueue.Notify(waiter.EventIn)
+	return pending, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+	opcode linux.FUSEOpcode     // opcode of the originating request.
+	ch     chan struct{}        // ch is closed by sendResponse once the reply is in.
+	hdr    *linux.FUSEHeaderOut // hdr is nil until a reply header has been attached.
+	data   []byte               // data is filled in by DeviceFD.writeLocked.
+}
+
+// newFutureResponse returns an unresolved future for a request with the given
+// opcode. Its channel stays open until sendResponse closes it.
+func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
+	fut := &futureResponse{opcode: opcode}
+	fut.ch = make(chan struct{})
+	return fut
+}
+
+// resolve waits, on behalf of t, for the server's reply to the request behind
+// this future and returns it as a Response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+	if t != nil {
+		// Park the task until f.ch is closed by sendResponse.
+		if err := t.Block(f.ch); err != nil {
+			return nil, err
+		}
+		return f.getResponse(), nil
+	}
+
+	// Without a Task associated with the request there is nobody to wait, so
+	// no attempt is made to resolve the response here. The task writing the
+	// response (proxy to the server) processes it on our behalf instead.
+	log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
+	return nil, nil
+}
+
+// getResponse copies the future's opcode, header and data into a new Response.
+// It dereferences f.hdr, so the header must have been attached already.
+func (f *futureResponse) getResponse() *Response {
+	resp := new(Response)
+	resp.opcode, resp.data = f.opcode, f.data
+	resp.hdr = *f.hdr
+	return resp
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
new file mode 100644
index 000000000..2225076bc
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -0,0 +1,397 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const fuseDevMinor = 229 // Minor device number of /dev/fuse (FUSE_MINOR on Linux).
+
+// fuseDevice implements vfs.Device for /dev/fuse.
+type fuseDevice struct{}
+
+// Open implements vfs.Device.Open. It fails with ENOENT unless
+// kernel.FUSEEnabled is set.
+func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if !kernel.FUSEEnabled {
+		return nil, syserror.ENOENT
+	}
+
+	dev := &DeviceFD{}
+	fdOpts := vfs.FileDescriptionOptions{UseDentryMetadata: true}
+	if err := dev.vfsfd.Init(dev, opts.Flags, mnt, vfsd, &fdOpts); err != nil {
+		return nil, err
+	}
+	return &dev.vfsfd, nil
+}
+
+// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+type DeviceFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
+	mounted bool
+
+	// nextOpID is the ID of the newest request; NewRequest advances it by reqIDStep.
+	nextOpID linux.FUSEOpID
+
+	// queue is the list of requests that need to be processed by the FUSE server.
+	queue requestList
+
+	// numActiveRequests is the number of requests made by the Sentry that have
+	// yet to be responded to.
+	numActiveRequests uint64
+
+	// completions is used to map a request to its response. A Writer will use this
+	// to notify the caller of a completed response.
+	completions map[linux.FUSEOpID]*futureResponse
+
+	writeCursor uint32 // writeCursor is how many bytes of the current reply have been written so far.
+
+	// writeBuf is the memory buffer used to copy in the FUSE out header from
+	// userspace.
+	writeBuf []byte
+
+	// writeCursorFR is the futureResponse currently being copied in from the server, if any.
+	writeCursorFR *futureResponse
+
+	// mu protects all the queues, maps, buffers and cursors and nextOpID.
+	mu sync.Mutex
+
+	// waitQueue is used to notify interested parties when the device becomes
+	// readable or writable.
+	waitQueue waiter.Queue
+
+	// fullQueueCh is a channel used to synchronize the readers with the writers.
+	// Writers (inbound requests to the filesystem) block if there are too many
+	// unprocessed in-flight requests.
+	fullQueueCh chan struct{}
+
+	// fs is the FUSE filesystem that this FD is being used for.
+	fs *filesystem
+}
+
+// Release implements vfs.FileDescriptionImpl.Release by marking the connection as no longer connected.
+func (fd *DeviceFD) Release() {
+	fd.fs.conn.connected = false // NOTE(review): dereferences fd.fs; confirm it cannot be nil here (e.g. FD closed before any mount).
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead. Positional reads always
+// fail: EPERM before a mount, ENOSYS afterwards.
+func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if fd.mounted {
+		return 0, syserror.ENOSYS
+	}
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	return 0, syserror.EPERM
+}
+
+// Read implements vfs.FileDescriptionImpl.Read. It validates the size of dst
+// and then reads from the request queue under fd.mu.
+func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	// The read buffer must be able to hold the fixed parts of any request
+	// (like Linux, we count the request header plus the FUSEWriteIn header)
+	// followed by the negotiated MaxWrite bytes of data, and it is never
+	// allowed to be smaller than FUSE_MIN_READ_BUFFER.
+	inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+	fixedLen := inHdrLen + uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
+	required := linux.FUSE_MIN_READ_BUFFER
+	if negotiated := fixedLen + fd.fs.conn.maxWrite; negotiated > required {
+		required = negotiated
+	}
+
+	// Reject read buffers that are too small.
+	if dst.NumBytes() < int64(required) {
+		return 0, syserror.EINVAL
+	}
+
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.readLocked(ctx, dst, opts)
+}
+
+// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	if fd.queue.Empty() {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var readCursor uint32
+	var bytesRead int64
+	for {
+		req := fd.queue.Front()
+		if dst.NumBytes() < int64(req.hdr.Len) {
+			// The request is too large. Cannot process it. All requests must be smaller than the
+			// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+			// handshake.
+			errno := -int32(syscall.EIO)
+			if req.hdr.Opcode == linux.FUSE_SETXATTR {
+				errno = -int32(syscall.E2BIG)
+			}
+			// We're done with this request whether or not the error can be
+			// delivered; leaving it queued would fail every later read on it.
+			fd.queue.Remove(req)
+
+			// Return the error to the calling task.
+			if err := fd.sendError(ctx, errno, req); err != nil {
+				return 0, err
+			}
+
+			// Restart the read as this request was invalid.
+			log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
+			return fd.readLocked(ctx, dst, opts)
+		}
+
+		n, err := dst.CopyOut(ctx, req.data[readCursor:])
+		if err != nil {
+			return 0, err
+		}
+		readCursor += uint32(n)
+		bytesRead += int64(n)
+
+		if readCursor >= req.hdr.Len {
+			// Fully done with this req, remove it from the queue.
+			fd.queue.Remove(req)
+			break
+		}
+	}
+
+	return bytesRead, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite. Positional writes always
+// fail: EPERM before a mount, ENOSYS afterwards.
+func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if fd.mounted {
+		return 0, syserror.ENOSYS
+	}
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	return 0, syserror.EPERM
+}
+
+// Write implements vfs.FileDescriptionImpl.Write. It takes fd.mu and defers to writeLocked.
+func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.writeLocked(ctx, src, opts)
+}
+
+// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
+// fd.writeCursor counts the bytes of the current reply consumed so far, header included.
+func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	var cn, n int64
+	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+
+	for src.NumBytes() > 0 {
+		if fd.writeCursorFR != nil {
+			// Already have common header, and we're now copying the payload.
+			wantBytes := fd.writeCursorFR.hdr.Len
+
+			// The data buffer spans the whole reply (hdr.Len bytes); allocate it on first use.
+			if fd.writeCursorFR.data == nil {
+				fd.writeCursorFR.data = make([]byte, wantBytes)
+			}
+
+			bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
+			if err != nil {
+				return 0, err
+			}
+			src = src.DropFirst(bytesCopied)
+
+			cn = int64(bytesCopied)
+			n += cn
+			fd.writeCursor += uint32(cn)
+			if fd.writeCursor == wantBytes {
+				// Done reading this full response. Clean up and unblock the
+				// initiator.
+				break
+			}
+
+			// Check if we have more data in src.
+			continue
+		}
+
+		// Assert that the header isn't read into the writeBuf yet.
+		if fd.writeCursor >= hdrLen {
+			return 0, syserror.EINVAL
+		}
+
+		// We don't have the full common response header yet. Fill the rest of
+		// writeBuf: the destination ends at hdrLen, not at the remaining count.
+		bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:hdrLen])
+		if err != nil {
+			return 0, err
+		}
+		src = src.DropFirst(bytesCopied)
+
+		cn = int64(bytesCopied)
+		n += cn
+		fd.writeCursor += uint32(cn)
+		if fd.writeCursor == hdrLen {
+			// Have full header in the writeBuf. Use it to fetch the actual futureResponse
+			// from the device's completions map.
+			var hdr linux.FUSEHeaderOut
+			hdr.UnmarshalBytes(fd.writeBuf)
+
+			// hdr.Len is server-supplied; below hdrLen it would break the
+			// payload slicing above. Reset the cursor so later writes still work.
+			if hdr.Len < hdrLen {
+				fd.writeCursor = 0
+				return 0, syserror.EINVAL
+			}
+
+			fut, ok := fd.completions[hdr.Unique]
+			if !ok {
+				// Server sent us a response for a request we never sent?
+				fd.writeCursor = 0
+				return 0, syserror.EINVAL
+			}
+			delete(fd.completions, hdr.Unique)
+
+			// Copy over the header into the future response. The rest of the payload
+			// will be copied over to the FR's data in the next iteration.
+			fut.hdr = &hdr
+			fd.writeCursorFR = fut
+		}
+	}
+
+	if fd.writeCursorFR != nil {
+		if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil {
+			return 0, err
+		}
+
+		// Ready the device for the next request.
+		fd.writeCursorFR = nil
+		fd.writeCursor = 0
+	}
+
+	return n, nil
+}
+
+// Readiness implements vfs.FileDescriptionImpl.Readiness. The FD always
+// reports writable, and reports readable while requests are queued.
+func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// FD is always writable.
+	ready := waiter.EventMask(waiter.EventOut)
+	if !fd.queue.Empty() {
+		// Requests are pending, so the FD is readable too.
+		ready |= waiter.EventIn
+	}
+	return ready & mask
+}
+
+// EventRegister implements waiter.Waitable.EventRegister by forwarding to fd.waitQueue.
+func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.waitQueue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister by forwarding to fd.waitQueue.
+func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
+	fd.waitQueue.EventUnregister(e)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. Seeking always fails: EPERM
+// before a mount, ENOSYS afterwards.
+func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if fd.mounted {
+		return 0, syserror.ENOSYS
+	}
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	return 0, syserror.EPERM
+}
+
+// sendResponse delivers a completed future: it runs any action required when
+// no task is waiting, frees a request slot and wakes the waiting task (if any).
+func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
+	// The future has just been fully written, so getResponse yields a
+	// populated response for the no-receiver check.
+	resp := fut.getResponse()
+	if err := fd.noReceiverAction(ctx, resp); err != nil {
+		return err
+	}
+
+	// Signal (without blocking) that the queue is no longer full.
+	select {
+	case fd.fullQueueCh <- struct{}{}:
+	default:
+	}
+	fd.numActiveRequests--
+
+	close(fut.ch) // Signal the task waiting on a response.
+	return nil
+}
+
+// sendError completes req with a header-only reply carrying errno, and
+// delivers it to the waiting task (if any). It returns EINVAL when req has
+// no registered completion.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+	unique := req.hdr.Unique
+
+	// Look up, and unregister, the future registered for this request.
+	fut, found := fd.completions[unique]
+	if !found {
+		// There is no completion for this request ID.
+		return syserror.EINVAL
+	}
+	delete(fd.completions, unique)
+
+	// Synthesize the reply header that reports the error to the caller.
+	// It has no payload, so Len is just the size of the header itself.
+	fut.hdr = &linux.FUSEHeaderOut{
+		Len:    uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()),
+		Error:  errno,
+		Unique: unique,
+	}
+
+	return fd.sendResponse(ctx, fut)
+}
+
+// noReceiverAction lets the calling kernel.Task act on a response when it is
+// known that no receiver will wait on the future's channel. Currently only
+// FUSE_INIT replies need this.
+func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
+	if r.opcode != linux.FUSE_INIT {
+		return nil
+	}
+
+	creds := auth.CredentialsFromContext(ctx)
+	hasSysAdmin := creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, kernel.KernelFromContext(ctx).RootUserNamespace())
+	return fd.fs.conn.InitRecv(r, hasSysAdmin)
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
new file mode 100644
index 000000000..84c222ad6
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// echoTestOpcode is the opcode used by requests in these tests. The test
+// server echoes the payload of such a request back in its response.
+const echoTestOpcode linux.FUSEOpcode = 1000
+
+type testPayload struct {
+	data uint32 // Value sent by a test client and echoed back by the test server.
+}
+
+// TestFUSECommunication runs echo round trips between concurrent test clients and
+// test servers to check the communication layer between the Sentry and a FUSE server.
+func TestFUSECommunication(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	creds := auth.CredentialsFromContext(s.Ctx)
+
+	// Test cases vary the number of concurrent clients and servers and the request capacity.
+	testCases := []struct {
+		Name              string
+		NumClients        int
+		NumServers        int
+		MaxActiveRequests uint64
+	}{
+		{
+			Name:              "SingleClientSingleServer",
+			NumClients:        1,
+			NumServers:        1,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "SingleClientMultipleServers",
+			NumClients:        1,
+			NumServers:        10,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "MultipleClientsSingleServer",
+			NumClients:        10,
+			NumServers:        1,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "MultipleClientsMultipleServers",
+			NumClients:        10,
+			NumServers:        10,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "RequestCapacityFull",
+			NumClients:        10,
+			NumServers:        1,
+			MaxActiveRequests: 1,
+		},
+		{
+			Name:              "RequestCapacityContinuouslyFull",
+			NumClients:        100,
+			NumServers:        2,
+			MaxActiveRequests: 2,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.Name, func(t *testing.T) {
+			conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests)
+			if err != nil {
+				t.Fatalf("newTestConnection: %v", err)
+			}
+
+			clientsDone := make([]chan struct{}, testCase.NumClients)
+			serversDone := make([]chan struct{}, testCase.NumServers)
+			serversKill := make([]chan struct{}, testCase.NumServers)
+
+			// FUSE clients. Each one issues a single echo request and signals clientsDone[i].
+			for i := 0; i < testCase.NumClients; i++ {
+				clientsDone[i] = make(chan struct{})
+				go func(i int) {
+					fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i])
+				}(i)
+			}
+
+			// FUSE servers. Each one echoes requests back until it is killed.
+			for j := 0; j < testCase.NumServers; j++ {
+				serversDone[j] = make(chan struct{})
+				serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block.
+				go func(j int) {
+					fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j])
+				}(j)
+			}
+
+			// Tear down.
+			//
+			// Make sure all the clients are done.
+			for i := 0; i < testCase.NumClients; i++ {
+				<-clientsDone[i]
+			}
+
+			// Kill any server that is potentially waiting.
+			for j := 0; j < testCase.NumServers; j++ {
+				serversKill[j] <- struct{}{}
+			}
+
+			// Make sure all the servers are done.
+			for j := 0; j < testCase.NumServers; j++ {
+				<-serversDone[j]
+			}
+		})
+	}
+}
+
+// CallTest queues r for the server and blocks the invoking goroutine until
+// a server has responded to it. It never blocks a kernel.Task, which makes
+// it the test analogue of Connection.Call.
+// The i parameter is unused.
+func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) {
+	conn.fd.mu.Lock()
+
+	// Wait until we're certain that a new request can be processed. The
+	// lock is dropped while waiting for the queue-not-full signal, and the
+	// count is checked again once the lock has been re-acquired.
+	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+		conn.fd.mu.Unlock()
+		<-conn.fd.fullQueueCh
+		conn.fd.mu.Lock()
+	}
+
+	fut, err := conn.callFutureLocked(t, r)
+	conn.fd.mu.Unlock()
+
+	if err != nil {
+		return nil, err
+	}
+
+	// Resolve the response.
+	//
+	// Receive from the future's channel directly instead of blocking
+	// through a task.
+	<-fut.ch
+
+	// A response is ready. Resolve and return it.
+	return fut.getResponse(), nil
+}
+
+// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE
+// device. However, it does not block the calling task; it waits on a
+// channel instead. The second result is true when killServer fired while
+// waiting, in which case no bytes are reported.
+func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) {
+	var err error
+	var n, total int64
+
+	dev := fd.Impl().(*DeviceFD)
+
+	// Register for notifications; unregister on every return path,
+	// including the early return taken when the server is killed.
+	w, ch := waiter.NewChannelEntry(nil)
+	dev.EventRegister(&w, waiter.EventIn)
+	defer dev.EventUnregister(&w)
+	for {
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{})
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		select {
+		case <-ch:
+		case <-killServer:
+			// Server killed by the main program.
+			return 0, true, nil
+		}
+	}
+
+	return total, false, err
+}
+
+// fuseClientRun emulates all the actions of a normal FUSE request. It creates
+// a header, a payload, calls the server, waits for the response, and processes
+// the response.
+// It runs on a goroutine other than the test's, so failures are reported with
+// t.Errorf and a return; t.Fatal must only be called from the test goroutine.
+func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) {
+	defer func() { clientDone <- struct{}{} }()
+
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+	testObj := &testPayload{data: rand.Uint32()}
+
+	req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj)
+	if err != nil {
+		t.Errorf("NewRequest creation failed: %v", err)
+		return
+	}
+
+	// Queue up a request. Analogous to Call except it doesn't block on the task.
+	resp, err := CallTest(conn, clientTask, req, pid)
+	if err != nil {
+		t.Errorf("CallTaskNonBlock failed: %v", err)
+		return
+	}
+	if err = resp.Error(); err != nil {
+		t.Errorf("Server responded with an error: %v", err)
+		return
+	}
+
+	var respTestPayload testPayload
+	if err := resp.UnmarshalPayload(&respTestPayload); err != nil {
+		t.Errorf("Unmarshalling payload error: %v", err)
+		return
+	}
+	if resp.hdr.Unique != req.hdr.Unique {
+		t.Errorf("got response for another request. Expected response for req %v but got response for req %v", req.hdr.Unique, resp.hdr.Unique)
+		return
+	}
+	if respTestPayload.data != testObj.data {
+		t.Errorf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data)
+	}
+}
+
+// fuseServerRun creates a task and emulates all the actions of a simple FUSE server
+// that simply reads a request and echos the same struct back as a response using the
+// appropriate headers. It runs on its own goroutine, so failures are reported with
+// t.Errorf and a return; t.Fatal must only be called from the test goroutine.
+func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) {
+	defer func() { serverDone <- struct{}{} }()
+
+	// Create the tasks that the server will be using.
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	var readPayload testPayload
+	serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+
+	// Read the request.
+	for {
+		inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+		payloadLen := uint32(readPayload.SizeBytes())
+
+		// The read buffer must meet some certain size criteria.
+		buffSize := inHdrLen + payloadLen
+		if buffSize < linux.FUSE_MIN_READ_BUFFER {
+			buffSize = linux.FUSE_MIN_READ_BUFFER
+		}
+		inBuf := make([]byte, buffSize)
+		inIOseq := usermem.BytesIOSequence(inBuf)
+
+		n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer)
+		if err != nil {
+			t.Errorf("Read failed :%v", err)
+			return
+		}
+		// Server should shut down. No new requests are going to be made.
+		if serverKilled {
+			break
+		}
+		if n <= 0 {
+			t.Errorf("Read read no bytes")
+			return
+		}
+
+		var readFUSEHeaderIn linux.FUSEHeaderIn
+		readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen])
+		readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen])
+		if readFUSEHeaderIn.Opcode != echoTestOpcode {
+			t.Errorf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload)
+			return
+		}
+
+		// Write the response.
+		outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+		outBuf := make([]byte, outHdrLen+payloadLen)
+		outHeader := linux.FUSEHeaderOut{
+			Len:    outHdrLen + payloadLen,
+			Error:  0,
+			Unique: readFUSEHeaderIn.Unique,
+		}
+
+		// Echo the payload back.
+		outHeader.MarshalUnsafe(outBuf[:outHdrLen])
+		readPayload.MarshalUnsafe(outBuf[outHdrLen:])
+		outIOseq := usermem.BytesIOSequence(outBuf)
+		if _, err := fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{}); err != nil {
+			t.Errorf("Write failed :%v", err)
+			return
+		}
+	}
+}
+
+// setup boots a test kernel, registers fusefs with its VFS, and returns a
+// test system built on a new mount namespace of type "tmpfs".
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	regOpts := &vfs.RegisterFilesystemTypeOptions{AllowUserList: true, AllowUserMount: true}
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, regOpts)
+
+	// Any failure here aborts the calling test.
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection sets up a DeviceFD on an anonymous dentry and a FUSE
+// filesystem on top of it. It returns the connection the sentry side uses
+// and the file description a test server reads requests from.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, nil, err
+	}
+
+	anon := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer anon.DecRef()
+
+	fuseDev := &DeviceFD{}
+	devFD := &fuseDev.vfsfd
+	if err := devFD.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, anon.Mount(), anon.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, nil, err
+	}
+
+	fsopts := &filesystemOptions{maxActiveRequests: maxActiveRequests}
+	fs, err := NewFUSEFilesystem(system.Ctx, 0, fsopts, devFD)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return fs.conn, devFD, nil
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes; the payload is one uint32.
+func (t *testPayload) SizeBytes() int {
+	return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes by writing data into dst[:4].
+func (t *testPayload) MarshalBytes(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes by reading data from src[:4].
+func (t *testPayload) UnmarshalBytes(src []byte) {
+	t.data = usermem.ByteOrder.Uint32(src[:4])
+}
+
+// Packed implements marshal.Marshallable.Packed and always reports true.
+func (t *testPayload) Packed() bool {
+	return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe; same encoding as MarshalBytes.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe; same decoding as UnmarshalBytes.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+	t.data = usermem.ByteOrder.Uint32(src[:4])
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN. It always panics.
+func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
+	panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut. It always panics.
+func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn. It always panics.
+func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo. It always panics.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+	panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
new file mode 100644
index 000000000..200a93bbf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -0,0 +1,228 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fuse implements fusefs.
+package fuse
+
+import (
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the filesystem type name returned by FilesystemType.Name.
+const Name = "fuse"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+type filesystemOptions struct {
+	// userID specifies the numeric uid of the mount owner.
+	// This option should not be specified by the filesystem owner.
+	// It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see the man
+	// page for fuse(8).
+	userID uint32
+
+	// groupID specifies the numeric gid of the mount owner.
+	// This option should not be specified by the filesystem owner.
+	// It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see the man
+	// page for fuse(8).
+	groupID uint32
+
+	// rootMode specifies the file mode of the filesystem's root.
+	rootMode linux.FileMode
+
+	// maxActiveRequests specifies the maximum number of active requests that can
+	// exist at any time. Any further requests will block when trying to
+	// Call the server.
+	maxActiveRequests uint64
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+	devMinor uint32 // Anonymous block device minor; handed back in Release.
+
+	// conn is used for communication between the FUSE server
+	// daemon and the sentry fusefs.
+	conn *connection
+
+	// opts is the set of options the fusefs is initialized with.
+	opts *filesystemOptions
+}
+
+// Name implements vfs.FilesystemType.Name. It returns the package-level Name constant.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	var fsopts filesystemOptions
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	deviceDescriptorStr, ok := mopts["fd"]
+	if !ok {
+		log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name())
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "fd")
+
+	deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */)
+	if err != nil {
+		log.Warningf("%s.GetFilesystem: invalid fd: fd=%s", fsType.Name(), deviceDescriptorStr)
+		return nil, nil, syserror.EINVAL
+	}
+
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name())
+		return nil, nil, syserror.EINVAL
+	}
+
+	// The descriptor is caller-supplied: it must be open and must refer to a FUSE device.
+	fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor))
+	if fuseFd == nil {
+		log.Warningf("%s.GetFilesystem: fd=%d is not an open file descriptor", fsType.Name(), deviceDescriptor)
+		return nil, nil, syserror.EINVAL
+	}
+	if _, ok := fuseFd.Impl().(*DeviceFD); !ok {
+		log.Warningf("%s.GetFilesystem: fd=%d is not a FUSE device", fsType.Name(), deviceDescriptor)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Parse and set all the other supported FUSE mount options.
+	// TODO(gVisor.dev/issue/3229): Expand the supported mount options.
+	if userIDStr, ok := mopts["user_id"]; ok {
+		delete(mopts, "user_id")
+		userID, err := strconv.ParseUint(userIDStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.userID = uint32(userID)
+	}
+	if groupIDStr, ok := mopts["group_id"]; ok {
+		delete(mopts, "group_id")
+		groupID, err := strconv.ParseUint(groupIDStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.groupID = uint32(groupID)
+	}
+	fsopts.rootMode = linux.FileMode(0777)
+	if modeStr, ok := mopts["rootmode"]; ok {
+		delete(mopts, "rootmode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.rootMode = linux.FileMode(mode)
+	}
+	fsopts.maxActiveRequests = maxActiveRequestsDefault
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+	if err != nil {
+		log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
+		return nil, nil, err
+	}
+	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+
+	// Send a FUSE_INIT request to the FUSE daemon server; the call does not block.
+	if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil {
+		log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err)
+		return nil, nil, err
+	}
+
+	root := fs.newInode(creds, fsopts.rootMode)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// NewFUSEFilesystem creates a new FUSE filesystem with a fresh connection
+// and points the DeviceFD behind device back at that filesystem.
+func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+	conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+	if err != nil {
+		log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
+		return nil, syserror.EINVAL
+	}
+
+	fs := &filesystem{
+		devMinor: devMinor,
+		conn:     conn,
+		opts:     opts,
+	}
+	// device must be backed by a *DeviceFD; the assertion panics otherwise.
+	device.Impl().(*DeviceFD).fs = fs
+
+	return fs, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release: hands back the device minor, then releases kernfs.
+func (fs *filesystem) Release() {
+	fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks // Passed to the directory FD created in Open.
+
+	dentry kernfs.Dentry // Initialized with this inode in newInode.
+}
+
+// newInode creates a directory inode with mode 0755 and returns its dentry. NOTE(review): mode is unused.
+func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+	dir := &inode{}
+	dir.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	dir.dentry.Init(dir)
+	return &dir.dentry
+}
+
+// Open implements kernfs.Inode.Open by opening a generic kernfs directory FD.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	dirFD, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return dirFD.VFSFileDescription(), nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go
new file mode 100644
index 000000000..779c2bd3f
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/init.go
@@ -0,0 +1,166 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// Constants used by FUSE_INIT negotiation.
+const (
+	// fuseMaxMaxPages is the maximum value for MaxPages received in InitOut.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMaxMaxPages = 256
+
+	// fuseMaxTimeGranNs is the maximum time granularity for file time stamps, 1s.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMaxTimeGranNs = 1000000000
+
+	// fuseMinMaxWrite is the minimum value for MaxWrite.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMinMaxWrite = 4096
+
+	// fuseDefaultMaxReadahead is a temporary default for max readahead, 128 KiB.
+	fuseDefaultMaxReadahead = 131072
+
+	// fuseDefaultInitFlags holds the FUSE_INIT_IN flags sent to the daemon.
+	// TODO(gvisor.dev/issue/3199): complete the flags.
+	fuseDefaultInitFlags = linux.FUSE_MAX_PAGES
+)
+
+// Adjustable maximums for Connection's congestion control parameters.
+// Used as the upper bound of the config values.
+// Currently we do not support adjustment to them.
+var (
+	MaxUserBackgroundRequest   uint16 = fuseDefaultMaxBackground
+	MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
+)
+
+// InitSend builds a FUSE_INIT request carrying the protocol version, max
+// readahead and init flags, and hands it to CallAsync.
+func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
+	initIn := &linux.FUSEInitIn{
+		Major: linux.FUSE_KERNEL_VERSION,
+		Minor: linux.FUSE_KERNEL_MINOR_VERSION,
+		// TODO(gvisor.dev/issue/3196): find appropriate way to calculate this
+		MaxReadahead: fuseDefaultMaxReadahead,
+		Flags:        fuseDefaultInitFlags,
+	}
+	req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, initIn)
+	if err != nil {
+		return err
+	}
+
+	// Since there is no task to block on and FUSE_INIT is the request
+	// to unblock other requests, use nil.
+	return conn.CallAsync(nil, req)
+}
+
+// InitRecv receives a FUSE_INIT reply. A reply that reports an error or cannot be
+// unmarshalled is returned as an error; otherwise it goes to initProcessReply.
+func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
+	err := res.Error()
+	if err != nil {
+		return err
+	}
+	var initOut linux.FUSEInitOut
+	if err = res.UnmarshalPayload(&initOut); err != nil {
+		return err
+	}
+	return conn.initProcessReply(&initOut, hasSysAdminCap)
+}
+
+// initProcessReply processes the FUSE_INIT reply from the FUSE server.
+func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+	// No support for old major fuse versions.
+	if out.Major != linux.FUSE_KERNEL_VERSION {
+		conn.connInitError = true
+
+		// Set the connection as initialized and unblock the blocked requests
+		// (i.e. return error for them).
+		conn.SetInitialized()
+
+		return nil
+	}
+
+	// Start processing the reply.
+	conn.connInitSuccess = true
+	conn.minor = out.Minor
+
+	// No support for limits before minor version 13.
+	if out.Minor >= 13 {
+		conn.bgLock.Lock()
+
+		if out.MaxBackground > 0 {
+			conn.maxBackground = out.MaxBackground
+
+			if !hasSysAdminCap &&
+				conn.maxBackground > MaxUserBackgroundRequest {
+				conn.maxBackground = MaxUserBackgroundRequest
+			}
+		}
+
+		if out.CongestionThreshold > 0 {
+			conn.congestionThreshold = out.CongestionThreshold
+
+			if !hasSysAdminCap &&
+				conn.congestionThreshold > MaxUserCongestionThreshold {
+				conn.congestionThreshold = MaxUserCongestionThreshold
+			}
+		}
+
+		conn.bgLock.Unlock()
+	}
+
+	// No support for the following flags before minor version 6.
+	if out.Minor >= 6 {
+		conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0
+		conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
+		conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
+		conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
+		conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
+		conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
+
+		// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
+
+		if out.Flags&linux.FUSE_MAX_PAGES != 0 {
+			maxPages := out.MaxPages
+			if maxPages < 1 {
+				maxPages = 1
+			}
+			if maxPages > fuseMaxMaxPages {
+				maxPages = fuseMaxMaxPages
+			}
+			conn.maxPages = maxPages
+		}
+	}
+
+	// No support for negotiating MaxWrite before minor version 5.
+	if out.Minor >= 5 {
+		conn.maxWrite = out.MaxWrite
+	} else {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+	if conn.maxWrite < fuseMinMaxWrite {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+
+	// Set the connection as initialized and unblock the requests
+	// issued before init.
+	conn.SetInitialized()
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go
new file mode 100644
index 000000000..b5b581152
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/register.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register registers the FUSE device with vfsObj as a character device
+// under the misc major number.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	opts := &vfs.RegisterDeviceOptions{
+		GroupName: "misc",
+	}
+
+	// The registration error, if any, is returned unchanged.
+	return vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, opts)
+}
+
+// CreateDevtmpfsFile creates a device special file in devtmpfs. The file is
+// named "fuse" and uses the FUSE character device numbers.
+func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error {
+	const mode = 0666
+
+	// The creation error, if any, is returned unchanged.
+	return dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, mode)
+}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 67e916525..4a800dcf9 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -35,6 +35,7 @@ go_library(
         "fstree.go",
         "gofer.go",
         "handle.go",
+        "host_named_pipe.go",
         "p9file.go",
         "regular_file.go",
         "socket.go",
@@ -47,11 +48,13 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fd",
+        "//pkg/fdnotifier",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/host",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
@@ -71,6 +74,7 @@ go_library(
         "//pkg/unet",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index b98218753..8c7c8e1b3 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -85,6 +85,7 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 	d2 := &dentry{
 		refs:      1, // held by d
 		fs:        d.fs,
+		ino:       d.fs.nextSyntheticIno(),
 		mode:      uint32(opts.mode),
 		uid:       uint32(opts.kuid),
 		gid:       uint32(opts.kgid),
@@ -138,6 +139,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		fd.dirents = ds
 	}
 
+	d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
 	if d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
@@ -183,13 +185,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 		{
 			Name:    ".",
 			Type:    linux.DT_DIR,
-			Ino:     d.ino,
+			Ino:     uint64(d.ino),
 			NextOff: 1,
 		},
 		{
 			Name:    "..",
 			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
-			Ino:     parent.ino,
+			Ino:     uint64(parent.ino),
 			NextOff: 2,
 		},
 	}
@@ -225,7 +227,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 				}
 				dirent := vfs.Dirent{
 					Name:    p9d.Name,
-					Ino:     p9d.QID.Path,
+					Ino:     uint64(inoFromPath(p9d.QID.Path)),
 					NextOff: int64(len(dirents) + 1),
 				}
 				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
@@ -258,7 +260,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 			dirents = append(dirents, vfs.Dirent{
 				Name:    child.name,
 				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
-				Ino:     child.ino,
+				Ino:     uint64(child.ino),
 				NextOff: int64(len(dirents) + 1),
 			})
 		}
@@ -299,3 +301,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 		return 0, syserror.EINVAL
 	}
 }
+
+// Sync implements vfs.FileDescriptionImpl.Sync by syncing the dentry's handle.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+	return fd.dentry().handle.sync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 7f2181216..00e3c99cd 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -16,6 +16,7 @@ package gofer
 
 import (
 	"sync"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -118,7 +119,7 @@ func putDentrySlice(ds *[]*dentry) {
 // must be up to date.
 //
 // Postconditions: The returned dentry's cached metadata is up to date.
-func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
@@ -149,11 +150,9 @@ afterSymlink:
 			return nil, err
 		}
 		if d != d.parent && !d.cachedMetadataAuthoritative() {
-			_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
-			if err != nil {
+			if err := d.parent.updateFromGetattr(ctx); err != nil {
 				return nil, err
 			}
-			d.parent.updateFromP9Attrs(attrMask, &attr)
 		}
 		rp.Advance()
 		return d.parent, nil
@@ -168,7 +167,7 @@ afterSymlink:
 	if err := rp.CheckMount(&child.vfsd); err != nil {
 		return nil, err
 	}
-	if child.isSymlink() && rp.ShouldFollowSymlink() {
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
@@ -208,18 +207,28 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 
 // Preconditions: As for getChildLocked. !parent.isSynthetic().
 func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
+	if child != nil {
+		// Need to lock child.metadataMu because we might be updating child
+		// metadata. We need to hold the lock *before* getting metadata from the
+		// server and release it after updating local metadata.
+		child.metadataMu.Lock()
+	}
 	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
 	if err != nil && err != syserror.ENOENT {
+		if child != nil {
+			child.metadataMu.Unlock()
+		}
 		return nil, err
 	}
 	if child != nil {
-		if !file.isNil() && qid.Path == child.ino {
-			// The file at this path hasn't changed. Just update cached
-			// metadata.
+		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+			// The file at this path hasn't changed. Just update cached metadata.
 			file.close(ctx)
-			child.updateFromP9Attrs(attrMask, &attr)
+			child.updateFromP9AttrsLocked(attrMask, &attr)
+			child.metadataMu.Unlock()
 			return child, nil
 		}
+		child.metadataMu.Unlock()
 		if file.isNil() && child.isSynthetic() {
 			// We have a synthetic file, and no remote file has arisen to
 			// replace it.
@@ -275,7 +284,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
-		next, err := fs.stepLocked(ctx, rp, d, ds)
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
 		d.dirMu.Unlock()
 		if err != nil {
 			return nil, err
@@ -301,7 +310,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 	}
 	for !rp.Done() {
 		d.dirMu.Lock()
-		next, err := fs.stepLocked(ctx, rp, d, ds)
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
 		d.dirMu.Unlock()
 		if err != nil {
 			return nil, err
@@ -371,17 +380,33 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		}
 		parent.touchCMtime()
 		parent.dirents = nil
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 		return nil
 	}
 	if fs.opts.interop == InteropModeShared {
-		// The existence of a dentry at name would be inconclusive because the
-		// file it represents may have been deleted from the remote filesystem,
-		// so we would need to make an RPC to revalidate the dentry. Just
-		// attempt the file creation RPC instead. If a file does exist, the RPC
-		// will fail with EEXIST like we would have. If the RPC succeeds, and a
-		// stale dentry exists, the dentry will fail revalidation next time
-		// it's used.
-		return createInRemoteDir(parent, name)
+		if child := parent.children[name]; child != nil && child.isSynthetic() {
+			return syserror.EEXIST
+		}
+		// The existence of a non-synthetic dentry at name would be inconclusive
+		// because the file it represents may have been deleted from the remote
+		// filesystem, so we would need to make an RPC to revalidate the dentry.
+		// Just attempt the file creation RPC instead. If a file does exist, the
+		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
+		// stale dentry exists, the dentry will fail revalidation next time it's
+		// used.
+		if err := createInRemoteDir(parent, name); err != nil {
+			return err
+		}
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+		return nil
 	}
 	if child := parent.children[name]; child != nil {
 		return syserror.EEXIST
@@ -397,6 +422,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	}
 	parent.touchCMtime()
 	parent.dirents = nil
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 	return nil
 }
 
@@ -443,21 +473,61 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	defer mntns.DecRef()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
+
 	child, ok := parent.children[name]
 	if ok && child == nil {
 		return syserror.ENOENT
 	}
-	// We only need a dentry representing the file at name if it can be a mount
-	// point. If child is nil, then it can't be a mount point. If child is
-	// non-nil but stale, the actual file can't be a mount point either; we
-	// detect this case by just speculatively calling PrepareDeleteDentry and
-	// only revalidating the dentry if that fails (indicating that the existing
-	// dentry is a mount point).
+
+	sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
+	if sticky {
+		if !ok {
+			// If the sticky bit is set, we need to retrieve the child to determine
+			// whether removing it is allowed.
+			child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+			if err != nil {
+				return err
+			}
+		} else if child != nil && !child.cachedMetadataAuthoritative() {
+			// Make sure the dentry representing the file at name is up to date
+			// before examining its metadata.
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+			if err != nil {
+				return err
+			}
+		}
+		if err := parent.mayDelete(rp.Credentials(), child); err != nil {
+			return err
+		}
+	}
+
+	// If a child dentry exists, prepare to delete it. This should fail if it is
+	// a mount point. We detect mount points by speculatively calling
+	// PrepareDeleteDentry, which fails if child is a mount point. However, we
+	// may need to revalidate the file in this case to make sure that it has not
+	// been deleted or replaced on the remote fs, in which case the mount point
+	// will have disappeared. If calling PrepareDeleteDentry fails again on the
+	// up-to-date dentry, we can be sure that it is a mount point.
+	//
+	// Also note that if child is nil, then it can't be a mount point.
 	if child != nil {
+		// Hold child.dirMu so we can check child.children and
+		// child.syntheticChildren. We don't access these fields until a bit later,
+		// but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
+		// create an inconsistent lock ordering between dentry.dirMu and
+		// vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
+		// FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
+		// PrepareDeleteDentry and CommitDeleteDentry"). To avoid this, lock
+		// child.dirMu before calling PrepareDeleteDentry.
 		child.dirMu.Lock()
 		defer child.dirMu.Unlock()
 		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
-			if parent.cachedMetadataAuthoritative() {
+			// We can skip revalidation in several cases:
+			// - We are not in InteropModeShared
+			// - The parent directory is synthetic, in which case the child must also
+			//   be synthetic
+			// - We already updated the child during the sticky bit check above
+			if parent.cachedMetadataAuthoritative() || sticky {
 				return err
 			}
 			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
@@ -518,7 +588,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 		if child == nil {
 			return syserror.ENOENT
 		}
-	} else {
+	} else if child == nil || !child.isSynthetic() {
 		err = parent.file.unlinkAt(ctx, name, flags)
 		if err != nil {
 			if child != nil {
@@ -527,6 +597,18 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 			return err
 		}
 	}
+
+	// Generate inotify events for rmdir or unlink.
+	if dir {
+		parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+	} else {
+		var cw *vfs.Watches
+		if child != nil {
+			cw = &child.watches
+		}
+		vfs.InotifyRemoveChild(cw, &parent.watches, name)
+	}
+
 	if child != nil {
 		vfsObj.CommitDeleteDentry(&child.vfsd)
 		child.setDeleted()
@@ -754,25 +836,27 @@ afterTrailingSymlink:
 	}
 	// Determine whether or not we need to create a file.
 	parent.dirMu.Lock()
-	child, err := fs.stepLocked(ctx, rp, parent, &ds)
+	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
 	if err == syserror.ENOENT && mayCreate {
 		if parent.isSynthetic() {
 			parent.dirMu.Unlock()
 			return nil, syserror.EPERM
 		}
-		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
 		parent.dirMu.Unlock()
 		return fd, err
 	}
+	parent.dirMu.Unlock()
 	if err != nil {
-		parent.dirMu.Unlock()
 		return nil, err
 	}
-	// Open existing child or follow symlink.
-	parent.dirMu.Unlock()
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
+	if !child.isDir() && rp.MustBeDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Open existing child or follow symlink.
 	if child.isSymlink() && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
 		if err != nil {
@@ -793,20 +877,32 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
+
+	trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
+	if trunc {
+		// Lock metadataMu *while* we open a regular file with O_TRUNC because
+		// open(2) will change the file size on server.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+	}
+
+	var vfd *vfs.FileDescription
+	var err error
 	mnt := rp.Mount()
 	switch d.fileType() {
 	case linux.S_IFREG:
 		if !d.fs.opts.regularFilesUseSpecialFileFD {
-			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
 				return nil, err
 			}
 			fd := &regularFileFD{}
+			fd.LockFD.Init(&d.locks)
 			if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
 				AllowDirectIO: true,
 			}); err != nil {
 				return nil, err
 			}
-			return &fd.vfsfd, nil
+			vfd = &fd.vfsfd
 		}
 	case linux.S_IFDIR:
 		// Can't open directories with O_CREAT.
@@ -826,6 +922,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 			}
 		}
 		fd := &directoryFD{}
+		fd.LockFD.Init(&d.locks)
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -842,10 +939,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 		}
 	case linux.S_IFIFO:
 		if d.isSynthetic() {
-			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
+			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
+		}
+	}
+
+	if vfd == nil {
+		if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+			return nil, err
+		}
+	}
+
+	if trunc {
+		// If no errors occurred so far then update file size in memory. This
+		// step is required even if !d.cachedMetadataAuthoritative() because
+		// d.mappings has to be updated.
+		// d.metadataMu has already been acquired if trunc == true.
+		d.updateFileSizeLocked(0)
+
+		if d.cachedMetadataAuthoritative() {
+			d.touchCMtimeLocked()
 		}
 	}
-	return d.openSpecialFileLocked(ctx, mnt, opts)
+	return vfd, err
 }
 
 func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
@@ -873,19 +988,37 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
 	if opts.Flags&linux.O_DIRECT != 0 {
 		return nil, syserror.EINVAL
 	}
-	h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
+	// We assume that the server silently inserts O_NONBLOCK in the open flags
+	// for all named pipes (because all existing gofers do this).
+	//
+	// NOTE(b/133875563): This makes named pipe opens racy, because the
+	// mechanisms for translating nonblocking to blocking opens can only detect
+	// the instantaneous presence of a peer holding the other end of the pipe
+	// open, not whether the pipe was *previously* opened by a peer that has
+	// since closed its end.
+	isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+	h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
 	if err != nil {
+		if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+			// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+			// with ENXIO if opening the same named pipe with O_WRONLY would
+			// block because there are no readers of the pipe.
+			if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+				return nil, err
+			}
+			goto retry
+		}
 		return nil, err
 	}
-	seekable := d.fileType() == linux.S_IFREG
-	fd := &specialFileFD{
-		handle:   h,
-		seekable: seekable,
+	if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+		if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
-		DenyPRead:  !seekable,
-		DenyPWrite: !seekable,
-	}); err != nil {
+	fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+	if err != nil {
 		h.close(ctx)
 		return nil, err
 	}
@@ -894,7 +1027,7 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
 // !d.isSynthetic().
-func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
@@ -919,7 +1052,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	// Filter file creation flags and O_LARGEFILE out; the create RPC already
 	// has the semantics of O_CREAT|O_EXCL, while some servers will choke on
 	// O_LARGEFILE.
-	createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE))
+	createFlags := p9.OpenFlags(opts.Flags &^ (vfs.FileCreationFlags | linux.O_LARGEFILE))
 	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 	if err != nil {
 		dirfile.close(ctx)
@@ -947,6 +1080,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		return nil, err
 	}
+	*ds = appendDentry(*ds, child)
 	// Incorporate the fid that was opened by lcreate.
 	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
 	if useRegularFileFD {
@@ -959,10 +1093,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
 		child.handleMu.Unlock()
 	}
-	// Take a reference on the new dentry to be held by the new file
-	// description. (This reference also means that the new dentry is not
-	// eligible for caching yet, so we don't need to append to a dentry slice.)
-	child.refs = 1
 	// Insert the dentry into the tree.
 	d.cacheNewChildLocked(child, name)
 	if d.cachedMetadataAuthoritative() {
@@ -974,6 +1104,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	var childVFSFD *vfs.FileDescription
 	if useRegularFileFD {
 		fd := &regularFileFD{}
+		fd.LockFD.Init(&child.locks)
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
 			AllowDirectIO: true,
 		}); err != nil {
@@ -981,26 +1112,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		childVFSFD = &fd.vfsfd
 	} else {
-		seekable := child.fileType() == linux.S_IFREG
-		fd := &specialFileFD{
-			handle: handle{
-				file: openFile,
-				fd:   -1,
-			},
-			seekable: seekable,
+		h := handle{
+			file: openFile,
+			fd:   -1,
 		}
 		if fdobj != nil {
-			fd.handle.fd = int32(fdobj.Release())
+			h.fd = int32(fdobj.Release())
 		}
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
-			DenyPRead:  !seekable,
-			DenyPWrite: !seekable,
-		}); err != nil {
-			fd.handle.close(ctx)
+		fd, err := newSpecialFileFD(h, mnt, child, &child.locks, opts.Flags) // Locks belong to the new file, not to its parent directory d.
+		if err != nil {
+			h.close(ctx)
 			return nil, err
 		}
 		childVFSFD = &fd.vfsfd
 	}
+	d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
 	return childVFSFD, nil
 }
 
@@ -1052,7 +1178,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			return err
 		}
 	}
-	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	creds := rp.Credentials()
+	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	vfsObj := rp.VirtualFilesystem()
@@ -1067,12 +1194,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if renamed == nil {
 		return syserror.ENOENT
 	}
+	if err := oldParent.mayDelete(creds, renamed); err != nil {
+		return err
+	}
 	if renamed.isDir() {
 		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
 			return syserror.EINVAL
 		}
 		if oldParent != newParent {
-			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -1083,7 +1213,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	if oldParent != newParent {
-		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
 			return err
 		}
 		newParent.dirMu.Lock()
@@ -1181,10 +1311,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if newParent.cachedMetadataAuthoritative() {
 		newParent.dirents = nil
 		newParent.touchCMtime()
-		if renamed.isDir() {
+		if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
+			// Increase the link count if we did not replace another directory.
 			newParent.incLinks()
 		}
 	}
+	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
 	return nil
 }
 
@@ -1197,12 +1329,21 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
 		return err
 	}
-	return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount())
+	if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -1326,24 +1467,38 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
 		return err
 	}
-	return d.setxattr(ctx, rp.Credentials(), &opts)
+	if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
 		return err
 	}
-	return d.removexattr(ctx, rp.Credentials(), name)
+	if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(&ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
@@ -1352,3 +1507,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.renameMu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
+
+func (fs *filesystem) nextSyntheticIno() inodeNumber {
+	return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 6295f6b54..e20de84b5 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -45,6 +45,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -84,12 +85,6 @@ type filesystem struct {
 	// devMinor is the filesystem's minor device number. devMinor is immutable.
 	devMinor uint32
 
-	// uid and gid are the effective KUID and KGID of the filesystem's creator,
-	// and are used as the owner and group for files that don't specify one.
-	// uid and gid are immutable.
-	uid auth.KUID
-	gid auth.KGID
-
 	// renameMu serves two purposes:
 	//
 	// - It synchronizes path resolution with renaming initiated by this
@@ -115,6 +110,26 @@ type filesystem struct {
 	syncMu           sync.Mutex
 	syncableDentries map[*dentry]struct{}
 	specialFileFDs   map[*specialFileFD]struct{}
+
+	// syntheticSeq stores a counter used to generate unique inodeNumbers for
+	// synthetic dentries.
+	syntheticSeq uint64
+}
+
+// inodeNumber represents the inode number reported in Dirent.Ino. For regular
+// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
+// have their inodeNumber generated sequentially, with the MSB reserved to
+// prevent conflicts with regular dentries.
+type inodeNumber uint64
+
+// Reserve MSB for synthetic mounts.
+const syntheticInoMask = uint64(1) << 63
+
+func inoFromPath(path uint64) inodeNumber {
+	if path&syntheticInoMask != 0 {
+		log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
+	}
+	return inodeNumber(path &^ syntheticInoMask)
 }
 
 type filesystemOptions struct {
@@ -122,6 +137,8 @@ type filesystemOptions struct {
 	fd      int
 	aname   string
 	interop InteropMode // derived from the "cache" mount option
+	dfltuid auth.KUID
+	dfltgid auth.KGID
 	msize   uint32
 	version string
 
@@ -230,6 +247,15 @@ type InternalFilesystemOptions struct {
 	OpenSocketsByConnecting bool
 }
 
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+	// uint32(-2) doesn't work in Go.
+	_V9FS_DEFUID = auth.KUID(4294967294)
+	_V9FS_DEFGID = auth.KGID(4294967294)
+)
+
 // Name implements vfs.FilesystemType.Name.
 func (FilesystemType) Name() string {
 	return Name
@@ -315,6 +341,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	// Parse the default UID and GID.
+	fsopts.dfltuid = _V9FS_DEFUID
+	if dfltuidstr, ok := mopts["dfltuid"]; ok {
+		delete(mopts, "dfltuid")
+		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+		// in the caller's user namespace, but goferfs isn't
+		// application-mountable.
+		fsopts.dfltuid = auth.KUID(dfltuid)
+	}
+	fsopts.dfltgid = _V9FS_DEFGID
+	if dfltgidstr, ok := mopts["dfltgid"]; ok {
+		delete(mopts, "dfltgid")
+		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default GID: dfltgid=%s", dfltgidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.dfltgid = auth.KGID(dfltgid)
+	}
+
 	// Parse the 9P message size.
 	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
 	if msizestr, ok := mopts["msize"]; ok {
@@ -422,8 +473,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		client:           client,
 		clock:            ktime.RealtimeClockFromContext(ctx),
 		devMinor:         devMinor,
-		uid:              creds.EffectiveKUID,
-		gid:              creds.EffectiveKGID,
 		syncableDentries: make(map[*dentry]struct{}),
 		specialFileFDs:   make(map[*specialFileFD]struct{}),
 	}
@@ -553,21 +602,27 @@ type dentry struct {
 	// returned by the server. dirents is protected by dirMu.
 	dirents []vfs.Dirent
 
-	// Cached metadata; protected by metadataMu and accessed using atomic
-	// memory operations unless otherwise specified.
+	// Cached metadata; protected by metadataMu.
+	// To access:
+	//   - In situations where consistency is not required (like stat), these
+	//     can be accessed using atomic operations only (without locking).
+	//   - With metadataMu locked, these can be accessed without atomic operations.
+	// To mutate:
+	//   - Lock metadataMu and use atomic operations to update because we might
+	//     have atomic readers that don't hold the lock.
 	metadataMu sync.Mutex
-	ino        uint64 // immutable
-	mode       uint32 // type is immutable, perms are mutable
-	uid        uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid        uint32 // auth.KGID, but ...
-	blockSize  uint32 // 0 if unknown
+	ino        inodeNumber // immutable
+	mode       uint32      // type is immutable, perms are mutable
+	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32      // auth.KGID, but ...
+	blockSize  uint32      // 0 if unknown
 	// Timestamps, all nsecs from the Unix epoch.
 	atime int64
 	mtime int64
 	ctime int64
 	btime int64
 	// File size, protected by both metadataMu and dataMu (i.e. both must be
-	// locked to mutate it).
+	// locked to mutate it; locking either is sufficient to access it).
 	size uint64
 
 	// nlink counts the number of hard links to this dentry. It's updated and
@@ -634,6 +689,11 @@ type dentry struct {
 	// If this dentry represents a synthetic named pipe, pipe is the pipe
 	// endpoint bound to this file.
 	pipe *pipe.VFSPipe
+
+	locks vfs.FileLocks
+
+	// Inotify watches for this dentry.
+	watches vfs.Watches
 }
 
 // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
@@ -670,10 +730,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	d := &dentry{
 		fs:        fs,
 		file:      file,
-		ino:       qid.Path,
+		ino:       inoFromPath(qid.Path),
 		mode:      uint32(attr.Mode),
-		uid:       uint32(fs.uid),
-		gid:       uint32(fs.gid),
+		uid:       uint32(fs.opts.dfltuid),
+		gid:       uint32(fs.opts.dfltgid),
 		blockSize: usermem.PageSize,
 		handle: handle{
 			fd: -1,
@@ -725,8 +785,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool {
 
 // updateFromP9Attrs is called to update d's metadata after an update from the
 // remote filesystem.
-func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
-	d.metadataMu.Lock()
+// Precondition: d.metadataMu must be locked.
+func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
 	if mask.Mode {
 		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
 			d.metadataMu.Unlock()
@@ -760,11 +820,8 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
 	}
 	if mask.Size {
-		d.dataMu.Lock()
-		atomic.StoreUint64(&d.size, attr.Size)
-		d.dataMu.Unlock()
+		d.updateFileSizeLocked(attr.Size)
 	}
-	d.metadataMu.Unlock()
 }
 
 // Preconditions: !d.isSynthetic()
@@ -776,6 +833,10 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
 		file            p9file
 		handleMuRLocked bool
 	)
+	// d.metadataMu must be locked *before* we getAttr so that we do not end up
+	// updating stale attributes in d.updateFromP9AttrsLocked().
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
 	d.handleMu.RLock()
 	if !d.handle.file.isNil() {
 		file = d.handle.file
@@ -791,7 +852,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
 	if err != nil {
 		return err
 	}
-	d.updateFromP9Attrs(attrMask, &attr)
+	d.updateFromP9AttrsLocked(attrMask, &attr)
 	return nil
 }
 
@@ -803,10 +864,18 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
 	stat.Blksize = atomic.LoadUint32(&d.blockSize)
 	stat.Nlink = atomic.LoadUint32(&d.nlink)
+	if stat.Nlink == 0 {
+		// The remote filesystem doesn't support link count; just make
+		// something up. This is consistent with Linux, where
+		// fs/inode.c:inode_init_always() initializes link count to 1, and
+		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
+		// it's not provided by the remote filesystem.
+		stat.Nlink = 1
+	}
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
-	stat.Ino = d.ino
+	stat.Ino = uint64(d.ino)
 	stat.Size = atomic.LoadUint64(&d.size)
 	// This is consistent with regularFileFD.Seek(), which treats regular files
 	// as having no holes.
@@ -819,7 +888,8 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.DevMinor = d.fs.devMinor
 }
 
-func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
+	stat := &opts.Stat
 	if stat.Mask == 0 {
 		return nil
 	}
@@ -827,7 +897,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		return syserror.EPERM
 	}
 	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
-	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
@@ -844,14 +914,14 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 
 		// Prepare for truncate.
 		if stat.Mask&linux.STATX_SIZE != 0 {
-			switch d.mode & linux.S_IFMT {
-			case linux.S_IFREG:
+			switch mode.FileType() {
+			case linux.ModeRegular:
 				if !setLocalMtime {
 					// Truncate updates mtime.
 					setLocalMtime = true
 					stat.Mtime.Nsec = linux.UTIME_NOW
 				}
-			case linux.S_IFDIR:
+			case linux.ModeDirectory:
 				return syserror.EISDIR
 			default:
 				return syserror.EINVAL
@@ -860,8 +930,25 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	}
 	d.metadataMu.Lock()
 	defer d.metadataMu.Unlock()
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		// The size needs to be changed even when
+		// !d.cachedMetadataAuthoritative() because d.mappings has to be
+		// updated.
+		d.updateFileSizeLocked(stat.Size)
+	}
 	if !d.isSynthetic() {
 		if stat.Mask != 0 {
+			if stat.Mask&linux.STATX_SIZE != 0 {
+				// Check whether to allow a truncate request to be made.
+				switch d.mode & linux.S_IFMT {
+				case linux.S_IFREG:
+					// Allow.
+				case linux.S_IFDIR:
+					return syserror.EISDIR
+				default:
+					return syserror.EINVAL
+				}
+			}
 			if err := d.file.setAttr(ctx, p9.SetAttrMask{
 				Permissions:        stat.Mask&linux.STATX_MODE != 0,
 				UID:                stat.Mask&linux.STATX_UID != 0,
@@ -908,6 +995,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		} else {
 			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
 		}
+		// Restore mask bits that we cleared earlier.
+		stat.Mask |= linux.STATX_ATIME
 	}
 	if setLocalMtime {
 		if stat.Mtime.Nsec == linux.UTIME_NOW {
@@ -915,48 +1004,56 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		} else {
 			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
 		}
+		// Restore mask bits that we cleared earlier.
+		stat.Mask |= linux.STATX_MTIME
 	}
 	atomic.StoreInt64(&d.ctime, now)
-	if stat.Mask&linux.STATX_SIZE != 0 {
+	return nil
+}
+
+// Preconditions: d.metadataMu must be locked.
+func (d *dentry) updateFileSizeLocked(newSize uint64) {
+	d.dataMu.Lock()
+	oldSize := d.size
+	atomic.StoreUint64(&d.size, newSize)
+	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+	// below. This allows concurrent calls to Read/Translate/etc. These
+	// functions synchronize with truncation by refusing to use cache
+	// contents beyond the new d.size. (We are still holding d.metadataMu,
+	// so we can't race with Write or another truncate.)
+	d.dataMu.Unlock()
+	if d.size < oldSize {
+		oldpgend, _ := usermem.PageRoundUp(oldSize)
+		newpgend, _ := usermem.PageRoundUp(d.size)
+		if oldpgend != newpgend {
+			d.mapsMu.Lock()
+			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+				// Compare Linux's mm/truncate.c:truncate_setsize() =>
+				// truncate_pagecache() =>
+				// mm/memory.c:unmap_mapping_range(evencows=1).
+				InvalidatePrivate: true,
+			})
+			d.mapsMu.Unlock()
+		}
+		// We are now guaranteed that there are no translations of
+		// truncated pages, and can remove them from the cache. Since
+		// truncated pages have been removed from the remote file, they
+		// should be dropped without being written back.
 		d.dataMu.Lock()
-		oldSize := d.size
-		d.size = stat.Size
-		// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
-		// below. This allows concurrent calls to Read/Translate/etc. These
-		// functions synchronize with truncation by refusing to use cache
-		// contents beyond the new d.size. (We are still holding d.metadataMu,
-		// so we can't race with Write or another truncate.)
+		d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+		d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
 		d.dataMu.Unlock()
-		if d.size < oldSize {
-			oldpgend, _ := usermem.PageRoundUp(oldSize)
-			newpgend, _ := usermem.PageRoundUp(d.size)
-			if oldpgend != newpgend {
-				d.mapsMu.Lock()
-				d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
-					// Compare Linux's mm/truncate.c:truncate_setsize() =>
-					// truncate_pagecache() =>
-					// mm/memory.c:unmap_mapping_range(evencows=1).
-					InvalidatePrivate: true,
-				})
-				d.mapsMu.Unlock()
-			}
-			// We are now guaranteed that there are no translations of
-			// truncated pages, and can remove them from the cache. Since
-			// truncated pages have been removed from the remote file, they
-			// should be dropped without being written back.
-			d.dataMu.Lock()
-			d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
-			d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
-			d.dataMu.Unlock()
-		}
 	}
-	return nil
 }
 
 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
+	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
+}
+
 func dentryUIDFromP9UID(uid p9.UID) uint32 {
 	if !uid.Ok() {
 		return uint32(auth.OverflowUID)
@@ -1011,6 +1108,37 @@ func (d *dentry) decRefLocked() {
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {
+	if d.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	d.fs.renameMu.RLock()
+	// The ordering below is important: Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted())
+	}
+	d.watches.Notify("", events, cookie, et, d.isDeleted())
+	d.fs.renameMu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// If no watches are left on this dentry and it has no references, cache it.
+func (d *dentry) OnZeroWatches() {
+	if atomic.LoadInt64(&d.refs) == 0 {
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked()
+		d.fs.renameMu.Unlock()
+	}
+}
+
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
@@ -1042,6 +1170,9 @@ func (d *dentry) checkCachingLocked() {
 	// Deleted and invalidated dentries with zero references are no longer
 	// reachable by path resolution and should be dropped immediately.
 	if d.vfsd.IsDead() {
+		if d.isDeleted() {
+			d.watches.HandleDeletion()
+		}
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -1050,6 +1181,14 @@ func (d *dentry) checkCachingLocked() {
 		d.destroyLocked()
 		return
 	}
+	// If d still has inotify watches and it is not deleted or invalidated, we
+	// cannot cache it and allow it to be evicted. Otherwise, we will lose its
+	// watches, even if a new dentry is created for the same file in the future.
+	// Note that the size of d.watches cannot concurrently transition from zero
+	// to non-zero, because adding a watch requires holding a reference on d.
+	if d.watches.Size() > 0 {
+		return
+	}
 	// If d is already cached, just move it to the front of the LRU.
 	if d.cached {
 		d.fs.cachedDentries.Remove(d)
@@ -1155,7 +1294,7 @@ func (d *dentry) setDeleted() {
 // We only support xattrs prefixed with "user." (see b/148380782). Currently,
 // there is no need to expose any other xattrs through a gofer.
 func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
-	if d.file.isNil() {
+	if d.file.isNil() || !d.userXattrSupported() {
 		return nil, nil
 	}
 	xattrMap, err := d.file.listXattr(ctx, size)
@@ -1181,6 +1320,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
 		return "", syserror.EOPNOTSUPP
 	}
+	if !d.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
 	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
@@ -1194,6 +1336,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
 		return syserror.EOPNOTSUPP
 	}
+	if !d.userXattrSupported() {
+		return syserror.EPERM
+	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
@@ -1207,10 +1352,20 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
 		return syserror.EOPNOTSUPP
 	}
+	if !d.userXattrSupported() {
+		return syserror.EPERM
+	}
 	return d.file.removeXattr(ctx, name)
 }
 
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory().
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (d *dentry) userXattrSupported() bool {
+	filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
+	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
+}
+
+// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
@@ -1302,23 +1457,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 }
 
 // incLinks increments link count.
-//
-// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
 func (d *dentry) incLinks() {
-	v := atomic.AddUint32(&d.nlink, 1)
-	if v < 2 {
-		panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+	if atomic.LoadUint32(&d.nlink) == 0 {
+		// The remote filesystem doesn't support link count.
+		return
 	}
+	atomic.AddUint32(&d.nlink, 1)
 }
 
 // decLinks decrements link count.
-//
-// Preconditions: d.nlink > 1.
 func (d *dentry) decLinks() {
-	v := atomic.AddUint32(&d.nlink, ^uint32(0))
-	if v == 0 {
-		panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+	if atomic.LoadUint32(&d.nlink) == 0 {
+		// The remote filesystem doesn't support link count.
+		return
 	}
+	atomic.AddUint32(&d.nlink, ^uint32(0))
 }
 
 // fileDescription is embedded by gofer implementations of
@@ -1326,6 +1479,9 @@ func (d *dentry) decLinks() {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	lockLogging sync.Once
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -1354,7 +1510,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount())
+	if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
+		return err
+	}
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -1369,10 +1531,41 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+	d := fd.dentry()
+	if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
+	d := fd.dentry()
+	if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("File lock using gofer file handled internally.")
+	})
+	return fd.LockFD.LockBSD(ctx, uid, t, block)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("Range lock using gofer file handled internally.")
+	})
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 724a3f1f7..8792ca4f2 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -126,11 +126,16 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
 }
 
 func (h *handle) sync(ctx context.Context) error {
+	// Handle most common case first.
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
 		err := syscall.Fsync(int(h.fd))
 		ctx.UninterruptibleSleepFinish(false)
 		return err
 	}
+	if h.file.isNil() {
+		// File hasn't been touched, there is nothing to sync.
+		return nil
+	}
 	return h.file.fsync(ctx)
 }
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+	tempPipeMu      sync.Mutex
+	tempPipeReadFD  int
+	tempPipeWriteFD int
+	tempPipeBuf     [1]byte
+)
+
+func init() {
+	var pipeFDs [2]int
+	if err := unix.Pipe(pipeFDs[:]); err != nil {
+		panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+	}
+	tempPipeReadFD = pipeFDs[0]
+	tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+	for {
+		ok, err := nonblockingPipeHasWriter(fd)
+		if err != nil {
+			return err
+		}
+		if ok {
+			return nil
+		}
+		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+			return err
+		}
+	}
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+	tempPipeMu.Lock()
+	defer tempPipeMu.Unlock()
+	// Copy 1 byte from fd into the temporary pipe.
+	n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+	if err == syserror.EAGAIN {
+		// The pipe represented by fd is empty, but has a writer.
+		return true, nil
+	}
+	if err != nil {
+		return false, err
+	}
+	if n == 0 {
+		// The pipe represented by fd is empty and has no writer.
+		return false, nil
+	}
+	// The pipe represented by fd is non-empty, so it either has, or has
+	// previously had, a writer. Remove the byte copied to the temporary pipe
+	// before returning.
+	if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+		panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+	}
+	return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+	t := time.NewTimer(100 * time.Millisecond)
+	defer t.Stop()
+	cancel := ctx.SleepStart()
+	select {
+	case <-t.C:
+		ctx.SleepFinish(true)
+		return nil
+	case <-cancel:
+		ctx.SleepFinish(false)
+		return syserror.ErrInterrupted
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 0d10cf7ac..09f142cfc 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -24,11 +24,11 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -67,12 +67,46 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 	return d.handle.file.flush(ctx)
 }
 
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+
+	d := fd.dentry()
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+
+	size := offset + length
+
+	// Allocating a smaller size is a noop.
+	if size <= d.size {
+		return nil
+	}
+
+	d.handleMu.Lock()
+	defer d.handleMu.Unlock()
+
+	err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+	if err != nil {
+		return err
+	}
+	d.dataMu.Lock()
+	atomic.StoreUint64(&d.size, size)
+	d.dataMu.Unlock()
+	if !d.cachedMetadataAuthoritative() {
+		d.touchCMtimeLocked()
+	}
+	return nil
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
@@ -120,21 +154,53 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
 	if offset < 0 {
-		return 0, syserror.EINVAL
+		return 0, offset, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
-		return 0, syserror.EOPNOTSUPP
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	// If the fd was opened with O_APPEND, make sure the file size is updated.
+	// There is a possible race here if size is modified externally after
+	// metadata cache is updated.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return 0, offset, err
+		}
+	}
+
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	// Set offset to file size if the fd was opened with O_APPEND.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Holding d.metadataMu is sufficient for reading d.size.
+		offset = int64(d.size)
 	}
 	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
 	if err != nil {
-		return 0, err
+		return 0, offset, err
 	}
 	src = src.TakeFirst64(limit)
+	n, err := fd.pwriteLocked(ctx, src, offset, opts)
+	return n, offset + n, err
+}
 
+// Preconditions: fd.dentry().metadataMu must be locked.
+func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	d := fd.dentry()
-	d.metadataMu.Lock()
-	defer d.metadataMu.Unlock()
 	if d.fs.opts.interop != InteropModeShared {
 		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
 		// file_update_time(). This is d.touchCMtime(), but without locking
@@ -154,12 +220,12 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 			return 0, syserror.EINVAL
 		}
 		mr := memmap.MappableRange{pgstart, pgend}
-		var freed []platform.FileRange
+		var freed []memmap.FileRange
 		d.dataMu.Lock()
 		cseg := d.cache.LowerBoundSegment(mr.Start)
 		for cseg.Ok() && cseg.Start() < mr.End {
 			cseg = d.cache.Isolate(cseg, mr)
-			freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+			freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
 			cseg = d.cache.Remove(cseg).NextSegment()
 		}
 		d.dataMu.Unlock()
@@ -197,8 +263,8 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	fd.mu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
 	fd.mu.Unlock()
 	return n, err
 }
@@ -489,15 +555,24 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
 func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
+	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+	if err != nil {
+		return 0, err
+	}
+	fd.off = newOffset
+	return newOffset, nil
+}
+
+// Calculate the new offset for a seek operation on a regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
 	switch whence {
 	case linux.SEEK_SET:
 		// Use offset as specified.
 	case linux.SEEK_CUR:
-		offset += fd.off
+		offset += fdOffset
 	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
 		// Ensure file size is up to date.
-		d := fd.dentry()
-		if fd.filesystem().opts.interop == InteropModeShared {
+		if !d.cachedMetadataAuthoritative() {
 			if err := d.updateFromGetattr(ctx); err != nil {
 				return 0, err
 			}
@@ -525,7 +600,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
-	fd.off = offset
 	return offset, nil
 }
 
@@ -536,20 +610,19 @@ func (fd *regularFileFD) Sync(ctx context.Context) error {
 
 func (d *dentry) syncSharedHandle(ctx context.Context) error {
 	d.handleMu.RLock()
-	if !d.handleWritable {
-		d.handleMu.RUnlock()
-		return nil
-	}
-	d.dataMu.Lock()
-	// Write dirty cached data to the remote file.
-	err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
-	d.dataMu.Unlock()
-	if err == nil {
-		// Sync the remote file.
-		err = d.handle.sync(ctx)
+	defer d.handleMu.RUnlock()
+
+	if d.handleWritable {
+		d.dataMu.Lock()
+		// Write dirty cached data to the remote file.
+		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+		d.dataMu.Unlock()
+		if err != nil {
+			return err
+		}
 	}
-	d.handleMu.RUnlock()
-	return err
+	// Sync the remote file.
+	return d.handle.sync(ctx)
 }
 
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -747,7 +820,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
 
 // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
 func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
-	// Whether we have a host fd (and consequently what platform.File is
+	// Whether we have a host fd (and consequently what memmap.File is
 	// mapped) can change across save/restore, so invalidate all translations
 	// unconditionally.
 	d.mapsMu.Lock()
@@ -795,8 +868,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 	}
 }
 
-// dentryPlatformFile implements platform.File. It exists solely because dentry
-// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+// dentryPlatformFile implements memmap.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
 //
 // dentryPlatformFile is only used when a host FD representing the remote file
 // is available (i.e. dentry.handle.fd >= 0), and that FD is used for
@@ -804,7 +877,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 type dentryPlatformFile struct {
 	*dentry
 
-	// fdRefs counts references on platform.File offsets. fdRefs is protected
+	// fdRefs counts references on memmap.File offsets. fdRefs is protected
 	// by dentry.dataMu.
 	fdRefs fsutil.FrameRefSet
 
@@ -816,29 +889,29 @@ type dentryPlatformFile struct {
 	hostFileMapperInitOnce sync.Once
 }
 
-// IncRef implements platform.File.IncRef.
-func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
 	d.dataMu.Lock()
 	d.fdRefs.IncRefAndAccount(fr)
 	d.dataMu.Unlock()
 }
 
-// DecRef implements platform.File.DecRef.
-func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
 	d.dataMu.Lock()
 	d.fdRefs.DecRefAndAccount(fr)
 	d.dataMu.Unlock()
 }
 
-// MapInternal implements platform.File.MapInternal.
-func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	d.handleMu.RLock()
 	bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
 	d.handleMu.RUnlock()
 	return bs, err
 }
 
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
 func (d *dentryPlatformFile) FD() int {
 	d.handleMu.RLock()
 	fd := d.handle.fd
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index a464e6a94..811528982 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -16,20 +16,22 @@ package gofer
 
 import (
 	"sync"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// specialFileFD implements vfs.FileDescriptionImpl for files other than
-// regular files, directories, and symlinks: pipes, sockets, etc. It is also
-// used for regular files when filesystemOptions.specialRegularFiles is in
-// effect. specialFileFD differs from regularFileFD by using per-FD handles
-// instead of shared per-dentry handles, and never buffering I/O.
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
+// in effect) regular files. specialFileFD differs from regularFileFD by using
+// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
 type specialFileFD struct {
 	fileDescription
 
@@ -40,13 +42,48 @@ type specialFileFD struct {
 	// file offset is significant, i.e. a regular file. seekable is immutable.
 	seekable bool
 
+	// haveQueue is true if this file description represents a file for which
+	// queue may send I/O readiness events. haveQueue is immutable.
+	haveQueue bool
+	queue     waiter.Queue
+
 	// If seekable is true, off is the file offset. off is protected by mu.
 	mu  sync.Mutex
 	off int64
 }
 
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
+	ftype := d.fileType()
+	seekable := ftype == linux.S_IFREG
+	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
+	fd := &specialFileFD{
+		handle:    h,
+		seekable:  seekable,
+		haveQueue: haveQueue,
+	}
+	fd.LockFD.Init(locks)
+	if haveQueue {
+		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+			return nil, err
+		}
+	}
+	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:  !seekable,
+		DenyPWrite: !seekable,
+	}); err != nil {
+		if haveQueue {
+			fdnotifier.RemoveFD(h.fd)
+		}
+		return nil, err
+	}
+	return fd, nil
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *specialFileFD) Release() {
+	if fd.haveQueue {
+		fdnotifier.RemoveFD(fd.handle.fd)
+	}
 	fd.handle.close(context.Background())
 	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 	fs.syncMu.Lock()
@@ -62,12 +99,44 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
 	return fd.handle.file.flush(ctx)
 }
 
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if fd.haveQueue {
+		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+	}
+	return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	if fd.haveQueue {
+		fd.queue.EventRegister(e, mask)
+		fdnotifier.UpdateFD(fd.handle.fd)
+		return
+	}
+	fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+	if fd.haveQueue {
+		fd.queue.EventUnregister(e)
+		fdnotifier.UpdateFD(fd.handle.fd)
+		return
+	}
+	fd.fileDescription.EventUnregister(e)
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if fd.seekable && offset < 0 {
 		return 0, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
@@ -76,11 +145,14 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
 	// hold here since specialFileFD doesn't client-cache data. Just buffer the
 	// read instead.
-	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+	if d := fd.dentry(); d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
 	if n == 0 {
 		return 0, err
 	}
@@ -105,32 +177,85 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite writes src at offset, or at the cached file size if the file is
+// seekable and was opened with O_APPEND. It returns the number of bytes
+// written, the final offset, and any error. For seekable files the final
+// offset is the write position plus the bytes written; otherwise it is the
+// offset that was passed in. PWrite ignores the final offset.
+func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
 	if fd.seekable && offset < 0 {
-		return 0, syserror.EINVAL
+		return 0, offset, syserror.EINVAL
 	}
-	if opts.Flags != 0 {
-		return 0, syserror.EOPNOTSUPP
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	// If the regular file fd was opened with O_APPEND, make sure the file size
+	// is updated. There is a possible race here if size is modified externally
+	// after metadata cache is updated.
+	if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return 0, offset, err
+		}
 	}
 
 	if fd.seekable {
+		// We need to hold the metadataMu *while* writing to a regular file.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+
+		// Set offset to file size if the regular file was opened with O_APPEND.
+		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+			// Holding d.metadataMu is sufficient for reading d.size.
+			offset = int64(d.size)
+		}
 		limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
 		if err != nil {
-			return 0, err
+			return 0, offset, err
 		}
 		src = src.TakeFirst64(limit)
 	}
 
 	// Do a buffered write. See rationale in PRead.
-	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
-		d.touchCMtime()
+	if d.cachedMetadataAuthoritative() {
+		// d.metadataMu is already held for seekable files (see above), and
+		// touchCMtime would try to lock it again; use the Locked variant.
+		if fd.seekable {
+			d.touchCMtimeLocked()
+		} else {
+			d.touchCMtime()
+		}
 	}
 	buf := make([]byte, src.NumBytes())
 	// Don't do partial writes if we get a partial read from src.
 	if _, err := src.CopyIn(ctx, buf); err != nil {
-		return 0, err
+		return 0, offset, err
 	}
 	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
-	return int64(n), err
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
+	finalOff = offset
+	// Update file size for regular files.
+	if fd.seekable {
+		finalOff += int64(n)
+		// d.metadataMu is already locked at this point.
+		if uint64(finalOff) > d.size {
+			d.dataMu.Lock()
+			defer d.dataMu.Unlock()
+			atomic.StoreUint64(&d.size, uint64(finalOff))
+		}
+	}
+	return int64(n), finalOff, err
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
@@ -140,8 +265,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 	}
 
 	fd.mu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
 	fd.mu.Unlock()
 	return n, err
 }
@@ -153,27 +278,15 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	}
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
-	switch whence {
-	case linux.SEEK_SET:
-		// Use offset as given.
-	case linux.SEEK_CUR:
-		offset += fd.off
-	default:
-		// SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
-		// clear that file size is even meaningful for these files.
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
+	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+	if err != nil {
+		return 0, err
 	}
-	fd.off = offset
-	return offset, nil
+	fd.off = newOffset
+	return newOffset, nil
 }
 
 // Sync implements vfs.FileDescriptionImpl.Sync.
 func (fd *specialFileFD) Sync(ctx context.Context) error {
-	if !fd.vfsfd.IsWritable() {
-		return nil
-	}
-	return fd.handle.sync(ctx)
+	return fd.dentry().syncSharedHandle(ctx)
 }
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2608e7e1d..0eef4e16e 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -36,8 +36,11 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
 	}
 }
 
-// Preconditions: fs.interop != InteropModeShared.
+// Preconditions: d.cachedMetadataAuthoritative() == true.
 func (d *dentry) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
@@ -48,8 +51,8 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	mnt.EndWrite()
 }
 
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -57,8 +60,8 @@ func (d *dentry) touchCtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCMtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -67,6 +70,8 @@ func (d *dentry) touchCMtime() {
 	d.metadataMu.Unlock()
 }
 
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// locked d.metadataMu.
 func (d *dentry) touchCMtimeLocked() {
 	now := d.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&d.mtime, now)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index ca0fe6d2b..bd701bbc7 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -22,17 +22,18 @@ go_library(
         "//pkg/context",
         "//pkg/fdnotifier",
         "//pkg/fspath",
+        "//pkg/iovec",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/platform",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 18b127521..c894f2ca0 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostfd"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -90,7 +91,9 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 		isTTY:      opts.IsTTY,
 		wouldBlock: wouldBlock(uint32(fileType)),
 		seekable:   seekable,
-		canMap:     canMap(uint32(fileType)),
+		// NOTE(b/38213152): Technically, some obscure char devices can be memory
+		// mapped, but we only allow regular files.
+		canMap: fileType == linux.S_IFREG,
 	}
 	i.pf.inode = i
 
@@ -182,6 +185,8 @@ type inode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks vfs.FileLocks
+
 	// When the reference count reaches zero, the host fd is closed.
 	refs.AtomicRefCount
 
@@ -254,7 +259,7 @@ func (i *inode) Mode() linux.FileMode {
 }
 
 // Stat implements kernfs.Inode.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	if opts.Mask&linux.STATX__RESERVED != 0 {
 		return linux.Statx{}, syserror.EINVAL
 	}
@@ -368,7 +373,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
 
 // SetStat implements kernfs.Inode.
 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
-	s := opts.Stat
+	s := &opts.Stat
 
 	m := s.Mask
 	if m == 0 {
@@ -381,7 +386,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
 		return err
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
 		return err
 	}
 
@@ -391,6 +396,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		}
 	}
 	if m&linux.STATX_SIZE != 0 {
+		if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
+			return syserror.EINVAL
+		}
 		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
 			return err
 		}
@@ -454,10 +462,12 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 	fileType := s.Mode & linux.FileTypeMask
 
 	// Constrain flags to a subset we can handle.
-	// TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
-	flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+	//
+	// TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
+	flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
 
-	if fileType == syscall.S_IFSOCK {
+	switch fileType {
+	case syscall.S_IFSOCK:
 		if i.isTTY {
 			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
 			return nil, syserror.ENOTTY
@@ -468,35 +478,41 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 			return nil, err
 		}
 		// Currently, we only allow Unix sockets to be imported.
-		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d)
-	}
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
 
-	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
-	// we don't allow importing arbitrary file types without proper support.
-	if i.isTTY {
-		fd := &TTYFileDescription{
-			fileDescription: fileDescription{inode: i},
-			termios:         linux.DefaultSlaveTermios,
+	case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
+		if i.isTTY {
+			fd := &TTYFileDescription{
+				fileDescription: fileDescription{inode: i},
+				termios:         linux.DefaultSlaveTermios,
+			}
+			fd.LockFD.Init(&i.locks)
+			vfsfd := &fd.vfsfd
+			if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+				return nil, err
+			}
+			return vfsfd, nil
 		}
+
+		fd := &fileDescription{inode: i}
+		fd.LockFD.Init(&i.locks)
 		vfsfd := &fd.vfsfd
 		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return vfsfd, nil
-	}
 
-	fd := &fileDescription{inode: i}
-	vfsfd := &fd.vfsfd
-	if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
-		return nil, err
+	default:
+		log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
+		return nil, syserror.EPERM
 	}
-	return vfsfd, nil
 }
 
 // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
 	// cached to reduce indirections and casting. fileDescription does not hold
@@ -521,8 +537,8 @@ func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 }
 
 // Stat implements vfs.FileDescriptionImpl.
-func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts)
+func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
 }
 
 // Release implements vfs.FileDescriptionImpl.
@@ -530,6 +546,16 @@ func (f *fileDescription) Release() {
 	// noop
 }
 
+// Allocate implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	if !f.inode.seekable {
+		return syserror.ESPIPE
+	}
+
+	// TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds.
+	return syserror.EOPNOTSUPP
+}
+
 // PRead implements FileDescriptionImpl.
 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	i := f.inode
@@ -556,7 +582,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 		}
 		return n, err
 	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+
 	f.offsetMu.Lock()
 	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
 	f.offset += n
@@ -565,8 +591,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 }
 
 func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
-	if flags != 0 {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if flags&^linux.RWF_HIPRI != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
@@ -577,41 +605,58 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
 
 // PWrite implements FileDescriptionImpl.
 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	i := f.inode
-	if !i.seekable {
+	if !f.inode.seekable {
 		return 0, syserror.ESPIPE
 	}
 
-	return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+	return f.writeToHostFD(ctx, src, offset, opts.Flags)
 }
 
 // Write implements FileDescriptionImpl.
 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	i := f.inode
 	if !i.seekable {
-		n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+		n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
 		if isBlockError(err) {
 			err = syserror.ErrWouldBlock
 		}
 		return n, err
 	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+
 	f.offsetMu.Lock()
-	n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags)
+	// NOTE(gvisor.dev/issue/2983): O_APPEND may corrupt file data if another
+	// process modifies the host file between retrieving the file size and
+	// writing to the host fd. This is an unavoidable race condition because
+	// we cannot enforce synchronization on the host.
+	if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			f.offsetMu.Unlock()
+			return 0, err
+		}
+		f.offset = s.Size
+	}
+	n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
 	f.offset += n
 	f.offsetMu.Unlock()
 	return n, err
 }
 
-func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	hostFD := f.inode.hostFD
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
 	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
 	n, err := src.CopyInTo(ctx, writer)
 	hostfd.PutReadWriterAt(writer)
+	// NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
+	if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		if syncErr := unix.Fsync(hostFD); syncErr != nil {
+			return int64(n), syncErr
+		}
+	}
 	return int64(n), err
 }
 
@@ -682,7 +727,7 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
 
 // Sync implements FileDescriptionImpl.
 func (f *fileDescription) Sync(context.Context) error {
-	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
 	return unix.Fsync(f.inode.hostFD)
 }
 
@@ -712,3 +757,13 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) {
 func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 8545a82f0..65d3af38c 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -19,13 +19,12 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// inodePlatformFile implements platform.File. It exists solely because inode
-// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
+// inodePlatformFile implements memmap.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
 //
 // inodePlatformFile should only be used if inode.canMap is true.
 type inodePlatformFile struct {
@@ -34,7 +33,7 @@ type inodePlatformFile struct {
 	// fdRefsMu protects fdRefs.
 	fdRefsMu sync.Mutex
 
-	// fdRefs counts references on platform.File offsets. It is used solely for
+	// fdRefs counts references on memmap.File offsets. It is used solely for
 	// memory accounting.
 	fdRefs fsutil.FrameRefSet
 
@@ -45,32 +44,32 @@ type inodePlatformFile struct {
 	fileMapperInitOnce sync.Once
 }
 
-// IncRef implements platform.File.IncRef.
+// IncRef implements memmap.File.IncRef.
 //
 // Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
+func (i *inodePlatformFile) IncRef(fr memmap.FileRange) {
 	i.fdRefsMu.Lock()
 	i.fdRefs.IncRefAndAccount(fr)
 	i.fdRefsMu.Unlock()
 }
 
-// DecRef implements platform.File.DecRef.
+// DecRef implements memmap.File.DecRef.
 //
 // Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
+func (i *inodePlatformFile) DecRef(fr memmap.FileRange) {
 	i.fdRefsMu.Lock()
 	i.fdRefs.DecRefAndAccount(fr)
 	i.fdRefsMu.Unlock()
 }
 
-// MapInternal implements platform.File.MapInternal.
+// MapInternal implements memmap.File.MapInternal.
 //
 // Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
 }
 
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
 func (i *inodePlatformFile) FD() int {
 	return i.hostFD
 }
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 38f1fbfba..fd16bd92d 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -47,11 +47,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
 	return ep, nil
 }
 
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
 // ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
 // transport.Receiver. It is backed by a host fd that was imported at sentry
 // startup. This fd is shared with a hostfs inode, which retains ownership of
@@ -114,10 +109,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
 	if err != nil {
 		return syserr.FromError(err)
 	}
-	if sndbuf > maxSendBufferSize {
-		log.Warningf("Socket send buffer too large: %d", sndbuf)
-		return syserr.ErrInvalidEndpointState
-	}
 
 	c.stype = linux.SockType(stype)
 	c.sndbuf = int64(sndbuf)
diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go
index 584c247d2..fc0d5fd38 100644
--- a/pkg/sentry/fsimpl/host/socket_iovec.go
+++ b/pkg/sentry/fsimpl/host/socket_iovec.go
@@ -17,13 +17,10 @@ package host
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/iovec"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
 // copyToMulti copies as many bytes from src to dst as possible.
 func copyToMulti(dst [][]byte, src []byte) {
 	for _, d := range dst {
@@ -74,7 +71,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
 		}
 	}
 
-	if iovsRequired > maxIovs {
+	if iovsRequired > iovec.MaxIovs {
 		// The kernel will reject our call if we pass this many iovs.
 		// Use a single intermediate buffer instead.
 		b := make([]byte, stopLen)
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 68af6e5af..4ee9270cc 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -325,9 +326,9 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		// No task? Linux does not have an analog for this case, but
-		// tty_check_change is more of a blacklist of cases than a
-		// whitelist, and is surprisingly permissive. Allowing the
-		// change seems most appropriate.
+		// tty_check_change only blocks specific cases and is
+		// surprisingly permissive. Allowing the change seems
+		// appropriate.
 		return nil
 	}
 
@@ -377,3 +378,13 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
 	return kernel.ERESTARTSYS
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 2bc757b1a..412bdb2eb 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -49,16 +49,6 @@ func wouldBlock(fileType uint32) bool {
 	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
 }
 
-// canMap returns true if a file with fileType is allowed to be memory mapped.
-// This is ported over from VFS1, but it's probably not the best way for us
-// to check if a file can be memory mapped.
-func canMap(fileType uint32) bool {
-	// TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
-	//
-	// TODO(b/38213152): Some obscure character devices can be mapped.
-	return fileType == syscall.S_IFREG
-}
-
 // isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
 // If so, they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index ef34cb28a..3835557fe 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -45,6 +45,7 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
@@ -69,6 +70,6 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/usermem",
-        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1568a9d49..c6c4472e7 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -38,7 +39,8 @@ type DynamicBytesFile struct {
 	InodeNotDirectory
 	InodeNotSymlink
 
-	data vfs.DynamicBytesSource
+	locks vfs.FileLocks
+	data  vfs.DynamicBytesSource
 }
 
 var _ Inode = (*DynamicBytesFile)(nil)
@@ -55,7 +57,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
 // Open implements Inode.Open.
 func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -77,13 +79,15 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent
 type DynamicBytesFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DynamicBytesFileDescriptionImpl
+	vfs.LockFD
 
 	vfsfd vfs.FileDescription
 	inode Inode
 }
 
 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+	fd.LockFD.Init(locks)
 	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
@@ -97,12 +101,12 @@ func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32)
 	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
 }
 
-// Read implmenets vfs.FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
 }
 
-// PRead implmenets vfs.FileDescriptionImpl.PRead.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
 }
@@ -123,7 +127,7 @@ func (fd *DynamicBytesFD) Release() {}
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(fs, opts)
+	return fd.inode.Stat(ctx, fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -131,3 +135,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
 	// DynamicBytesFiles are immutable.
 	return syserror.EPERM
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 8284e76a7..1d37ccb98 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -42,6 +43,7 @@ import (
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.LockFD
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
@@ -55,9 +57,9 @@ type GenericDirectoryFD struct {
 
 // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
 // dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
 	fd := &GenericDirectoryFD{}
-	if err := fd.Init(children, opts); err != nil {
+	if err := fd.Init(children, locks, opts); err != nil {
 		return nil, err
 	}
 	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
@@ -69,11 +71,12 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
 // Init initializes a GenericDirectoryFD. Use it when overriding
 // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
 // correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
+	fd.LockFD.Init(locks)
 	fd.children = children
 	return nil
 }
@@ -109,7 +112,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence
 	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
 }
 
-// Release implements vfs.FileDecriptionImpl.Release.
+// Release implements vfs.FileDescriptionImpl.Release.
 func (fd *GenericDirectoryFD) Release() {}
 
 func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
@@ -120,7 +123,7 @@ func (fd *GenericDirectoryFD) inode() Inode {
 	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
 }
 
-// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
 // o.mu when calling cb.
 func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
 	fd.mu.Lock()
@@ -129,7 +132,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	opts := vfs.StatOptions{Mask: linux.STATX_INO}
 	// Handle ".".
 	if fd.off == 0 {
-		stat, err := fd.inode().Stat(fd.filesystem(), opts)
+		stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
@@ -149,7 +152,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	if fd.off == 1 {
 		vfsd := fd.vfsfd.VirtualDentry().Dentry()
 		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
-		stat, err := parentInode.Stat(fd.filesystem(), opts)
+		stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
@@ -173,7 +176,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
 		inode := it.Dentry.Impl().(*Dentry).inode
-		stat, err := inode.Stat(fd.filesystem(), opts)
+		stat, err := inode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
@@ -195,7 +198,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	return err
 }
 
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
@@ -223,7 +226,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
 	inode := fd.inode()
-	return inode.Stat(fs, opts)
+	return inode.Stat(ctx, fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -232,3 +235,18 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
 	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
 }
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4a12ae245..61a36cff9 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -35,7 +35,7 @@ import (
 // Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
 //
 // Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
 	d := vfsd.Impl().(*Dentry)
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -81,7 +81,7 @@ afterSymlink:
 		return nil, err
 	}
 	// Resolve any symlink at current path component.
-	if rp.ShouldFollowSymlink() && next.isSymlink() {
+	if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
 		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
@@ -152,7 +152,7 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 	vfsd := rp.Start()
 	for !rp.Done() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -178,7 +178,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 	vfsd := rp.Start()
 	for !rp.Final() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -449,7 +449,7 @@ afterTrailingSymlink:
 		return nil, syserror.ENAMETOOLONG
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD)
+	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
 	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
 		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
@@ -684,7 +684,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(fs.VFSFilesystem(), opts)
+	return inode.Stat(ctx, fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 982daa2e6..579e627f0 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -243,7 +243,7 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
+func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
 	stat.DevMajor = a.devMajor
@@ -267,7 +267,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
 
@@ -293,6 +293,8 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	// inode numbers are immutable after node creation.
 
 	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+	// Also, STATX_SIZE will need some special handling, because read-only static
+	// files should return EIO for truncate operations.
 
 	return nil
 }
@@ -469,6 +471,8 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
 	if err := o.checkExistingLocked(name, child); err != nil {
 		return err
 	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
 	o.removeLocked(name)
 	return nil
 }
@@ -516,6 +520,8 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
 	if err := o.checkExistingLocked(oldname, child); err != nil {
 		return nil, err
 	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
 	replaced := dst.replaceChildLocked(newname, child)
 	return replaced, nil
 }
@@ -555,6 +561,8 @@ type StaticDirectory struct {
 	InodeAttrs
 	InodeNoDynamicLookup
 	OrderedChildren
+
+	locks vfs.FileLocks
 }
 
 var _ Inode = (*StaticDirectory)(nil)
@@ -584,7 +592,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
 
 // Open implements kernfs.Inode.
 func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a83151ad3..46f207664 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -225,9 +225,24 @@ func (d *Dentry) destroy() {
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *Dentry) Watches() *vfs.Watches {
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *Dentry) OnZeroWatches() {}
+
 // InsertChild inserts child into the vfs dentry cache with the given name under
 // this dentry. This does not update the directory inode, so calling this on
-// it's own isn't sufficient to insert a child into a directory. InsertChild
+// its own isn't sufficient to insert a child into a directory. InsertChild
 // updates the link count on d if required.
 //
 // Precondition: d must represent a directory inode.
@@ -331,7 +346,7 @@ type inodeMetadata interface {
 
 	// Stat returns the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.StatAt.
-	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
+	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
 
 	// SetStat updates the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
@@ -413,10 +428,10 @@ type inodeDynamicLookup interface {
 	// IterDirents is used to iterate over dynamically created entries. It invokes
 	// cb on each entry in the directory represented by the FileDescription.
 	// 'offset' is the offset for the entire IterDirents call, which may include
-	// results from the caller. 'relOffset' is the offset inside the entries
-	// returned by this IterDirents invocation. In other words,
-	// 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
-	// while 'relOffset' is the place where iteration should start from.
+	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
+	// inside the entries returned by this IterDirents invocation. In other words,
+	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
+	// the return value, while 'relOffset' is the place to start iteration.
 	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 412cf6ac9..dc407eb1d 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -100,8 +100,10 @@ type readonlyDir struct {
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeDirectoryNoNewChildren
-
 	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
 	dentry kernfs.Dentry
 }
 
@@ -117,7 +119,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 }
 
 func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -128,10 +130,12 @@ type dir struct {
 	attrs
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
 
 	fs     *filesystem
 	dentry kernfs.Dentry
-	kernfs.OrderedChildren
 }
 
 func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
@@ -147,7 +151,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
new file mode 100644
index 000000000..8cf5b35d3
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "overlay",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
+go_library(
+    name = "overlay",
+    srcs = [
+        "copy_up.go",
+        "directory.go",
+        "filesystem.go",
+        "fstree.go",
+        "non_directory.go",
+        "overlay.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
new file mode 100644
index 000000000..8f8dcfafe
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -0,0 +1,262 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isCopiedUp() bool {
+	return atomic.LoadUint32(&d.copiedUp) != 0
+}
+
+// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok().
+//
+// Preconditions: filesystem.renameMu must be locked.
+func (d *dentry) copyUpLocked(ctx context.Context) error {
+	// Fast path.
+	if d.isCopiedUp() {
+		return nil
+	}
+
+	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+	switch ftype {
+	case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
+		// Can be copied-up.
+	default:
+		// Can't be copied-up.
+		return syserror.EPERM
+	}
+
+	// Ensure that our parent directory is copied-up.
+	if d.parent == nil {
+		// d is a filesystem root with no upper layer.
+		return syserror.EROFS
+	}
+	if err := d.parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	if d.upperVD.Ok() {
+		// Raced with another call to d.copyUpLocked().
+		return nil
+	}
+	if d.vfsd.IsDead() {
+		// Raced with deletion of d.
+		return syserror.ENOENT
+	}
+
+	// Perform copy-up: create d.name under the parent's upper-layer directory.
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	newpop := vfs.PathOperation{
+		Root:  d.parent.upperVD,
+		Start: d.parent.upperVD,
+		Path:  fspath.Parse(d.name),
+	}
+	cleanupUndoCopyUp := func() {
+		var err error
+		if ftype == linux.S_IFDIR {
+			err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop)
+		} else {
+			err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
+		}
+		if err != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+		}
+	}
+	switch ftype {
+	case linux.S_IFREG:
+		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY,
+		})
+		if err != nil {
+			return err
+		}
+		defer oldFD.DecRef()
+		newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{
+			Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL,
+			Mode:  linux.FileMode(d.mode &^ linux.S_IFMT),
+		})
+		if err != nil {
+			return err
+		}
+		defer newFD.DecRef()
+		bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
+		for {
+			readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{})
+			if readErr != nil && readErr != io.EOF {
+				cleanupUndoCopyUp()
+				return readErr
+			}
+			// Write exactly the readN bytes just read; the rest of the buffer is stale.
+			for src := bufIOSeq.TakeFirst64(readN); src.NumBytes() != 0; {
+				writeN, writeErr := newFD.Write(ctx, src, vfs.WriteOptions{})
+				src = src.DropFirst64(writeN)
+				if writeErr != nil {
+					cleanupUndoCopyUp()
+					return writeErr
+				}
+			}
+			if readErr == io.EOF {
+				break
+			}
+		}
+		if err := newFD.SetStat(ctx, vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = newFD.VirtualDentry()
+		d.upperVD.IncRef()
+
+	case linux.S_IFDIR:
+		if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{
+			Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+		}); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	case linux.S_IFLNK:
+		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		})
+		if err != nil {
+			return err
+		}
+		if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
+				Mode: uint16(d.mode),
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	case linux.S_IFBLK, linux.S_IFCHR:
+		lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		}, &vfs.StatOptions{})
+		if err != nil {
+			return err
+		}
+		if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
+			Mode:     linux.FileMode(d.mode),
+			DevMajor: lowerStat.RdevMajor,
+			DevMinor: lowerStat.RdevMinor,
+		}); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	default:
+		// Unreachable: other file types were rejected with EPERM above.
+		panic(fmt.Sprintf("unexpected file type %o", ftype))
+	}
+
+	// TODO(gvisor.dev/issue/1199): copy up xattrs
+
+	// Update the dentry's device and inode numbers (except for directories,
+	// for which these remain overlay-assigned).
+	if ftype != linux.S_IFDIR {
+		upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.upperVD,
+			Start: d.upperVD,
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_INO,
+		})
+		if err != nil {
+			d.upperVD.DecRef()
+			d.upperVD = vfs.VirtualDentry{}
+			cleanupUndoCopyUp()
+			return err
+		}
+		if upperStat.Mask&linux.STATX_INO == 0 {
+			d.upperVD.DecRef()
+			d.upperVD = vfs.VirtualDentry{}
+			cleanupUndoCopyUp()
+			return syserror.EREMOTE
+		}
+		atomic.StoreUint32(&d.devMajor, upperStat.DevMajor)
+		atomic.StoreUint32(&d.devMinor, upperStat.DevMinor)
+		atomic.StoreUint64(&d.ino, upperStat.Ino)
+	}
+
+	atomic.StoreUint32(&d.copiedUp, 1)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
new file mode 100644
index 000000000..f5c2462a5
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -0,0 +1,287 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	var readdirErr error
+	whiteouts := make(map[string]bool)
+	var maybeWhiteouts []string
+	d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+		layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+		})
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+		defer layerFD.DecRef()
+
+		// Reuse slice allocated for maybeWhiteouts from a previous layer to
+		// reduce allocations.
+		maybeWhiteouts = maybeWhiteouts[:0]
+		if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name == "." || dirent.Name == ".." {
+				return nil
+			}
+			if _, ok := whiteouts[dirent.Name]; ok {
+				// This file has been whited-out in a previous layer.
+				return nil
+			}
+			if dirent.Type == linux.DT_CHR {
+				// We have to determine if this is a whiteout, which doesn't
+				// count against the directory's emptiness. However, we can't
+				// do so while holding locks held by layerFD.IterDirents().
+				maybeWhiteouts = append(maybeWhiteouts, dirent.Name)
+				return nil
+			}
+			// Non-whiteout file in the directory prevents rmdir.
+			return syserror.ENOTEMPTY
+		})); err != nil {
+			readdirErr = err
+			return false
+		}
+
+		for _, maybeWhiteoutName := range maybeWhiteouts {
+			stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  layerVD,
+				Start: layerVD,
+				Path:  fspath.Parse(maybeWhiteoutName),
+			}, &vfs.StatOptions{})
+			if err != nil {
+				readdirErr = err
+				return false
+			}
+			if stat.RdevMajor != 0 || stat.RdevMinor != 0 {
+				// This file is a real character device, not a whiteout.
+				readdirErr = syserror.ENOTEMPTY
+				return false
+			}
+			whiteouts[maybeWhiteoutName] = isUpper
+		}
+		// Continue iteration since we haven't found any non-whiteout files in
+		// this directory yet.
+		return true
+	})
+	return whiteouts, readdirErr
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	mu      sync.Mutex
+	off     int64
+	dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	// Fetch the dentry's merged dirents the first time they are needed.
+	if fd.dirents == nil {
+		snapshot, err := fd.dentry().getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = snapshot
+	}
+
+	// fd.off only advances past entries that cb handled without error.
+	for ; fd.off < int64(len(fd.dirents)); fd.off++ {
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Preconditions: d.isDir().
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+
+	if d.dirents != nil {
+		return d.dirents, nil
+	}
+
+	parent := genericParentOrSelf(d)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     d.ino,
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     parent.ino,
+			NextOff: 2,
+		},
+	}
+
+	// Merge dirents from all layers comprising this directory.
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	var readdirErr error
+	prevDirents := make(map[string]struct{})
+	var maybeWhiteouts []vfs.Dirent
+	d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+		layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+		})
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+		defer layerFD.DecRef()
+
+		// Reuse slice allocated for maybeWhiteouts from a previous layer to
+		// reduce allocations.
+		maybeWhiteouts = maybeWhiteouts[:0]
+		if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name == "." || dirent.Name == ".." {
+				return nil
+			}
+			if _, ok := prevDirents[dirent.Name]; ok {
+				// This file is hidden by, or merged with, another file with
+				// the same name in a previous layer.
+				return nil
+			}
+			prevDirents[dirent.Name] = struct{}{}
+			if dirent.Type == linux.DT_CHR {
+				// We can't determine if this file is a whiteout while holding
+				// locks held by layerFD.IterDirents().
+				maybeWhiteouts = append(maybeWhiteouts, dirent)
+				return nil
+			}
+			dirent.NextOff = int64(len(dirents) + 1)
+			dirents = append(dirents, dirent)
+			return nil
+		})); err != nil {
+			readdirErr = err
+			return false
+		}
+
+		for _, dirent := range maybeWhiteouts {
+			stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  layerVD,
+				Start: layerVD,
+				Path:  fspath.Parse(dirent.Name),
+			}, &vfs.StatOptions{})
+			if err != nil {
+				readdirErr = err
+				return false
+			}
+			if stat.RdevMajor == 0 && stat.RdevMinor == 0 {
+				// This file is a whiteout; don't emit a dirent for it.
+				continue
+			}
+			dirent.NextOff = int64(len(dirents) + 1)
+			dirents = append(dirents, dirent)
+		}
+		return true
+	})
+	if readdirErr != nil {
+		return nil, readdirErr
+	}
+
+	// Cache dirents for future directoryFDs.
+	d.dirents = dirents
+	return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	// Resolve the request to an absolute offset. Only SEEK_SET and SEEK_CUR
+	// are accepted; any other whence is rejected with EINVAL.
+	target := offset
+	switch whence {
+	case linux.SEEK_SET:
+		// offset is already absolute.
+	case linux.SEEK_CUR:
+		target += fd.off
+	default:
+		return 0, syserror.EINVAL
+	}
+	if target < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// An explicit rewind (SEEK_SET to 0) drops the cached dirents so that
+	// the next call to fd.IterDirents() calls fd.dentry().getDirents().
+	// A SEEK_CUR that happens to land on 0 deliberately keeps them.
+	if whence == linux.SEEK_SET && target == 0 {
+		fd.dirents = nil
+	}
+	fd.off = target
+	return fd.off, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. The sync is forwarded to the
+// upper layer when the directory has been copied up; otherwise it is a no-op,
+// because the lower layer never changes and so never needs syncing.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+	dir := fd.dentry()
+	if !dir.isCopiedUp() {
+		return nil
+	}
+	// Open a temporary FD on the upper-layer directory just to sync it.
+	upperFD, err := dir.fs.vfsfs.VirtualFilesystem().OpenAt(ctx, dir.fs.creds, &vfs.PathOperation{
+		Root:  dir.upperVD,
+		Start: dir.upperVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+	})
+	if err != nil {
+		return err
+	}
+	defer upperFD.DecRef()
+	return upperFD.Sync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
new file mode 100644
index 000000000..6b705e955
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -0,0 +1,1364 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
+// opaque directories. Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
+const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+
+// isWhiteout reports whether stat describes a character device with device number 0:0.
+func isWhiteout(stat *linux.Statx) bool {
+	return stat.RdevMajor == 0 && stat.RdevMinor == 0 && stat.Mode&linux.S_IFMT == linux.S_IFCHR
+}
+
+// Sync implements vfs.FilesystemImpl.Sync; only an upper layer, if present, is synced.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	if upper := fs.opts.UpperRoot; upper.Ok() {
+		return upper.Mount().Filesystem().Impl().Sync(ctx)
+	}
+	return nil
+}
+
+// dentrySlicePool recycles the scratch slices filled by appendDentry.
+var dentrySlicePool = sync.Pool{New: func() interface{} {
+	s := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+	return &s
+}}
+
+// appendDentry appends d to *ds, taking a slice from the pool first if ds is nil.
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// putDentrySlice empties *ds and returns it to the pool. Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	s := *ds
+	for i := range s {
+		s[i] = nil // allow dentries to be GC'd
+	}
+	*ds = s[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing, and finally passes *ds to putDentrySlice. *ds may be nil.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return // no slice was ever allocated, so there is nothing to check
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock() // the read lock was dropped above; take the write lock
+		for _, d := range **ds {
+			d.checkDropLocked()
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+// renameMuUnlockAndCheckDrop is like renameMuRUnlockAndCheckDrop, but for callers holding fs.renameMu for writing.
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) {
+	slice := *ds
+	if slice != nil {
+		for _, d := range *slice {
+			d.checkDropLocked()
+		}
+		defer putDentrySlice(slice) // deferred so that it runs after the Unlock below
+	}
+	fs.renameMu.Unlock()
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory, and advances rp past that component. A symlink is followed
+// only if both mayFollowSymlinks and rp.ShouldFollowSymlink() are true.
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { // d must be searchable
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil { // ".." at a root resolves to d itself
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(&child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// getChildLocked returns parent's child dentry name, looking it up if uncached.
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	child, cached := parent.children[name]
+	if !cached {
+		var err error
+		if child, err = fs.lookupLocked(ctx, parent, name); err != nil {
+			return nil, err
+		}
+		if parent.children == nil {
+			parent.children = make(map[string]*dentry)
+		}
+		parent.children[name] = child
+		*ds = appendDentry(*ds, child) // child's refcount is initially 0, so it may be dropped after traversal
+	}
+	return child, nil
+}
+
+// lookupLocked returns a new dentry for the child of parent named name,
+// merging every layer of parent on which the child is visible; the search
+// stops at a whiteout, a non-directory file, or an opaque directory. The new
+// dentry holds a reference on parent but is not added to parent.children.
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+	childPath := fspath.Parse(name)
+	child := fs.newDentry()
+	existsOnAnyLayer := false
+	var lookupErr error
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+		childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parentVD,
+			Start: parentVD,
+			Path:  childPath,
+		}, &vfs.GetDentryOptions{})
+		if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+			// The file doesn't exist on this layer. Proceed to the next one.
+			return true
+		}
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		// GetDentryAt took a reference on childVD. Drop it once this layer has
+		// been handled; child takes its own reference below if it keeps childVD.
+		defer childVD.DecRef()
+
+		mask := uint32(linux.STATX_TYPE)
+		if !existsOnAnyLayer {
+			// Mode, UID, GID, and (for non-directories) inode number come from
+			// the topmost layer on which the file exists.
+			mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+		}
+		stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  childVD,
+			Start: childVD,
+		}, &vfs.StatOptions{Mask: mask})
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		if stat.Mask&mask != mask {
+			lookupErr = syserror.EREMOTE
+			return false
+		}
+		if isWhiteout(&stat) {
+			// This is a whiteout, so it "doesn't exist" on this layer, and
+			// layers below this one are ignored.
+			return false
+		}
+		isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
+		if existsOnAnyLayer && !isDir {
+			// Directories are not merged with non-directory files from lower
+			// layers; layers including and below the first non-directory file
+			// are ignored. (The file must be a directory on previous layers.)
+			return false
+		}
+
+		// Update child to include this layer, with a reference of its own.
+		childVD.IncRef()
+		if isUpper {
+			child.upperVD = childVD
+			child.copiedUp = 1
+		} else {
+			child.lowerVDs = append(child.lowerVDs, childVD)
+		}
+		if !existsOnAnyLayer {
+			existsOnAnyLayer = true
+			child.mode = uint32(stat.Mode)
+			child.uid = stat.UID
+			child.gid = stat.GID
+			child.devMajor = stat.DevMajor
+			child.devMinor = stat.DevMinor
+			child.ino = stat.Ino
+		}
+
+		// For non-directory files, only the topmost layer that contains a file matters.
+		if !isDir {
+			return false
+		}
+
+		// Directories are merged with directories from lower layers unless explicitly opaque.
+		opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  childVD,
+			Start: childVD,
+		}, &vfs.GetxattrOptions{
+			Name: _OVL_XATTR_OPAQUE,
+			Size: 1,
+		})
+		return !(err == nil && opaqueVal == "y")
+	})
+
+	if lookupErr == nil && !existsOnAnyLayer {
+		lookupErr = syserror.ENOENT
+	}
+	if lookupErr != nil {
+		child.destroyLocked()
+		return nil, lookupErr
+	}
+
+	// Device and inode numbers were copied from the topmost layer above;
+	// override them if necessary.
+	if child.isDir() {
+		child.devMajor = linux.UNNAMED_MAJOR
+		child.devMinor = fs.dirDevMinor
+		child.ino = fs.newDirIno()
+	} else if !child.upperVD.Ok() {
+		child.devMajor = linux.UNNAMED_MAJOR
+		child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()]
+	}
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+	return child, nil
+}
+
+// lookupLayerLocked is similar to lookupLocked, but only reports on which
+// layer, if any, the child of parent named name exists, rather than a dentry.
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
+	childPath := fspath.Parse(name)
+	lookupLayer := lookupLayerNone // NOTE: this local shadows the lookupLayer type for the rest of the function
+	var lookupErr error
+
+	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+		stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parentVD,
+			Start: parentVD,
+			Path:  childPath,
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_TYPE,
+		})
+		if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+			// The file doesn't exist on this layer. Proceed to the next
+			// one.
+			return true
+		}
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		if stat.Mask&linux.STATX_TYPE == 0 {
+			// Linux's overlayfs tends to return EREMOTE in cases where a file
+			// is unusable for reasons that are not better captured by another
+			// errno.
+			lookupErr = syserror.EREMOTE
+			return false
+		}
+		if isWhiteout(&stat) {
+			// This is a whiteout, so it "doesn't exist" on this layer, and
+			// layers below this one are ignored.
+			if isUpper {
+				lookupLayer = lookupLayerUpperWhiteout
+			}
+			return false // a whiteout on a lower layer leaves the result at lookupLayerNone
+		}
+		// The file exists; we can stop searching.
+		if isUpper {
+			lookupLayer = lookupLayerUpper
+		} else {
+			lookupLayer = lookupLayerLower
+		}
+		return false
+	})
+
+	return lookupLayer, lookupErr
+}
+
+type lookupLayer int // result of lookupLayerLocked: where, if anywhere, a file was found
+
+const (
+	// lookupLayerNone indicates that no file exists at the given path on the
+	// upper layer, and is either whited out or does not exist on lower layers.
+	// Therefore, the file does not exist in the overlay filesystem, and file
+	// creation may proceed normally (if an upper layer exists).
+	lookupLayerNone lookupLayer = iota
+
+	// lookupLayerLower indicates that no file exists at the given path on the
+	// upper layer, but exists on a lower layer. Therefore, the file exists in
+	// the overlay filesystem, but must be copied-up before mutation.
+	lookupLayerLower
+
+	// lookupLayerUpper indicates that a non-whiteout file exists at the given
+	// path on the upper layer. Therefore, the file exists in the overlay
+	// filesystem, and is already copied-up.
+	lookupLayerUpper
+
+	// lookupLayerUpperWhiteout indicates that a whiteout exists at the given
+	// path on the upper layer. Therefore, the file does not exist in the
+	// overlay filesystem, and file creation must remove the whiteout before
+	// proceeding.
+	lookupLayerUpperWhiteout
+)
+
+func (ll lookupLayer) existsInOverlay() bool { // true only for lookupLayerLower and lookupLayerUpper
+	return ll == lookupLayerLower || ll == lookupLayerUpper
+}
+
+// walkParentDirLocked resolves every component of rp except the last,
+// beginning at d (usually rp.Start().Impl().(*dentry)), and returns the
+// directory reached. It does not check that this directory is searchable by
+// the provider of rp.
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	dir := d
+	for !rp.Final() {
+		dir.dirMu.Lock()
+		child, err := fs.stepLocked(ctx, rp, dir, true /* mayFollowSymlinks */, ds)
+		dir.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		dir = child
+	}
+	if dir.isDir() {
+		return dir, nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// resolveLocked walks all remaining components of rp, following symlinks, and
+// returns the existing file reached, or ENOTDIR if rp requires a directory.
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	cur := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		cur.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, cur, true /* mayFollowSymlinks */, ds)
+		cur.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		cur = next
+	}
+	if !rp.MustBeDir() || cur.isDir() {
+		return cur, nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create(parent, name, haveUpperWhiteout) with parent copied up and
+// parent.dirMu locked. dir is true if a directory is being created.
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() { // rp requires a directory, but a non-directory is being created
+		return syserror.ENOENT
+	}
+	if parent.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Determine if a file already exists at name.
+	if _, ok := parent.children[name]; ok {
+		return syserror.EEXIST
+	}
+	childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
+	if err != nil {
+		return err
+	}
+	if childLayer.existsInOverlay() {
+		return syserror.EEXIST
+	}
+
+	// Ensure that the parent directory is copied-up so that we can create the
+	// new file in the upper layer.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Finally create the new file.
+	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
+		return err
+	}
+	parent.dirents = nil // drop cached dirents, which no longer reflect the directory
+	return nil
+}
+
+// createWhiteout creates a whiteout at pop using fs.creds.
+// Preconditions: pop's parent directory has been copied up.
+func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error {
+	// permissions == include/linux/fs.h:WHITEOUT_MODE == 0; DevMajor == DevMinor == 0, from WHITEOUT_DEV.
+	return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{Mode: linux.S_IFCHR})
+}
+
+// cleanupRecreateWhiteout re-creates a whiteout at pop; failure is logged, not returned.
+func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
+	if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+	}
+}
+
+// AccessAt implements vfs.FilesystemImpl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.renameMu.RLock()
+	var toDrop *[]*dentry
+	defer fs.renameMuRUnlockAndCheckDrop(&toDrop)
+	target, err := fs.resolveLocked(ctx, rp, &toDrop)
+	if err == nil {
+		err = target.checkPermissions(creds, ats)
+	}
+	return err
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. It requires
+// write permission on the file, then delegates to the file's top layer.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	fs.renameMu.RLock()
+	var toDrop *[]*dentry
+	defer fs.renameMuRUnlockAndCheckDrop(&toDrop)
+	target, err := fs.resolveLocked(ctx, rp, &toDrop)
+	if err != nil {
+		return nil, err
+	}
+	if err := target.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	// Look the endpoint up on the dentry's top layer, using fs.creds.
+	top := target.topLayer()
+	layerPop := vfs.PathOperation{Root: top, Start: top}
+	return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &layerPop, &opts)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt; a reference is taken on the returned dentry.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.renameMu.RLock()
+	var toDrop *[]*dentry
+	defer fs.renameMuRUnlockAndCheckDrop(&toDrop)
+	target, err := fs.resolveLocked(ctx, rp, &toDrop)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable && !target.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if opts.CheckSearchable {
+		if err := target.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	target.IncRef()
+	return &target.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt; a
+// reference is taken on the returned parent directory dentry.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.renameMu.RLock()
+	var toDrop *[]*dentry
+	defer fs.renameMuRUnlockAndCheckDrop(&toDrop)
+	parentDir, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &toDrop)
+	if err != nil {
+		return nil, err
+	}
+	parentDir.IncRef()
+	return &parentDir.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt by linking vd's upper-layer file (copied up first) at rp.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		if rp.Mount() != vd.Mount() { // links across mounts are rejected
+			return syserror.EXDEV
+		}
+		old := vd.Dentry().Impl().(*dentry)
+		if old.isDir() { // directories cannot be linked
+			return syserror.EPERM
+		}
+		if err := old.copyUpLocked(ctx); err != nil { // old.upperVD is needed as the link source below
+			return err
+		}
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		newpop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout { // the whiteout occupies the new name; remove it first
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  old.upperVD,
+			Start: old.upperVD,
+		}, &newpop); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+			}
+			return err
+		}
+		creds := rp.Credentials() // the caller's credentials; the link itself was made with fs.creds
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt by creating the directory on the upper layer.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout { // the whiteout occupies the new name; remove it first
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials() // the caller's credentials; the directory was created with fs.creds
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		if haveUpperWhiteout {
+			// There may be directories on lower layers (previously hidden by
+			// the whiteout) that the new directory should not be merged with.
+			// Mark it opaque to prevent merging.
+			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+				Name:  _OVL_XATTR_OPAQUE,
+				Value: "y",
+			}); err != nil {
+				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+				} else {
+					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) // haveUpperWhiteout is known to be true here
+				}
+				return err
+			}
+		}
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt by creating the node on the upper layer.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		// Disallow attempts to create whiteouts (0:0 character devices).
+		if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
+			return syserror.EPERM
+		}
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout { // the whiteout occupies the new name; remove it first
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials() // the caller's credentials; the node was created with fs.creds
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt, creating the file first if O_CREAT is set and it does not exist.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if rp.Done() { // no path components remain: rp names start itself
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.openLocked(ctx, rp, &opts)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+	if err == syserror.ENOENT && mayCreate {
+		fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	if err != nil {
+		parent.dirMu.Unlock()
+		return nil, err
+	}
+	// Open existing child or follow symlink.
+	parent.dirMu.Unlock()
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent // resume the walk from the directory containing the symlink
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, &opts)
+}
+
+// openLocked opens d as requested by opts and returns the new FD. Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	if ats.MayWrite() { // NOTE(review): this also copies up a directory, which is then rejected with EISDIR below
+		if err := d.copyUpLocked(ctx); err != nil {
+			return nil, err
+		}
+	}
+	mnt := rp.Mount()
+
+	// Directory FDs open FDs from each layer when directory entries are read,
+	// so they don't require opening an FD from d.topLayer() up front.
+	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+	if ftype == linux.S_IFDIR {
+		// Can't open directories with O_CREAT.
+		if opts.Flags&linux.O_CREAT != 0 {
+			return nil, syserror.EISDIR
+		}
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		if opts.Flags&linux.O_DIRECT != 0 { // O_DIRECT is rejected for directories
+			return nil, syserror.EINVAL
+		}
+		fd := &directoryFD{}
+		fd.LockFD.Init(&d.locks)
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+			UseDentryMetadata: true,
+		}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	}
+
+	layerVD, isUpper := d.topLayerInfo() // non-directories are backed by an FD opened on the top layer
+	layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	}, opts)
+	if err != nil {
+		return nil, err
+	}
+	layerFlags := layerFD.StatusFlags()
+	fd := &nonDirectoryFD{
+		copiedUp:    isUpper,
+		cachedFD:    layerFD,
+		cachedFlags: layerFlags,
+	}
+	fd.LockFD.Init(&d.locks)
+	layerFDOpts := layerFD.Options()
+	if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil {
+		layerFD.DecRef() // the overlay FD never took ownership of layerFD
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// createAndOpenLocked creates a file named rp.Component() on parent's upper
+// layer (copying parent up first), sets its owner to the caller's effective
+// UID/GID, and returns an FD for it. If a later step fails, it tries to remove
+// the new file again and to restore any whiteout that the file replaced.
+// Preconditions: parent.dirMu must be locked. parent does not already contain
+// a child named rp.Component().
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+	creds := rp.Credentials()
+	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	if parent.vfsd.IsDead() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return nil, err
+	}
+
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	childName := rp.Component()
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(childName),
+	}
+	// We don't know if a whiteout exists on the upper layer; speculatively
+	// unlink it. TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so
+	// that we do know whether a whiteout exists.
+	unlinkErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop)
+	if unlinkErr != nil && unlinkErr != syserror.ENOENT {
+		return nil, unlinkErr
+	}
+	haveUpperWhiteout := unlinkErr == nil
+	// Create the file on the upper layer, and get an FD representing it.
+	upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
+		Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
+		Mode:  opts.Mode,
+	})
+	if err != nil {
+		if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// parent's contents changed; invalidate its cached dirents, as doCreateAt does.
+	parent.dirents = nil
+	// Change the file's owner to the caller. We can't use upperFD.SetStat()
+	// because it will pick up creds from ctx.
+	if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_UID | linux.STATX_GID,
+			UID:  uint32(creds.EffectiveKUID),
+			GID:  uint32(creds.EffectiveKGID),
+		},
+	}); err != nil {
+		upperFD.DecRef() // don't leak the FD opened above
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Re-lookup to get a dentry for the new file, which the returned FD needs.
+	child, err := fs.getChildLocked(ctx, parent, childName, ds)
+	if err != nil {
+		upperFD.DecRef() // don't leak the FD opened above
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Finally construct the overlay FD.
+	upperFlags := upperFD.StatusFlags()
+	fd := &nonDirectoryFD{
+		copiedUp:    true,
+		cachedFD:    upperFD,
+		cachedFlags: upperFlags,
+	}
+	fd.LockFD.Init(&child.locks)
+	upperFDOpts := upperFD.Options()
+	if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
+		upperFD.DecRef()
+		// Don't bother with cleanup; the file was created successfully, we
+		// just can't open it anymore for some reason.
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt by reading the link
+// target from the resolved dentry's top layer.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.renameMu.RLock()
+	var toDrop *[]*dentry
+	defer fs.renameMuRUnlockAndCheckDrop(&toDrop)
+	link, err := fs.resolveLocked(ctx, rp, &toDrop)
+	if err != nil {
+		return "", err
+	}
+
+	top := link.topLayer()
+	layerPop := vfs.PathOperation{Root: top, Start: top}
+	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, link.fs.creds, &layerPop)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt. Rename is not implemented yet: valid requests fail with EXDEV.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 { // no rename flags are supported
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock()
+	defer fs.renameMuUnlockAndCheckDrop(&ds)
+	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	// FIXME(gvisor.dev/issue/1199): Actually implement rename.
+	_ = newParent // silences the unused-variable error until rename is implemented
+	return syserror.EXDEV
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt: it removes the child directory named by rp's final component from the upper layer (together with the upper-layer whiteouts inside it) and then creates a whiteout for that name in the parent's upper directory.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Copy parent up first: the whiteout and the upper-layer removal below
+	// both operate on parent.upperVD, and child.copyMu is taken afterwards.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Unlike UnlinkAt, we need a dentry representing the child directory being
+	// removed in order to verify that it's empty.
+	child, err := fs.getChildLocked(ctx, parent, name, &ds)
+	if err != nil {
+		return err
+	}
+	if !child.isDir() {
+		return syserror.ENOTDIR
+	}
+	child.dirMu.Lock()
+	defer child.dirMu.Unlock()
+	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx) // name -> flag; presumably true when the whiteout is on the upper layer (TODO confirm)
+	if err != nil {
+		return err
+	}
+	child.copyMu.RLock()
+	defer child.copyMu.RUnlock()
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+		return err
+	}
+
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(name),
+	}
+	if child.upperVD.Ok() {
+		cleanupRecreateWhiteouts := func() {
+			if !child.upperVD.Ok() {
+				return
+			}
+			for whiteoutName, whiteoutUpper := range whiteouts {
+				if !whiteoutUpper {
+					continue
+				}
+				if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+					Root:  child.upperVD,
+					Start: child.upperVD,
+					Path:  fspath.Parse(whiteoutName),
+				}); err != nil && err != syserror.EEXIST {
+					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+				}
+			}
+		}
+		// Remove existing whiteouts on the upper layer; on failure, recreate the ones already removed.
+		for whiteoutName, whiteoutUpper := range whiteouts {
+			if !whiteoutUpper {
+				continue
+			}
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  child.upperVD,
+				Start: child.upperVD,
+				Path:  fspath.Parse(whiteoutName),
+			}); err != nil {
+				cleanupRecreateWhiteouts()
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+				return err
+			}
+		}
+		// Remove the existing (now whiteout-free) directory on the upper layer.
+		if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
+			cleanupRecreateWhiteouts()
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+			return err
+		}
+	}
+	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+		// Don't attempt to recover from this: the original directory is
+		// already gone, so any dentries representing it are invalid, and
+		// creating a new directory won't undo that.
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
+		vfsObj.AbortDeleteDentry(&child.vfsd)
+		return err
+	}
+
+	vfsObj.CommitDeleteDentry(&child.vfsd)
+	delete(parent.children, name)
+	ds = appendDentry(ds, child)
+	parent.dirents = nil // drop the cached directory listing, which is now stale
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	// Validate the request against d's cached mode, owner, and group.
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	owner := auth.KUID(atomic.LoadUint32(&d.uid))
+	group := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, owner, group); err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	upperPop := vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+	if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &upperPop, &opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	// Fields outside statInternalMask are requested from d's topmost layer;
+	// when none are requested, the layer is not consulted at all.
+	var stat linux.Statx
+	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+		top := d.topLayer()
+		layerPop := vfs.PathOperation{Root: top, Start: top}
+		layerOpts := vfs.StatOptions{Mask: layerMask, Sync: opts.Sync}
+		stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &layerPop, &layerOpts)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	// statInternalTo then fills in fields from d itself (presumably those
+	// covered by statInternalMask).
+	d.statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	// The path is resolved only for its error; the result is per-filesystem.
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return linux.Statfs{}, err
+	}
+	return fs.statFS(ctx)
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		upperPop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		// A whiteout occupying the name has to go before the symlink can be created.
+		if haveUpperWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &upperPop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.SymlinkAt(ctx, fs.creds, &upperPop, target); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &upperPop)
+			}
+			return err
+		}
+		// The symlink was created with fs.creds; hand ownership to the caller.
+		creds := rp.Credentials()
+		owner := linux.Statx{Mask: linux.STATX_UID | linux.STATX_GID, UID: uint32(creds.EffectiveKUID), GID: uint32(creds.EffectiveKGID)}
+		err := vfsObj.SetStatAt(ctx, fs.creds, &upperPop, &vfs.SetStatOptions{Stat: owner})
+		if err == nil {
+			return nil
+		}
+		// Roll back: remove the new symlink and, if that works, restore the
+		// whiteout that was removed above.
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &upperPop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &upperPop)
+		}
+		return err
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt: it removes the non-directory file named by rp's final component, unlinking it from the upper layer if it exists there, and then creates a whiteout for that name in the parent's upper directory.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	if rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Copy parent up first: the whiteout and the upper-layer unlink below
+	// both operate on parent.upperVD, and child.copyMu may be taken afterwards.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	child := parent.children[name] // nil if no dentry has been instantiated for name
+	var childLayer lookupLayer
+	if child != nil {
+		if child.isDir() {
+			return syserror.EISDIR
+		}
+		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+			return err
+		}
+		// Hold child.copyMu to prevent it from being copied-up during
+		// deletion.
+		child.copyMu.RLock()
+		defer child.copyMu.RUnlock()
+		if child.upperVD.Ok() {
+			childLayer = lookupLayerUpper
+		} else {
+			childLayer = lookupLayerLower
+		}
+	} else {
+		// Determine if the file being unlinked actually exists. Holding
+		// parent.dirMu prevents a dentry from being instantiated for the file,
+		// which in turn prevents it from being copied-up, so this result is
+		// stable.
+		childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+		if err != nil {
+			return err
+		}
+		if !childLayer.existsInOverlay() {
+			return syserror.ENOENT
+		}
+	}
+
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(name),
+	}
+	if childLayer == lookupLayerUpper {
+		// Remove the existing file on the upper layer; nothing has changed yet, so failure only needs the pending delete aborted.
+		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return err
+		}
+	}
+	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
+		if child != nil {
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+		}
+		return err
+	}
+
+	if child != nil {
+		vfsObj.CommitDeleteDentry(&child.vfsd)
+		delete(parent.children, name)
+		ds = appendDentry(ds, child)
+	}
+	parent.dirents = nil // drop the cached directory listing, which is now stale
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	// Resolution errors take precedence over ENOTSUP.
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return nil, err
+	}
+	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
+	// but not any other xattr syscalls. For now we just reject all of them.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	// Resolution errors take precedence; otherwise xattrs are unsupported.
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	// Resolution errors take precedence; otherwise xattrs are unsupported.
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(&ds)
+	// Resolution errors take precedence; otherwise xattrs are unsupported.
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath by delegating to genericPrependPath with vd's mount and overlay dentry.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock() // held for reading for the whole call; presumably because the helper reads dentry parent/name links — TODO confirm
+	defer fs.renameMu.RUnlock()
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
new file mode 100644
index 000000000..c0749e711
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -0,0 +1,266 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// isSymlink reports whether d's cached file mode, read atomically, has file
+// type S_IFLNK.
+func (d *dentry) isSymlink() bool { return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK }
+
+// readlink returns the result of ReadlinkAt on d's topmost layer, performed
+// with the filesystem's own credentials.
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	top := d.topLayer()
+	layerPop := vfs.PathOperation{Root: top, Start: top}
+	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &layerPop)
+}
+
+// nonDirectoryFD forwards file description operations to cachedFD, an FD opened on one of the dentry's layers.
+type nonDirectoryFD struct {
+	fileDescription
+
+	// cachedFD represents fileDescription.dentry().lowerVDs[0] while copiedUp
+	// is false, and fileDescription.dentry().upperVD once copiedUp is true.
+	// cachedFlags is the last known value of cachedFD.StatusFlags(). copiedUp,
+	// cachedFD, and cachedFlags are protected by mu.
+	mu          sync.Mutex
+	copiedUp    bool
+	cachedFD    *vfs.FileDescription
+	cachedFlags uint32
+}
+
+// getCurrentFD locks fd.mu and returns currentFDLocked's FD with an extra reference that the caller must drop.
+func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	cur, err := fd.currentFDLocked(ctx)
+	if err == nil {
+		cur.IncRef()
+	}
+	return cur, err
+}
+
+// currentFDLocked returns fd.cachedFD after bringing it up to date: it
+// switches to an FD on the upper layer once the dentry has been copied up,
+// and otherwise propagates status flag changes. No reference is taken.
+//
+// Preconditions: fd.mu must be locked.
+func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+	d := fd.dentry()
+	statusFlags := fd.vfsfd.StatusFlags()
+	switch {
+	case !fd.copiedUp && d.isCopiedUp():
+		// Switch to the copied-up file.
+		upperVD := d.topLayer()
+		upperPop := vfs.PathOperation{Root: upperVD, Start: upperVD}
+		upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &upperPop, &vfs.OpenOptions{Flags: statusFlags})
+		if err != nil {
+			return nil, err
+		}
+		// Carry the file offset over if the old FD can report one.
+		if oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR); oldOffErr == nil {
+			if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
+				upperFD.DecRef()
+				return nil, err
+			}
+		}
+		fd.cachedFD.DecRef()
+		fd.copiedUp, fd.cachedFD, fd.cachedFlags = true, upperFD, statusFlags
+	case fd.cachedFlags != statusFlags:
+		if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
+			return nil, err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	return fd.cachedFD, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It drops a reference on cachedFD and clears the field.
+func (fd *nonDirectoryFD) Release() {
+	fd.cachedFD.DecRef()
+	fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose. It syncs status flags to cachedFD and forwards the call to it.
+func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
+	// Linux doesn't define ovl_file_operations.flush at all (its equivalent
+	// to OnClose is a no-op). We pass through to fd.cachedFD.OnClose()
+	// without switching to the copied-up file: OnClose mostly exists for
+	// post-close writeback, and a cachedFD that was never switched can't
+	// have been used to mutate fd.dentry().
+	fd.mu.Lock()
+	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+			fd.mu.Unlock()
+			return err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	wrappedFD := fd.cachedFD
+	wrappedFD.IncRef() // keeps wrappedFD alive once fd.mu is dropped, as Sync does
+	defer wrappedFD.DecRef()
+	fd.mu.Unlock()
+	return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	// Fields outside statInternalMask are requested from the current
+	// wrapped FD; when none are requested, it is not consulted at all.
+	var stat linux.Statx
+	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+		cur, err := fd.getCurrentFD(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+		stat, err = cur.Stat(ctx, vfs.StatOptions{Mask: layerMask, Sync: opts.Sync})
+		cur.DecRef()
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	// The dentry then fills in the fields it tracks itself.
+	fd.dentry().statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. It copies the file up, applies opts through the upper-layer FD, and then updates the dentry's cached attributes.
+func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	d := fd.dentry()
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	mnt := fd.vfsfd.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	wrappedFD, err := fd.getCurrentFD(ctx) // takes fd.mu (which protects cachedFD) and releases it before d.copyMu is taken, per the package lock order
+	if err != nil {
+		return err
+	}
+	defer wrappedFD.DecRef()
+	d.copyMu.Lock() // changes to d's attributes are serialized by d.copyMu
+	defer d.copyMu.Unlock()
+	if err := wrappedFD.SetStat(ctx, opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+//
+// The result comes from the owning filesystem's statFS helper.
+func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(ctx) }
+
+// PRead implements vfs.FileDescriptionImpl.PRead by forwarding to the current wrapped FD.
+func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	layerFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer layerFD.DecRef() // drops the reference taken by getCurrentFD
+	return layerFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read by forwarding to the current wrapped FD.
+func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// fd.mu stays held across the read so that file offset use is serialized.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return layerFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite by forwarding to the current wrapped FD.
+func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	layerFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer layerFD.DecRef() // drops the reference taken by getCurrentFD
+	return layerFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write by forwarding to the current wrapped FD.
+func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// fd.mu stays held across the write so that file offset use is serialized.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return layerFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek by forwarding to the current wrapped FD.
+func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// fd.mu stays held across the seek so that file offset use is serialized.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	layerFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return layerFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. It is a no-op until the dentry has been copied up.
+func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
+	fd.mu.Lock()
+	if copiedUp := fd.dentry().isCopiedUp(); !copiedUp {
+		fd.mu.Unlock()
+		return nil
+	}
+	upperFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		fd.mu.Unlock()
+		return err
+	}
+	upperFD.IncRef() // keeps upperFD alive after fd.mu is dropped
+	defer upperFD.DecRef()
+	fd.mu.Unlock()
+	return upperFD.Sync(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap by forwarding to the current wrapped FD.
+func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	layerFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return err
+	}
+	defer layerFD.DecRef() // drops the reference taken by getCurrentFD
+	return layerFD.ConfigureMMap(ctx, opts)
+}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
new file mode 100644
index 000000000..e720d4825
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -0,0 +1,627 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package overlay provides an overlay filesystem implementation, which
+// synthesizes a filesystem by composing one or more immutable filesystems
+// ("lower layers") with an optional mutable filesystem ("upper layer").
+//
+// Lock order:
+//
+// directoryFD.mu / nonDirectoryFD.mu
+//   filesystem.renameMu
+//     dentry.dirMu
+//       dentry.copyMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
+package overlay
+
+import (
+	"strings"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name, as returned by FilesystemType.Name.
+const Name = "overlay"
+
+// FilesystemType implements vfs.FilesystemType for the overlay filesystem.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name. It returns the package-level Name constant.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
+// FilesystemType.GetFilesystem to specify the overlay's layers directly.
+type FilesystemOptions struct {
+	// Callers passing FilesystemOptions to
+	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
+	// the vfs.Mounts comprising the layers of the overlay filesystem do not
+	// contain submounts.
+
+	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
+	// overlay. Otherwise GetFilesystem requires at least two LowerRoots.
+	UpperRoot vfs.VirtualDentry
+
+	// LowerRoots contains the roots of the immutable lower layers of the
+	// overlay; it must be non-empty. LowerRoots is immutable.
+	LowerRoots []vfs.VirtualDentry
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem // embedded VFS filesystem object; initialized in GetFilesystem
+
+	// opts holds the layer roots of the overlay. opts is immutable.
+	opts FilesystemOptions
+
+	// creds is a copy of the filesystem's creator's credentials, which are
+	// used for accesses to the filesystem's layers. creds is immutable.
+	creds *auth.Credentials
+
+	// dirDevMinor is the device minor number used for directories. dirDevMinor
+	// is immutable.
+	dirDevMinor uint32
+
+	// lowerDevMinors maps lower layer filesystems to device minor numbers
+	// assigned to non-directory files originating from that filesystem.
+	// lowerDevMinors is immutable.
+	lowerDevMinors map[*vfs.Filesystem]uint32
+
+	// renameMu synchronizes renaming with non-renaming operations in order to
+	// ensure consistent lock ordering between dentry.dirMu in different
+	// dentries.
+	renameMu sync.RWMutex
+
+	// lastDirIno is the last inode number assigned to a directory. lastDirIno
+	// is accessed using atomic memory operations (see newDirIno).
+	lastDirIno uint64
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem. The layers come from opts.InternalData if it is a FilesystemOptions, and otherwise from the "upperdir" and "lowerdir" mount options in opts.Data.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	fsoptsRaw := opts.InternalData
+	fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
+	if fsoptsRaw != nil && !haveFSOpts {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+		return nil, nil, syserror.EINVAL
+	}
+	if haveFSOpts {
+		if len(fsopts.LowerRoots) == 0 {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+			return nil, nil, syserror.EINVAL
+		}
+		if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+			return nil, nil, syserror.EINVAL
+		}
+		// We don't enforce a maximum number of lower layers when not
+		// configured by applications; the sandbox owner can have an overlay
+		// filesystem with any number of lower layers.
+	} else {
+		vfsroot := vfs.RootFromContext(ctx)
+		defer vfsroot.DecRef()
+		upperPathname, ok := mopts["upperdir"]
+		if ok {
+			delete(mopts, "upperdir")
+			// Linux overlayfs also requires a workdir when upperdir is
+			// specified; we don't, so silently ignore this option.
+			delete(mopts, "workdir")
+			upperPath := fspath.Parse(upperPathname)
+			if !upperPath.Absolute {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+				return nil, nil, syserror.EINVAL
+			}
+			upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+				Root:               vfsroot,
+				Start:              vfsroot,
+				Path:               upperPath,
+				FollowFinalSymlink: true,
+			}, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+				return nil, nil, err
+			}
+			defer upperRoot.DecRef()
+			privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+				return nil, nil, err
+			}
+			defer privateUpperRoot.DecRef() // runs at function return; the filesystem takes its own references below
+			fsopts.UpperRoot = privateUpperRoot
+		}
+		lowerPathnamesStr, ok := mopts["lowerdir"]
+		if !ok {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+			return nil, nil, syserror.EINVAL
+		}
+		delete(mopts, "lowerdir")
+		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
+		const maxLowerLayers = 500 // Linux: fs/overlayfs/super.c:OVL_MAX_STACK
+		if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+			return nil, nil, syserror.EINVAL
+		}
+		if len(lowerPathnames) > maxLowerLayers {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+			return nil, nil, syserror.EINVAL
+		}
+		for _, lowerPathname := range lowerPathnames {
+			lowerPath := fspath.Parse(lowerPathname)
+			if !lowerPath.Absolute {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+				return nil, nil, syserror.EINVAL
+			}
+			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+				Root:               vfsroot,
+				Start:              vfsroot,
+				Path:               lowerPath,
+				FollowFinalSymlink: true,
+			}, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+				return nil, nil, err
+			}
+			defer lowerRoot.DecRef() // defers in this loop run at function return, not per iteration
+			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+				return nil, nil, err
+			}
+			defer privateLowerRoot.DecRef()
+			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
+		}
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Allocate device numbers: one for directories, plus one per distinct lower layer filesystem.
+	dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	lowerDevMinors := make(map[*vfs.Filesystem]uint32)
+	for _, lowerRoot := range fsopts.LowerRoots {
+		lowerFS := lowerRoot.Mount().Filesystem()
+		if _, ok := lowerDevMinors[lowerFS]; !ok {
+			devMinor, err := vfsObj.GetAnonBlockDevMinor()
+			if err != nil {
+				vfsObj.PutAnonBlockDevMinor(dirDevMinor)
+				for _, lowerDevMinor := range lowerDevMinors {
+					vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+				}
+				return nil, nil, err
+			}
+			lowerDevMinors[lowerFS] = devMinor
+		}
+	}
+
+	// Take extra references held by the filesystem; filesystem.Release drops them.
+	if fsopts.UpperRoot.Ok() {
+		fsopts.UpperRoot.IncRef()
+	}
+	for _, lowerRoot := range fsopts.LowerRoots {
+		lowerRoot.IncRef()
+	}
+
+	fs := &filesystem{
+		opts:           fsopts,
+		creds:          creds.Fork(),
+		dirDevMinor:    dirDevMinor,
+		lowerDevMinors: lowerDevMinors,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry, which takes its own references on each layer root.
+	root := fs.newDentry()
+	root.refs = 1
+	if fs.opts.UpperRoot.Ok() {
+		fs.opts.UpperRoot.IncRef()
+		root.copiedUp = 1
+		root.upperVD = fs.opts.UpperRoot
+	}
+	for _, lowerRoot := range fs.opts.LowerRoots {
+		lowerRoot.IncRef()
+		root.lowerVDs = append(root.lowerVDs, lowerRoot)
+	}
+	rootTopVD := root.topLayer()
+	// Get metadata from the topmost layer. See fs.lookupLocked().
+	const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  rootTopVD,
+		Start: rootTopVD,
+	}, &vfs.StatOptions{
+		Mask: rootStatMask,
+	})
+	if err != nil {
+		root.destroyLocked()
+		fs.vfsfs.DecRef()
+		return nil, nil, err
+	}
+	if rootStat.Mask&rootStatMask != rootStatMask {
+		root.destroyLocked()
+		fs.vfsfs.DecRef()
+		return nil, nil, syserror.EREMOTE
+	}
+	if isWhiteout(&rootStat) {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+		root.destroyLocked()
+		fs.vfsfs.DecRef()
+		return nil, nil, syserror.EINVAL
+	}
+	root.mode = uint32(rootStat.Mode)
+	root.uid = rootStat.UID
+	root.gid = rootStat.GID
+	if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
+		root.devMajor = linux.UNNAMED_MAJOR
+		root.devMinor = fs.dirDevMinor
+		root.ino = fs.newDirIno()
+	} else if !root.upperVD.Ok() {
+		root.devMajor = linux.UNNAMED_MAJOR
+		root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()]
+		root.ino = rootStat.Ino
+	} else {
+		root.devMajor = rootStat.DevMajor
+		root.devMinor = rootStat.DevMinor
+		root.ino = rootStat.Ino
+	}
+
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// clonePrivateMount returns the root of a new non-recursive bind mount of
+// vd that is not associated with any MountNamespace. The new mount uses the
+// options of vd's mount, made read-only if forceReadOnly is set.
+//
+// This ensures that each overlay layer comprises a single mount, so a walk
+// within a layer can't cross into e.g. the overlay filesystem itself and risk
+// lock recursion. A reference is held on the returned VirtualDentry.
+func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
+	srcMnt := vd.Mount()
+	mntOpts := srcMnt.Options()
+	mntOpts.ReadOnly = mntOpts.ReadOnly || forceReadOnly
+	root := vd.Dentry()
+	privMnt, err := vfsObj.NewDisconnectedMount(srcMnt.Filesystem(), root, &mntOpts)
+	if err != nil {
+		return vfs.VirtualDentry{}, err
+	}
+	return vfs.MakeVirtualDentry(privMnt, root), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release. It returns the device minors and drops the layer root references.
+func (fs *filesystem) Release() {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
+	for _, minor := range fs.lowerDevMinors {
+		vfsObj.PutAnonBlockDevMinor(minor)
+	}
+	if upper := fs.opts.UpperRoot; upper.Ok() {
+		upper.DecRef()
+	}
+	for i := range fs.opts.LowerRoots {
+		fs.opts.LowerRoots[i].DecRef()
+	}
+}
+
+func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
+	// Always statfs the root of the topmost layer. Compare Linux's
+	// fs/overlayfs/super.c:ovl_statfs().
+	var rootVD vfs.VirtualDentry
+	if fs.opts.UpperRoot.Ok() {
+		rootVD = fs.opts.UpperRoot
+	} else {
+		rootVD = fs.opts.LowerRoots[0]
+	}
+	fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  rootVD,
+		Start: rootVD,
+	})
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
+	return fsstat, nil
+}
+
+// newDirIno atomically increments fs.lastDirIno and returns the new value,
+// which is used as a directory's inode number.
+func (fs *filesystem) newDirIno() uint64 { return atomic.AddUint64(&fs.lastDirIno, 1) }
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry // embedded VFS dentry; &vfsd is what is handed to the VFS
+
+	refs int64 // reference count; GetFilesystem starts the root at 1 — NOTE(review): confirm access discipline against IncRef/DecRef
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// mode, uid, and gid are the file mode, owner, and group of the file in
+	// the topmost layer (and therefore the overlay file as well), and are used
+	// for permission checks on this dentry. These fields are protected by
+	// copyMu and accessed using atomic memory operations.
+	mode uint32
+	uid  uint32
+	gid  uint32
+
+	// copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
+	// 0 otherwise. copiedUp is accessed using atomic memory operations.
+	copiedUp uint32
+
+	// parent is the dentry corresponding to this dentry's parent directory.
+	// name is this dentry's name in parent. If this dentry is a filesystem
+	// root, parent is nil and name is the empty string. parent and name are
+	// protected by fs.renameMu.
+	parent *dentry
+	name   string
+
+	// If this dentry represents a directory, children maps the names of
+	// children for which dentries have been instantiated to those dentries,
+	// and dirents (if not nil) is a cache of dirents as returned by
+	// directoryFDs representing this directory. children is protected by
+	// dirMu. RmdirAt and UnlinkAt reset dirents to nil after removing an entry.
+	dirMu    sync.Mutex
+	children map[string]*dentry
+	dirents  []vfs.Dirent
+
+	// upperVD and lowerVDs are the files from the overlay filesystem's layers
+	// that comprise the file on the overlay filesystem.
+	//
+	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
+	// be copied up) with copyMu locked for writing; otherwise, it is
+	// immutable. lowerVDs is always immutable.
+	copyMu   sync.RWMutex
+	upperVD  vfs.VirtualDentry
+	lowerVDs []vfs.VirtualDentry
+
+	// inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
+	// len(inlineLowerVDs).
+	inlineLowerVDs [1]vfs.VirtualDentry
+
+	// devMajor, devMinor, and ino are the device major/minor and inode numbers
+	// used by this dentry. These fields are protected by copyMu and accessed
+	// using atomic memory operations.
+	devMajor uint32
+	devMinor uint32
+	ino      uint64
+
+	locks vfs.FileLocks // presumably the file lock state for this file — TODO confirm against the FD lock methods
+}
+
+// newDentry allocates a dentry owned by fs. The new dentry has no references;
+// the caller must set its reference count and/or destroy it as appropriate.
+// It also has no layers yet, and so is invalid until the caller sets them.
+// The returned dentry's vfsd is already initialized.
+func (fs *filesystem) newDentry() *dentry {
+	d := &dentry{fs: fs}
+	// Start lowerVDs as an empty slice over inlineLowerVDs, so that the inline
+	// array backs it in the common case.
+	d.lowerVDs = d.inlineLowerVDs[:0]
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef. It unconditionally adds one reference to d.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 here if d.fs.renameMu is locked, which serializes
+	// against d.checkDropLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef. It takes a reference and
+// returns true only if d.refs is positive; it returns false if d.refs is zero
+// or negative.
+func (d *dentry) TryIncRef() bool {
+	// Retry the compare-and-swap until it succeeds or d.refs stops being positive.
+	for cur := atomic.LoadInt64(&d.refs); cur > 0; cur = atomic.LoadInt64(&d.refs) {
+		if atomic.CompareAndSwapInt64(&d.refs, cur, cur+1) {
+			return true
+		}
+	}
+	return false
+}
+
+// DecRef implements vfs.DentryImpl.DecRef. Dropping the last reference runs checkDropLocked under renameMu.
+func (d *dentry) DecRef() {
+	if left := atomic.AddInt64(&d.refs, -1); left < 0 {
+		panic("overlay.dentry.DecRef() called without holding a reference")
+	} else if left == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked()
+		d.fs.renameMu.Unlock()
+	}
+}
+
+// checkDropLocked destroys d if it no longer has any references. It should be
+// called after d's reference count becomes 0 or it becomes deleted; it does
+// nothing if d's reference count is nonzero, i.e. if d is still referenced or
+// has already been destroyed.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkDropLocked() {
+	// Dentries with a positive reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.) Dentries with a
+	// negative reference count have already been destroyed.
+	if atomic.LoadInt64(&d.refs) == 0 {
+		// Refs is still zero; destroy it.
+		d.destroyLocked()
+	}
+}
+
+// destroyLocked destroys the dentry, dropping its references on its layers and on its parent.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked() {
+	switch refs := atomic.LoadInt64(&d.refs); {
+	case refs == -1:
+		panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
+	case refs != 0:
+		panic("overlay.dentry.destroyLocked() called with references on the dentry")
+	}
+	atomic.StoreInt64(&d.refs, -1) // mark the dentry destroyed
+
+	if d.upperVD.Ok() {
+		d.upperVD.DecRef()
+	}
+	for _, layerVD := range d.lowerVDs {
+		layerVD.DecRef()
+	}
+
+	parent := d.parent
+	if parent == nil {
+		return
+	}
+	parent.dirMu.Lock()
+	if !d.vfsd.IsDead() {
+		delete(parent.children, d.name)
+	}
+	parent.dirMu.Unlock()
+	// Drop the reference held by d on its parent without recursively
+	// locking d.fs.renameMu.
+	if left := atomic.AddInt64(&parent.refs, -1); left == 0 {
+		parent.checkDropLocked()
+	} else if left < 0 {
+		panic("overlay.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. It is currently a no-op.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {
+	// TODO(gvisor.dev/issue/1479): Implement inotify.
+}
+
+// Watches implements vfs.DentryImpl.Watches. It currently always returns nil.
+func (d *dentry) Watches() *vfs.Watches {
+	// TODO(gvisor.dev/issue/1479): Implement inotify.
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. It is currently a no-op.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) OnZeroWatches() {}
+
+// iterLayers invokes yield on each layer comprising d, from top to bottom:
+// first the upper layer, if d has been copied up, and then each lower layer in
+// order. isUpper is true only for the upper layer. If any call to yield
+// returns false, iterLayers stops iteration.
+func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
+	if d.isCopiedUp() && !yield(d.upperVD, true) {
+		return
+	}
+	for _, layerVD := range d.lowerVDs {
+		if !yield(layerVD, false) {
+			return
+		}
+	}
+}
+
+func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
+	if d.isCopiedUp() {
+		return d.upperVD, true // copied up: the upper layer is the topmost layer
+	}
+	return d.lowerVDs[0], false // otherwise the first lower layer is the topmost layer
+}
+
+func (d *dentry) topLayer() vfs.VirtualDentry {
+	vd, _ := d.topLayerInfo() // same as topLayerInfo, discarding isUpper
+	return vd
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) // checks against d's own mode, uid, and gid, each loaded atomically
+}
+
+// statInternalMask is the set of stat mask bits that dentry.statInternalTo()
+// adds to Statx.Mask.
+const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+
+// statInternalTo writes fields to stat that are stored in d, and therefore do
+// not require invoking StatAt on the overlay's layers.
+func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
+	stat.Mask |= statInternalMask
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Ino = atomic.LoadUint64(&d.ino)
+	stat.DevMajor = atomic.LoadUint32(&d.devMajor)
+	stat.DevMinor = atomic.LoadUint32(&d.devMinor)
+	if d.isDir() {
+		// Linux sets nlink to 1 for merged directories
+		// (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
+		// correct more often ("." and the directory's entry in its parent),
+		// and some of our tests expect this.
+		stat.Nlink = 2
+	}
+}
+
+// updateAfterSetStatLocked copies the mode, uid, and gid selected by opts.Stat.Mask into d. Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
+	if opts.Stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT)) // keep d's file type bits; take every other mode bit from opts
+	}
+	if opts.Stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, opts.Stat.UID)
+	}
+	if opts.Stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, opts.Stat.GID)
+	}
+}
+
+// fileDescription is embedded by overlay implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription // its Mount() and Dentry() are what the filesystem() and dentry() accessors resolve
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD // NOTE(review): presumably the source of Locks(), which LockPOSIX and UnlockPOSIX call - confirm
+}
+
+// filesystem returns the overlay filesystem of the mount that fd was opened
+// on. It panics if that mount's filesystem implementation is not *filesystem.
+func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) }
+
+// dentry returns the overlay dentry that fd was opened on. It panics if the
+// underlying dentry implementation is not *dentry.
+func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX by forwarding to fd.Locks() with fd's vfs.FileDescription.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by forwarding to fd.Locks() with fd's vfs.FileDescription.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index cab771211..811f80a5f 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -81,7 +81,8 @@ type inode struct {
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoopRefCount
 
-	pipe *pipe.VFSPipe
+	locks vfs.FileLocks
+	pipe  *pipe.VFSPipe
 
 	ino uint64
 	uid auth.KUID
@@ -114,7 +115,7 @@ func (i *inode) Mode() linux.FileMode {
 }
 
 // Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
 	return linux.Statx{
 		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
@@ -147,7 +148,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 
 // Open implements kernfs.Inode.Open.
 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
+	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
 }
 
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 17c1342b5..6014138ff 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -22,6 +22,7 @@ go_library(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 36a911db4..79c2725f3 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -37,6 +37,8 @@ type subtasksInode struct {
 	kernfs.OrderedChildren
 	kernfs.AlwaysValid
 
+	locks vfs.FileLocks
+
 	fs                *filesystem
 	task              *kernel.Task
 	pidns             *kernel.PIDNamespace
@@ -126,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac
 	return fd.GenericDirectoryFD.IterDirents(ctx, cb)
 }
 
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	if fd.task.ExitState() >= kernel.TaskExitZombie {
 		return 0, syserror.ENOENT
@@ -153,7 +155,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 // Open implements kernfs.Inode.
 func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
-	if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
 		return nil, err
 	}
 	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
@@ -163,8 +165,8 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v
 }
 
 // Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 482055db1..a5c7aa470 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -38,6 +38,8 @@ type taskInode struct {
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
 
+	locks vfs.FileLocks
+
 	task *kernel.Task
 }
 
@@ -103,7 +105,7 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 
 // Open implements kernfs.Inode.
 func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -154,8 +156,8 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
 }
 
 // Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	stat, err := i.Inode.Stat(fs, opts)
+func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.Inode.Stat(ctx, fs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 44ccc9e4a..fea29e5f0 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -53,6 +53,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool {
 }
 
 type fdDir struct {
+	locks vfs.FileLocks
+
 	fs   *filesystem
 	task *kernel.Task
 
@@ -62,7 +64,7 @@ type fdDir struct {
 }
 
 // IterDirents implements kernfs.inodeDynamicLookup.
-func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) {
+func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	var fds []int32
 	i.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
@@ -70,7 +72,6 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs
 		}
 	})
 
-	offset := absOffset + relOffset
 	typ := uint8(linux.DT_REG)
 	if i.produceSymlink {
 		typ = linux.DT_LNK
@@ -143,7 +144,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 
 // Open implements kernfs.Inode.
 func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -270,7 +271,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 
 // Open implements kernfs.Inode.
 func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 2f297e48a..859b7d727 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -34,6 +35,10 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// "There is an (arbitrary) limit on the number of lines in the file. As at
+// Linux 3.18, the limit is five lines." - user_namespaces(7)
+const maxIDMapLines = 5 // idMapData.Write rejects input with more lines than this
+
 // mm gets the kernel task's MemoryManager. No additional reference is taken on
 // mm here. This is safe because MemoryManager.destroy is required to leave the
 // MemoryManager in a state where it's still usable as a DynamicBytesSource.
@@ -226,8 +231,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 		// Linux will return envp up to and including the first NULL character,
 		// so find it.
-		if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 {
-			buf.Truncate(end)
+		envStart := int(ar.Length())
+		if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
+			buf.Truncate(envStart + nullIdx)
 		}
 	}
 
@@ -282,7 +288,8 @@ func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}.
+// idMapData implements vfs.WritableDynamicBytesSource for
+// /proc/[pid]/{gid_map|uid_map}.
 //
 // +stateify savable
 type idMapData struct {
@@ -294,7 +301,7 @@ type idMapData struct {
 
 var _ dynamicInode = (*idMapData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.Generate.
+// Generate implements vfs.WritableDynamicBytesSource.Generate.
 func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	var entries []auth.IDMapEntry
 	if d.gids {
@@ -308,6 +315,60 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
+// Write implements vfs.WritableDynamicBytesSource.Write. It parses up to
+// maxIDMapLines ID map entries from src and installs them as the GID or UID
+// map (depending on d.gids) of d.task's user namespace.
+func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	// "In addition, the number of bytes written to the file must be less than
+	// the system page size, and the write must be performed at the start of
+	// the file ..." - user_namespaces(7)
+	count := src.NumBytes()
+	if offset != 0 || count >= usermem.PageSize {
+		return 0, syserror.EINVAL
+	}
+	data := make([]byte, count)
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+
+	// Truncate from the first NULL byte.
+	if nul := bytes.IndexByte(data, 0); nul != -1 {
+		data = data[:nul]
+	}
+	// Remove the last \n.
+	data = bytes.TrimSuffix(data, []byte("\n"))
+	// Split into at most maxIDMapLines+1 pieces, which is enough to detect
+	// input that has too many lines.
+	lines := bytes.SplitN(data, []byte("\n"), maxIDMapLines+1)
+	if len(lines) > maxIDMapLines {
+		return 0, syserror.EINVAL
+	}
+
+	entries := make([]auth.IDMapEntry, len(lines))
+	// Each line supplies, in order, an entry's first ID, first parent ID, and
+	// length; a line that fails to scan rejects the whole write.
+	for i, line := range lines {
+		e := &entries[i]
+		if _, err := fmt.Sscan(string(line), &e.FirstID, &e.FirstParentID, &e.Length); err != nil {
+			return 0, syserror.EINVAL
+		}
+	}
+	userns := d.task.UserNamespace()
+	var err error
+	if d.gids {
+		err = userns.SetGIDMap(ctx, entries)
+	} else {
+		err = userns.SetUIDMap(ctx, entries)
+	}
+	if err != nil {
+		return 0, err
+	}
+
+	// On success, Linux's kernel/user_namespace.c:map_write() always returns
+	// count, even if fewer bytes were used.
+	return count, nil
+}
+
 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
 //
 // +stateify savable
@@ -775,6 +836,8 @@ type namespaceInode struct {
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
 }
 
 var _ kernfs.Inode = (*namespaceInode)(nil)
@@ -791,6 +854,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &namespaceFD{inode: i}
 	i.IncRef()
+	fd.LockFD.Init(&i.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
@@ -801,6 +865,7 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
 // /proc/[pid]/ns/*.
 type namespaceFD struct {
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	vfsfd vfs.FileDescription
 	inode *namespaceInode
@@ -811,7 +876,7 @@ var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
 // Stat implements FileDescriptionImpl.
 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(vfs, opts)
+	return fd.inode.Stat(ctx, vfs, opts)
 }
 
 // SetStat implements FileDescriptionImpl.
@@ -826,7 +891,12 @@ func (fd *namespaceFD) Release() {
 	fd.inode.DecRef()
 }
 
-// OnClose implements FileDescriptionImpl.
-func (*namespaceFD) OnClose(context.Context) error {
-	return nil
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX by forwarding to fd.Locks() with fd's vfs.FileDescription.
+func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by forwarding to fd.Locks() with fd's vfs.FileDescription.
+func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b51d43954..6d2b90a8b 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -43,6 +43,8 @@ type tasksInode struct {
 	kernfs.OrderedChildren
 	kernfs.AlwaysValid
 
+	locks vfs.FileLocks
+
 	fs    *filesystem
 	pidns *kernel.PIDNamespace
 
@@ -197,15 +199,15 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 
 // Open implements kernfs.Inode.
 func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
 	if err != nil {
 		return linux.Statx{}, err
 	}
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index d29ef3f83..242ba9b5d 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -31,6 +31,7 @@ type SignalFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// target is the original signal target task.
 	//
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index a741e2bb6..1b548ccd4 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -29,6 +29,6 @@ go_test(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
-        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0af373604..01ce30a4d 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -98,8 +98,10 @@ type dir struct {
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
-
 	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
 	dentry kernfs.Dentry
 }
 
@@ -121,7 +123,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
 
 // Open implements kernfs.Inode.Open.
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -136,7 +138,7 @@ type cpuFile struct {
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "0-%d", c.maxCores-1)
+	fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 4b3602d47..242d5fd12 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) {
 	k := kernel.KernelFromContext(s.Ctx)
 	maxCPUCores := k.ApplicationCores()
 
-	expected := fmt.Sprintf("0-%d", maxCPUCores-1)
+	expected := fmt.Sprintf("0-%d\n", maxCPUCores-1)
 
 	for _, fname := range []string{"online", "possible", "present"} {
 		pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 0e4053a46..400a97996 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -32,6 +32,6 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/usermem",
-        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index c16a36cdb..e743e8114 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("creating platform: %v", err)
 	}
 
+	kernel.VFS2Enabled = true
 	k := &kernel.Kernel{
 		Platform: plat,
 	}
@@ -73,7 +74,7 @@ func Boot() (*kernel.Kernel, error) {
 	k.SetMemoryFile(mf)
 
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
-	kernel.VFS2Enabled = true
-
-	if err := k.VFS().Init(); err != nil {
-		return nil, fmt.Errorf("VFS init: %v", err)
-	}
 	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 		AllowUserList:  true,
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 60c92d626..2dc90d484 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -32,6 +32,7 @@ type TimerFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	events waiter.Queue
 	timer  *ktime.Timer
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 007be1572..e73732a6b 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -59,9 +59,9 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
-        "//pkg/sentry/vfs/lock",
         "//pkg/sentry/vfs/memxattr",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 83bf885ee..ac54d420d 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -29,7 +29,7 @@ type deviceFile struct {
 	minor uint32
 }
 
-func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
 	file := &deviceFile{
 		kind:  kind,
 		major: major,
@@ -43,7 +43,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
 	default:
 		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
 	}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index f2399981b..0a1ad4765 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -48,9 +48,9 @@ type directory struct {
 	childList dentryList
 }
 
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
+	dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
 	dir.dentry.inode = &dir.inode
 	dir.dentry.vfsd.Init(&dir.dentry)
@@ -81,6 +81,10 @@ func (dir *directory) removeChildLocked(child *dentry) {
 	dir.iterMu.Unlock()
 }
 
+func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
+	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) // sticky check on dir's mode and child's owner UID, both loaded atomically
+}
+
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
@@ -106,6 +110,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	fs := fd.filesystem()
 	dir := fd.inode().impl.(*directory)
 
+	defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+
 	// fs.mu is required to read d.parent and dentry.name.
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 80fa7b29d..ef210a69b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -79,7 +79,7 @@ afterSymlink:
 	}
 	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// Symlink traversal updates access time.
-		atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds())
+		child.inode.touchAtime(rp.Mount())
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
 		}
@@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if err := create(parentDir, name); err != nil {
 		return err
 	}
+
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 	parentDir.inode.touchCMtime()
 	return nil
 }
@@ -231,17 +237,22 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			return syserror.EXDEV
 		}
 		d := vd.Dentry().Impl().(*dentry)
-		if d.inode.isDir() {
+		i := d.inode
+		if i.isDir() {
 			return syserror.EPERM
 		}
-		if d.inode.nlink == 0 {
+		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+			return err
+		}
+		if i.nlink == 0 {
 			return syserror.ENOENT
 		}
-		if d.inode.nlink == maxLinks {
+		if i.nlink == maxLinks {
 			return syserror.EMLINK
 		}
-		d.inode.incLinksLocked()
-		parentDir.insertChildLocked(fs.newDentry(d.inode), name)
+		i.incLinksLocked()
+		i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
+		parentDir.insertChildLocked(fs.newDentry(i), name)
 		return nil
 	})
 }
@@ -249,11 +260,12 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		if parentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
 		parentDir.inode.incLinksLocked() // from child's ".."
-		childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		parentDir.insertChildLocked(&childDir.dentry, name)
 		return nil
 	})
@@ -262,18 +274,19 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		var childInode *inode
 		switch opts.Mode.FileType() {
-		case 0, linux.S_IFREG:
-			childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
+		case linux.S_IFREG:
+			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFIFO:
-			childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFBLK:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFCHR:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFSOCK:
-			childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
+			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
 		default:
 			return syserror.EINVAL
 		}
@@ -348,15 +361,20 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
 		parentDir.insertChildLocked(child, name)
 		fd, err := child.open(ctx, rp, &opts, true)
 		if err != nil {
 			return nil, err
 		}
+		parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
 		parentDir.inode.touchCMtime()
 		return fd, nil
 	}
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
 	// Is the file mounted over?
 	if err := rp.CheckMount(&child.vfsd); err != nil {
 		return nil, err
@@ -364,7 +382,7 @@ afterTrailingSymlink:
 	// Do we need to resolve a trailing symlink?
 	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// Symlink traversal updates access time.
-		atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds())
+		child.inode.touchAtime(rp.Mount())
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
 		}
@@ -388,10 +406,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		fd.LockFD.Init(&d.inode.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
 			return nil, err
 		}
-		if opts.Flags&linux.O_TRUNC != 0 {
+		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
 			if _, err := impl.truncate(0); err != nil {
 				return nil, err
 			}
@@ -403,15 +422,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		fd.LockFD.Init(&d.inode.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
 	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
+		// TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
 	case *socketFile:
@@ -472,6 +492,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if !ok {
 		return syserror.ENOENT
 	}
+	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
+		return err
+	}
 	// Note that we don't need to call rp.CheckMount(), since if renamed is a
 	// mount point then we want to rename the mount point, not anything in the
 	// mounted filesystem.
@@ -559,6 +582,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		newParentDir.inode.touchCMtime()
 	}
 	renamed.inode.touchCtime()
+
+	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
 	return nil
 }
 
@@ -584,6 +609,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if !ok {
 		return syserror.ENOENT
 	}
+	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+		return err
+	}
 	childDir, ok := child.inode.impl.(*directory)
 	if !ok {
 		return syserror.ENOTDIR
@@ -603,8 +631,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	parentDir.removeChildLocked(child)
-	parentDir.inode.decLinksLocked() // from child's ".."
+	parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+	// Remove links for child, child/., and child/..
+	child.inode.decLinksLocked()
 	child.inode.decLinksLocked()
+	parentDir.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(&child.vfsd)
 	parentDir.inode.touchCMtime()
 	return nil
@@ -613,12 +644,21 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	d, err := resolveLocked(rp)
 	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
+		fs.mu.RUnlock()
 		return err
 	}
-	return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
+	fs.mu.RUnlock()
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -656,7 +696,8 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
-		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
 		parentDir.insertChildLocked(child, name)
 		return nil
 	})
@@ -681,6 +722,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if !ok {
 		return syserror.ENOENT
 	}
+	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+		return err
+	}
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
@@ -698,6 +742,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
+
+	// Generate inotify events. Note that this must take place before the link
+	// count of the child is decremented, or else the watches may be dropped
+	// before these events are added.
+	vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+
 	parentDir.removeChildLocked(child)
 	child.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(&child.vfsd)
@@ -749,23 +799,37 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	d, err := resolveLocked(rp)
 	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+		fs.mu.RUnlock()
 		return err
 	}
-	return d.inode.setxattr(rp.Credentials(), &opts)
+	fs.mu.RUnlock()
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	d, err := resolveLocked(rp)
 	if err != nil {
+		fs.mu.RUnlock()
 		return err
 	}
-	return d.inode.removexattr(rp.Credentials(), name)
+	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	fs.mu.RUnlock()
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 8d77b3fa8..739350cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -30,9 +30,9 @@ type namedPipe struct {
 // Preconditions:
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 3f433d666..abbaa5d60 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -85,12 +84,12 @@ type regularFile struct {
 	size uint64
 }
 
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &regularFile{
 		memFile: fs.memFile,
 		seals:   linux.F_SEAL_SEAL,
 	}
-	file.inode.init(file, fs, creds, linux.S_IFREG|mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
@@ -275,11 +274,35 @@ func (fd *regularFileFD) Release() {
 	// noop
 }
 
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	f := fd.inode().impl.(*regularFile)
+
+	f.inode.mu.Lock()
+	defer f.inode.mu.Unlock()
+	oldSize := f.size
+	size := offset + length
+	if oldSize >= size {
+		return nil
+	}
+	_, err := f.truncateLocked(size)
+	return err
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
+
+	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+	// all state is in-memory.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
 	if dst.NumBytes() == 0 {
 		return 0, nil
 	}
@@ -302,40 +325,60 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
 	if offset < 0 {
-		return 0, syserror.EINVAL
+		return 0, offset, syserror.EINVAL
 	}
+
+	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+	// all state is in-memory.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
 	srclen := src.NumBytes()
 	if srclen == 0 {
-		return 0, nil
+		return 0, offset, nil
 	}
 	f := fd.inode().impl.(*regularFile)
+	f.inode.mu.Lock()
+	defer f.inode.mu.Unlock()
+	// If the file is opened with O_APPEND, update offset to file size.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking f.inode.mu is sufficient for reading f.size.
+		offset = int64(f.size)
+	}
 	if end := offset + srclen; end < offset {
 		// Overflow.
-		return 0, syserror.EFBIG
+		return 0, offset, syserror.EINVAL
 	}
 
-	var err error
 	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
 	if err != nil {
-		return 0, err
+		return 0, offset, err
 	}
 	src = src.TakeFirst64(srclen)
 
-	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
-	fd.inode().touchCMtimeLocked()
-	f.inode.mu.Unlock()
+	f.inode.touchCMtimeLocked()
 	putRegularFileReadWriter(rw)
-	return n, err
+	return n, n + offset, err
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	fd.offMu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
 	fd.offMu.Unlock()
 	return n, err
 }
@@ -361,33 +404,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	return offset, nil
 }
 
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return nil
-}
-
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
-func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
-	return fd.inode().lockBSD(uid, t, block)
-}
-
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
-func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
-	fd.inode().unlockBSD(uid)
-	return nil
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
-	return fd.inode().lockPOSIX(uid, t, rng, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
-	fd.inode().unlockPOSIX(uid, rng)
-	return nil
-}
-
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
 func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	file := fd.inode().impl.(*regularFile)
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 64e1c40ad..146c7fdfe 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -138,48 +138,37 @@ func TestLocks(t *testing.T) {
 	}
 	defer cleanup()
 
-	var (
-		uid1 lock.UniqueID
-		uid2 lock.UniqueID
-		// Non-blocking.
-		block lock.Blocker
-	)
-
-	uid1 = 123
-	uid2 = 456
-
-	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil {
+	uid1 := 123
+	uid2 := 456
+	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil {
 		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
 	}
-	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil {
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil {
 		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
 	}
-	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want {
+	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want {
 		t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
 	}
 	if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
 		t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
 	}
-	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil {
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil {
 		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
 	}
 
-	rng1 := lock.LockRange{0, 1}
-	rng2 := lock.LockRange{1, 2}
-
-	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil {
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil {
 		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
 	}
-	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil {
+	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil {
 		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
 	}
-	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil {
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil {
 		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
 	}
-	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want {
+	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want {
 		t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
 	}
-	if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil {
+	if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil {
 		t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
 	}
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 25c2321af..3ed650474 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -26,9 +26,9 @@ type socketFile struct {
 	ep    transport.BoundEndpoint
 }
 
-func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
 	file := &socketFile{ep: ep}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 47e075ed4..b0de5fabe 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -24,11 +24,11 @@ type symlink struct {
 	target string // immutable
 }
 
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
+	link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 1e781aecd..2545d88e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,6 +30,7 @@ package tmpfs
 import (
 	"fmt"
 	"math"
+	"strconv"
 	"strings"
 	"sync/atomic"
 
@@ -40,7 +41,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -112,6 +112,58 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	rootMode := linux.FileMode(0777)
+	if rootFileType == linux.S_IFDIR {
+		rootMode = 01777
+	}
+	modeStr, ok := mopts["mode"]
+	if ok {
+		delete(mopts, "mode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		rootMode = linux.FileMode(mode & 07777)
+	}
+	rootKUID := creds.EffectiveKUID
+	uidStr, ok := mopts["uid"]
+	if ok {
+		delete(mopts, "uid")
+		uid, err := strconv.ParseUint(uidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+		if !kuid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKUID = kuid
+	}
+	rootKGID := creds.EffectiveKGID
+	gidStr, ok := mopts["gid"]
+	if ok {
+		delete(mopts, "gid")
+		gid, err := strconv.ParseUint(gidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+		if !kgid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKGID = kgid
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
@@ -127,11 +179,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	var root *dentry
 	switch rootFileType {
 	case linux.S_IFREG:
-		root = fs.newDentry(fs.newRegularFile(creds, 0777))
+		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
 	case linux.S_IFLNK:
-		root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
+		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
 	case linux.S_IFDIR:
-		root = &fs.newDirectory(creds, 01777).dentry
+		root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
 	default:
 		fs.vfsfs.DecRef()
 		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
@@ -201,6 +253,33 @@ func (d *dentry) DecRef() {
 	d.inode.decRef()
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {
+	if d.inode.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+	// that d was deleted.
+	deleted := d.vfsd.IsDead()
+
+	d.inode.fs.mu.RLock()
+	// The ordering below is important; Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted)
+	}
+	d.inode.watches.Notify("", events, cookie, et, deleted)
+	d.inode.fs.mu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.inode.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches() {}
+
 // inode represents a filesystem object.
 type inode struct {
 	// fs is the owning filesystem. fs is immutable.
@@ -209,11 +288,9 @@ type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory
 	// operations.
 	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
+	// A reference is held on all inodes as long as they are reachable in the
+	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+	// nlink reaches 0.
 	refs int64
 
 	// xattrs implements extended attributes.
@@ -235,23 +312,25 @@ type inode struct {
 	ctime int64 // nanoseconds
 	mtime int64 // nanoseconds
 
-	// Advisory file locks, which lock at the inode level.
-	locks lock.FileLocks
+	locks vfs.FileLocks
+
+	// Inotify watches for this inode.
+	watches vfs.Watches
 
 	impl interface{} // immutable
 }
 
 const maxLinks = math.MaxUint32
 
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
 	if mode.FileType() == 0 {
 		panic("file type is required in FileMode")
 	}
 	i.fs = fs
 	i.refs = 1
 	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
+	i.uid = uint32(kuid)
+	i.gid = uint32(kgid)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
 	// Tmpfs creation sets atime, ctime, and mtime to current time.
 	now := fs.clock.Now().Nanoseconds()
@@ -276,14 +355,17 @@ func (i *inode) incLinksLocked() {
 	atomic.AddUint32(&i.nlink, 1)
 }
 
-// decLinksLocked decrements i's link count.
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
 //
 // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
 	}
-	atomic.AddUint32(&i.nlink, ^uint32(0))
+	if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+		i.decRef()
+	}
 }
 
 func (i *inode) incRef() {
@@ -306,6 +388,7 @@ func (i *inode) tryIncRef() bool {
 
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		i.watches.HandleDeletion()
 		if regFile, ok := i.impl.(*regularFile); ok {
 			// Release memory used by regFile to store data. Since regFile is
 			// no longer usable, we don't need to grab any locks or update any
@@ -369,7 +452,8 @@ func (i *inode) statTo(stat *linux.Statx) {
 	}
 }
 
-func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error {
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
+	stat := &opts.Stat
 	if stat.Mask == 0 {
 		return nil
 	}
@@ -377,7 +461,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 		return syserror.EPERM
 	}
 	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
-	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
 		return err
 	}
 	i.mu.Lock()
@@ -455,44 +539,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	return nil
 }
 
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		return i.locks.LockBSD(uid, t, block)
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockBSD(uid fslock.UniqueID) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		i.locks.UnlockBSD(uid)
-		return nil
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		return i.locks.LockPOSIX(uid, t, rng, block)
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		i.locks.UnlockPOSIX(uid, rng)
-		return nil
-	}
-	return syserror.EBADF
-}
-
 // allocatedBlocksForSize returns the number of 512B blocks needed to
 // accommodate the given size in bytes, as appropriate for struct
 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
@@ -531,6 +577,9 @@ func (i *inode) isDir() bool {
 }
 
 func (i *inode) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
@@ -621,14 +670,19 @@ func (i *inode) userXattrSupported() bool {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
 	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.Dentry().Impl().(*dentry).inode
+	return fd.dentry().inode
 }
 
 // Stat implements vfs.FileDescriptionImpl.Stat.
@@ -641,7 +695,15 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	return fd.inode().setStat(ctx, creds, &opts.Stat)
+	d := fd.dentry()
+	if err := d.inode.setStat(ctx, creds, &opts); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -656,12 +718,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+	d := fd.dentry()
+	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+
+	// Report the successful xattr change to inotify as an IN_ATTRIB event.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+	d := fd.dentry()
+	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+
+	// Report the successful xattr removal to inotify as an IN_ATTRIB event.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // NewMemfd creates a new tmpfs regular file and file description that can back
@@ -674,8 +750,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s
 
 	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
 	// S_IRWXUGO.
-	mode := linux.FileMode(0777)
-	inode := fs.newRegularFile(creds, mode)
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
 	rf := inode.impl.(*regularFile)
 	if allowSeals {
 		rf.seals = 0
@@ -688,9 +763,26 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s
 	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
 	// FMODE_READ | FMODE_WRITE.
 	var fd regularFileFD
+	fd.Init(&inode.locks)
 	flags := uint32(linux.O_RDWR)
 	if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
+// filesystem state is in-memory.
+func (*fileDescription) Sync(context.Context) error {
+	return nil
+}