104 files changed, 5422 insertions, 1620 deletions
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 498ca4669..81ec98a77 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -125,9 +125,9 @@ type Context interface {
 	// SetTLS sets the current TLS pointer. Returns false if value is invalid.
 	SetTLS(value uintptr) bool
 
-	// SetRSEQInterruptedIP sets the register that contains the old IP when a
-	// restartable sequence is interrupted.
-	SetRSEQInterruptedIP(value uintptr)
+	// SetOldRSeqInterruptedIP sets the register that contains the old IP
+	// when an "old rseq" restartable sequence is interrupted.
+	SetOldRSeqInterruptedIP(value uintptr)
 
 	// StateData returns a pointer to underlying architecture state.
 	StateData() *State
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 67daa6c24..2aa08b1a9 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -174,8 +174,8 @@ func (c *context64) SetTLS(value uintptr) bool {
 	return true
 }
 
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 	c.Regs.R10 = uint64(value)
 }
 
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index c0a6e884b..a2f966cb6 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -555,6 +555,10 @@ type lockedWriter struct {
 	//
 	// This applies only to Write, not WriteAt.
 	Offset int64
+
+	// Err contains the first error encountered while copying. This is
+	// useful to determine whether Writer or Reader failed during io.Copy.
+	Err error
 }
 
 // Write implements io.Writer.Write.
@@ -590,5 +594,8 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
 			break
 		}
 	}
+	if w.Err == nil {
+		w.Err = err
+	}
 	return written, err
 }
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index b2e8d9c77..9ca695a95 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -53,7 +53,7 @@ go_template_instance(
         "Key": "uint64",
         "Range": "memmap.MappableRange",
         "Value": "uint64",
-        "Functions": "fileRangeSetFunctions",
+        "Functions": "FileRangeSetFunctions",
     },
 )
 
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 0a5466b0a..f52d712e3 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -34,25 +34,25 @@ import (
 //
 // type FileRangeSet <generated by go_generics>
 
-// fileRangeSetFunctions implements segment.Functions for FileRangeSet.
-type fileRangeSetFunctions struct{}
+// FileRangeSetFunctions implements segment.Functions for FileRangeSet.
+type FileRangeSetFunctions struct{}
 
 // MinKey implements segment.Functions.MinKey.
-func (fileRangeSetFunctions) MinKey() uint64 {
+func (FileRangeSetFunctions) MinKey() uint64 {
 	return 0
 }
 
 // MaxKey implements segment.Functions.MaxKey.
-func (fileRangeSetFunctions) MaxKey() uint64 {
+func (FileRangeSetFunctions) MaxKey() uint64 {
 	return math.MaxUint64
 }
 
 // ClearValue implements segment.Functions.ClearValue.
-func (fileRangeSetFunctions) ClearValue(_ *uint64) {
+func (FileRangeSetFunctions) ClearValue(_ *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
+func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
 	if frstart1+mr1.Length() != frstart2 {
 		return 0, false
 	}
@@ -60,7 +60,7 @@ func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _
 }
 
 // Split implements segment.Functions.Split.
-func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
+func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
 	return frstart, frstart + (split - mr.Start)
 }
 
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 0e46c5fb7..9bf4b4527 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -604,6 +604,10 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
 	fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
 	fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
 	fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	// We unconditionally report a single NUMA node. See
+	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
+	fmt.Fprintf(&buf, "Mems_allowed:\t1\n")
+	fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n")
 	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0
 }
 
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 311798811..389c330a0 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -167,6 +167,11 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 		if !srcPipe && !opts.SrcOffset {
 			atomic.StoreInt64(&src.offset, src.offset+n)
 		}
+
+		// Don't report any errors if we have some progress without data loss.
+		if w.Err == nil {
+			err = nil
+		}
 	}
 
 	// Drop locks.
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 934828c12..6b07f6bf2 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -77,7 +77,7 @@ func (mi *masterInodeOperations) Release(ctx context.Context) {
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 880b7bcd3..bc90330bc 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -74,6 +74,7 @@ go_test(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index bfc46dfa6..4fc8296ef 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -7,6 +7,7 @@ go_test(
     size = "small",
     srcs = ["benchmark_test.go"],
     deps = [
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fsimpl/ext",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 177ce2cb9..a56b03711 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -24,6 +24,7 @@ import (
 	"strings"
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
@@ -49,7 +50,9 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
@@ -121,7 +124,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -150,9 +153,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 			mountPointName := "/1/"
 			pop := vfs.PathOperation{
-				Root:     *root,
-				Start:    *root,
-				Pathname: mountPointName,
+				Root:  *root,
+				Start: *root,
+				Path:  fspath.Parse(mountPointName),
 			}
 
 			// Save the mount point for later use.
@@ -181,7 +184,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 567523d32..4110649ab 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -29,8 +29,12 @@ package disklayout
 //       byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()).
 
 const (
-	// ExtentStructsSize is the size of all the three extent on-disk structs.
-	ExtentStructsSize = 12
+	// ExtentHeaderSize is the size of the header of an extent tree node.
+	ExtentHeaderSize = 12
+
+	// ExtentEntrySize is the size of an entry in an extent tree node.
+	// This size is the same for both leaf and internal nodes.
+	ExtentEntrySize = 12
 
 	// ExtentMagic is the magic number which must be present in the header.
 	ExtentMagic = 0xf30a
@@ -57,7 +61,7 @@ type ExtentNode struct {
 	Entries []ExtentEntryPair
 }
 
-// ExtentEntry reprsents an extent tree node entry. The entry can either be
+// ExtentEntry represents an extent tree node entry. The entry can either be
 // an ExtentIdx or Extent itself. This exists to simplify navigation logic.
 type ExtentEntry interface {
 	// FileBlock returns the first file block number covered by this entry.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index b0fad9b71..8762b90db 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,7 @@ import (
 // TestExtentSize tests that the extent structs are of the correct
 // size.
 func TestExtentSize(t *testing.T) {
-	assertSize(t, ExtentHeader{}, ExtentStructsSize)
-	assertSize(t, ExtentIdx{}, ExtentStructsSize)
-	assertSize(t, Extent{}, ExtentStructsSize)
+	assertSize(t, ExtentHeader{}, ExtentHeaderSize)
+	assertSize(t, ExtentIdx{}, ExtentEntrySize)
+	assertSize(t, Extent{}, ExtentEntrySize)
 }
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index e9f756732..6c14a1e2d 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -25,6 +25,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
@@ -65,7 +66,9 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
@@ -140,7 +143,7 @@ func TestSeek(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -359,7 +362,7 @@ func TestStatAt(t *testing.T) {
 
 			got, err := vfsfs.StatAt(ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.StatOptions{},
 			)
 			if err != nil {
@@ -429,7 +432,7 @@ func TestRead(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -565,7 +568,7 @@ func TestIterDirents(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 3d3ebaca6..11dcc0346 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -57,7 +57,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) {
 func (f *extentFile) buildExtTree() error {
 	rootNodeData := f.regFile.inode.diskInode.Data()
 
-	binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+	binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
 
 	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
 	if f.root.Header.NumEntries > 4 {
@@ -67,7 +67,7 @@ func (f *extentFile) buildExtTree() error {
 	}
 
 	f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
-	for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+	for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
 		var curEntry disklayout.ExtentEntry
 		if f.root.Header.Height == 0 {
 			// Leaf node.
@@ -76,7 +76,7 @@ func (f *extentFile) buildExtTree() error {
 			// Internal node.
 			curEntry = &disklayout.ExtentIdx{}
 		}
-		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
 		f.root.Entries[i].Entry = curEntry
 	}
 
@@ -105,7 +105,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla
 	}
 
 	entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
-	for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+	for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
 		var curEntry disklayout.ExtentEntry
 		if header.Height == 0 {
 			// Leaf node.
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 5eca2b83f..841274daf 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,13 +26,6 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	// flags is the same as vfs.OpenOptions.Flags which are passed to
-	// vfs.FilesystemImpl.OpenAt.
-	// TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
-	// fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
-	// Only close(2), fstat(2), fstatfs(2) should work.
-	flags uint32
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index e7aa3b41b..616fc002a 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -275,6 +275,16 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	vfsd, inode, err := fs.walk(rp, true)
+	if err != nil {
+		return nil, err
+	}
+	inode.incRef()
+	return vfsd, nil
+}
+
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	vfsd, inode, err := fs.walk(rp, false)
@@ -378,7 +388,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
 	if rp.Done() {
 		return syserror.ENOENT
 	}
@@ -443,6 +453,42 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 24249525c..8608805bf 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,10 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.flags = flags
-		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -169,10 +166,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.flags = flags
-		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		if flags&linux.O_PATH == 0 {
@@ -180,10 +174,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		fd.flags = flags
-		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 52596c090..39c03ee9d 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -25,6 +25,7 @@ go_library(
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
+        "symlink.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
     visibility = ["//pkg/sentry:internal"],
@@ -49,6 +50,7 @@ go_test(
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 30c06baf0..606ca692d 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -15,6 +15,8 @@
 package kernfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -26,7 +28,10 @@ import (
 // DynamicBytesFile implements kernfs.Inode and represents a read-only
 // file whose contents are backed by a vfs.DynamicBytesSource.
 //
-// Must be initialized with Init before first use.
+// Must be instantiated with NewDynamicBytesFile or initialized with Init
+// before first use.
+//
+// +stateify savable
 type DynamicBytesFile struct {
 	InodeAttrs
 	InodeNoopRefCount
@@ -36,9 +41,14 @@ type DynamicBytesFile struct {
 	data vfs.DynamicBytesSource
 }
 
-// Init intializes a dynamic bytes file.
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) {
-	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444)
+var _ Inode = (*DynamicBytesFile)(nil)
+
+// Init initializes a dynamic bytes file.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm)
 	f.data = data
 }
 
@@ -59,23 +69,21 @@ func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
 // DynamicBytesFile.
 //
 // Must be initialized with Init before first use.
+//
+// +stateify savable
 type DynamicBytesFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DynamicBytesFileDescriptionImpl
 
 	vfsfd vfs.FileDescription
 	inode Inode
-	flags uint32
 }
 
 // Init initializes a DynamicBytesFD.
 func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
-	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	fd.flags = flags
 	fd.inode = d.Impl().(*Dentry).inode
 	fd.SetDataSource(data)
-	fd.vfsfd.Init(fd, m, d)
+	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
@@ -117,15 +125,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
 	// DynamicBytesFiles are immutable.
 	return syserror.EPERM
 }
-
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index d6c18937a..bcf069b5f 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -39,17 +39,13 @@ type GenericDirectoryFD struct {
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
-	flags    uint32
 	off      int64
 }
 
 // Init initializes a GenericDirectoryFD.
 func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
-	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.children = children
-	fd.flags = flags
-	fd.vfsfd.Init(fd, m, d)
+	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
 
 // VFSFileDescription returns a pointer to the vfs.FileDescription representing
@@ -156,7 +152,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 		fd.off++
 	}
 
-	return nil
+	var err error
+	relOffset := fd.off - int64(len(fd.children.set)) - 2
+	fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+	return err
 }
 
 // Seek implements vfs.FileDecriptionImpl.Seek.
@@ -180,18 +179,6 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 	return offset, nil
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index db486b6c1..79759e0fc 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -44,39 +44,37 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 		return nil, err
 	}
 afterSymlink:
+	name := rp.Component()
+	// Revalidation must be skipped if name is "." or ".."; d or its parent
+	// respectively can't be expected to transition from invalidated back to
+	// valid, so detecting invalidation and retrying would loop forever. This
+	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
+	// calls d_revalidate(), but walk_component() => handle_dots() does not.
+	if name == "." {
+		rp.Advance()
+		return vfsd, nil
+	}
+	if name == ".." {
+		nextVFSD, err := rp.ResolveParent(vfsd)
+		if err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return nextVFSD, nil
+	}
 	d.dirMu.Lock()
-	nextVFSD, err := rp.ResolveComponent(vfsd)
-	d.dirMu.Unlock()
+	nextVFSD, err := rp.ResolveChild(vfsd, name)
 	if err != nil {
+		d.dirMu.Unlock()
 		return nil, err
 	}
-	if nextVFSD != nil {
-		// Cached dentry exists, revalidate.
-		next := nextVFSD.Impl().(*Dentry)
-		if !next.inode.Valid(ctx) {
-			d.dirMu.Lock()
-			rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD)
-			d.dirMu.Unlock()
-			fs.deferDecRef(nextVFSD) // Reference from Lookup.
-			nextVFSD = nil
-		}
-	}
-	if nextVFSD == nil {
-		// Dentry isn't cached; it either doesn't exist or failed
-		// revalidation. Attempt to resolve it via Lookup.
-		name := rp.Component()
-		var err error
-		nextVFSD, err = d.inode.Lookup(ctx, name)
-		// Reference on nextVFSD dropped by a corresponding Valid.
-		if err != nil {
-			return nil, err
-		}
-		d.InsertChild(name, nextVFSD)
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD)
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
 	}
-	next := nextVFSD.Impl().(*Dentry)
-
 	// Resolve any symlink at current path component.
-	if rp.ShouldFollowSymlink() && d.isSymlink() {
+	if rp.ShouldFollowSymlink() && next.isSymlink() {
 		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
 		target, err := next.inode.Readlink(ctx)
 		if err != nil {
@@ -89,7 +87,44 @@ afterSymlink:
 
 	}
 	rp.Advance()
-	return nextVFSD, nil
+	return &next.vfsd, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) {
+	if childVFSD != nil {
+		// Cached dentry exists, revalidate.
+		child := childVFSD.Impl().(*Dentry)
+		if !child.inode.Valid(ctx) {
+			vfsObj.ForceDeleteDentry(childVFSD)
+			fs.deferDecRef(childVFSD) // Reference from Lookup.
+			childVFSD = nil
+		}
+	}
+	if childVFSD == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		//
+		// FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry,
+		// not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries
+		// in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl
+		// casts accordingly.
+		var err error
+		childVFSD, err = parent.inode.Lookup(ctx, name)
+		if err != nil {
+			return nil, err
+		}
+		// Reference on childVFSD dropped by a corresponding Valid.
+		parent.insertChildLocked(name, childVFSD)
+	}
+	return childVFSD.Impl().(*Dentry), nil
 }
 
 // walkExistingLocked resolves rp to an existing file.
@@ -242,6 +277,19 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
+	vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
 	if rp.Done() {
@@ -459,40 +507,42 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
-	exchange := opts.Flags&linux.RENAME_EXCHANGE != 0
-	whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0
-	if exchange && (noReplace || whiteout) {
-		// Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE.
-		return syserror.EINVAL
-	}
-	if exchange || whiteout {
-		// Exchange and Whiteout flags are not supported on kernfs.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Only RENAME_NOREPLACE is supported.
+	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
 		return syserror.EINVAL
 	}
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
 	defer fs.mu.Lock()
 
+	// Resolve the destination directory first to verify that it's on this
+	// Mount.
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
 	mnt := rp.Mount()
-	if mnt != vd.Mount() {
+	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
 	}
-
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
 	}
 	defer mnt.EndWrite()
 
-	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	srcDirVFSD := oldParentVD.Dentry()
+	srcDir := srcDirVFSD.Impl().(*Dentry)
+	srcDir.dirMu.Lock()
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName))
+	srcDir.dirMu.Unlock()
 	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
-
-	srcVFSD := vd.Dentry()
-	srcDirVFSD := srcVFSD.Parent()
+	srcVFSD := &src.vfsd
 
 	// Can we remove the src dentry?
 	if err := checkDeleteLocked(rp, srcVFSD); err != nil {
@@ -683,6 +733,58 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return nil, err
+	}
+	// kernfs currently does not support extended attributes.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return "", err
+	}
+	// kernfs currently does not support extended attributes.
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 7b45b702a..752e0f659 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -139,6 +139,11 @@ func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 	panic("Lookup called on non-directory inode")
 }
 
+// IterDirents implements Inode.IterDirents.
+func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	panic("IterDirents called on non-directory inode")
+}
+
 // Valid implements Inode.Valid.
 func (*InodeNotDirectory) Valid(context.Context) bool {
 	return true
@@ -156,6 +161,11 @@ func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dent
 	return nil, syserror.ENOENT
 }
 
+// IterDirents implements Inode.IterDirents.
+func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
 // Valid implements Inode.Valid.
 func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 	return true
@@ -490,3 +500,13 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
 	}
 	return nil
 }
+
+// InodeSymlink partially implements Inode interface for symlinks.
+type InodeSymlink struct {
+	InodeNotDirectory
+}
+
+// Open implements Inode.Open.
+func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	return nil, syserror.ELOOP
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index bb01c3d01..d69b299ae 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -239,14 +239,22 @@ func (d *Dentry) destroy() {
 //
 // Precondition: d must represent a directory inode.
 func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+	d.dirMu.Lock()
+	d.insertChildLocked(name, child)
+	d.dirMu.Unlock()
+}
+
+// insertChildLocked is equivalent to InsertChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) {
 	if !d.isDir() {
 		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
 	}
 	vfsDentry := d.VFSDentry()
 	vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
-	d.dirMu.Lock()
 	vfsDentry.InsertChild(child, name)
-	d.dirMu.Unlock()
 }
 
 // The Inode interface maps filesystem-level operations that operate on paths to
@@ -396,6 +404,15 @@ type inodeDynamicLookup interface {
 	// Valid should return true if this inode is still valid, or needs to
 	// be resolved again by a call to Lookup.
 	Valid(ctx context.Context) bool
+
+	// IterDirents is used to iterate over dynamically created entries. It invokes
+	// cb on each entry in the directory represented by the FileDescription.
+	// 'offset' is the offset for the entire IterDirents call, which may include
+	// results from the caller. 'relOffset' is the offset inside the entries
+	// returned by this IterDirents invocation. In other words,
+	// 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
+	// while 'relOffset' is the place where iteration should start from.
+	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
 }
 
 type inodeSymlink interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index f78bb7b04..4b6b95f5f 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
@@ -58,7 +59,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 	v := vfs.New()
-	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn})
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
@@ -82,9 +85,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 // Precondition: path should be relative path.
 func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
 	return vfs.PathOperation{
-		Root:     s.root,
-		Start:    s.root,
-		Pathname: path,
+		Root:  s.root,
+		Start: s.root,
+		Path:  fspath.Parse(path),
 	}
 }
 
@@ -132,7 +135,7 @@ type file struct {
 func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
 	f := &file{}
 	f.content = content
-	f.DynamicBytesFile.Init(creds, fs.NextIno(), f)
+	f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(f)
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
new file mode 100644
index 000000000..068063f4e
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+type staticSymlink struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeSymlink
+
+	target string
+}
+
+var _ Inode = (*staticSymlink)(nil)
+
+// NewStaticSymlink creates a new symlink file pointing to 'target'.
+func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry {
+	inode := &staticSymlink{target: target}
+	inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+	d := &Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *staticSymlink) Readlink(_ context.Context) (string, error) {
+	return s.target, nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
deleted file mode 100644
index 1f2a5122a..000000000
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ /dev/null
@@ -1,592 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"fmt"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// stepLocked resolves rp.Component() in parent directory vfsd.
-//
-// stepLocked is loosely analogous to fs/namei.c:walk_component().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
-	}
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, nil, err
-	}
-afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(vfsd)
-	if err != nil {
-		return nil, nil, err
-	}
-	if nextVFSD == nil {
-		// Since the Dentry tree is the sole source of truth for memfs, if it's
-		// not in the Dentry tree, it doesn't exist.
-		return nil, nil, syserror.ENOENT
-	}
-	nextInode := nextVFSD.Impl().(*dentry).inode
-	if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, nil, err
-		}
-		goto afterSymlink // don't check the current directory again
-	}
-	rp.Advance()
-	return nextVFSD, nextInode, nil
-}
-
-// walkExistingLocked resolves rp to an existing file.
-//
-// walkExistingLocked is loosely analogous to Linux's
-// fs/namei.c:path_lookupat().
-//
-// Preconditions: filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Done() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, nil, err
-		}
-	}
-	if rp.MustBeDir() && !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
-	}
-	return vfsd, inode, nil
-}
-
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory. It does not check that the returned directory is
-// searchable by the provider of rp.
-//
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, nil, err
-		}
-	}
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
-	}
-	return vfsd, inode, nil
-}
-
-// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
-//
-// Preconditions: filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
-	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
-		return "", err
-	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
-		return "", syserror.EEXIST
-	}
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
-	if err != nil {
-		return "", err
-	}
-	if childVFSD != nil {
-		return "", syserror.EEXIST
-	}
-	if parentVFSD.IsDisowned() {
-		return "", syserror.ENOENT
-	}
-	return pc, nil
-}
-
-// checkDeleteLocked checks that the file represented by vfsd may be deleted.
-func checkDeleteLocked(vfsd *vfs.Dentry) error {
-	parentVFSD := vfsd.Parent()
-	if parentVFSD == nil {
-		return syserror.EBUSY
-	}
-	if parentVFSD.IsDisowned() {
-		return syserror.ENOENT
-	}
-	return nil
-}
-
-// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	vfsd, inode, err := walkExistingLocked(rp)
-	if err != nil {
-		return nil, err
-	}
-	if opts.CheckSearchable {
-		if !inode.isDir() {
-			return nil, syserror.ENOTDIR
-		}
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-			return nil, err
-		}
-	}
-	inode.incRef()
-	return vfsd, nil
-}
-
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if rp.Mount() != vd.Mount() {
-		return syserror.EXDEV
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	d := vd.Dentry().Impl().(*dentry)
-	if d.inode.isDir() {
-		return syserror.EPERM
-	}
-	d.inode.incLinksLocked()
-	child := fs.newDentry(d.inode)
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
-}
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	parentInode.incLinksLocked() // from child's ".."
-	return nil
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-
-	switch opts.Mode.FileType() {
-	case 0:
-		// "Zero file type is equivalent to type S_IFREG." - mknod(2)
-		fallthrough
-	case linux.ModeRegular:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeNamedPipe:
-		child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-		parentVFSD.InsertChild(&child.vfsd, pc)
-		parentInode.impl.(*directory).childList.PushBack(child)
-		return nil
-
-	case linux.ModeSocket:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeCharacterDevice:
-		fallthrough
-	case linux.ModeBlockDevice:
-		// TODO(b/72101894): We don't support creating block or character
-		// devices at the moment.
-		//
-		// When we start supporting block and character devices, we'll
-		// need to check for CAP_MKNOD here.
-		return syserror.EPERM
-
-	default:
-		// "EINVAL - mode requested creation of something other than a
-		// regular file, device special file, FIFO or socket." - mknod(2)
-		return syserror.EINVAL
-	}
-}
-
-// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	// Filter out flags that are not supported by memfs. O_DIRECTORY and
-	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
-	// appropriate bits in rp), but are returned by
-	// FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by
-	// pipes.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
-
-	if opts.Flags&linux.O_CREAT == 0 {
-		fs.mu.RLock()
-		defer fs.mu.RUnlock()
-		vfsd, inode, err := walkExistingLocked(rp)
-		if err != nil {
-			return nil, err
-		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
-	}
-
-	mustCreate := opts.Flags&linux.O_EXCL != 0
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	if rp.Done() {
-		if rp.MustBeDir() {
-			return nil, syserror.EISDIR
-		}
-		if mustCreate {
-			return nil, syserror.EEXIST
-		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
-	}
-afterTrailingSymlink:
-	// Walk to the parent directory of the last path component.
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, err
-		}
-	}
-	if !inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	// Check for search permission in the parent directory.
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, err
-	}
-	// Reject attempts to open directories with O_CREAT.
-	if rp.MustBeDir() {
-		return nil, syserror.EISDIR
-	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
-		return nil, syserror.EISDIR
-	}
-	// Determine whether or not we need to create a file.
-	childVFSD, err := rp.ResolveChild(vfsd, pc)
-	if err != nil {
-		return nil, err
-	}
-	if childVFSD == nil {
-		// Already checked for searchability above; now check for writability.
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
-			return nil, err
-		}
-		if err := rp.Mount().CheckBeginWrite(); err != nil {
-			return nil, err
-		}
-		defer rp.Mount().EndWrite()
-		// Create and open the child.
-		childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
-		child := fs.newDentry(childInode)
-		vfsd.InsertChild(&child.vfsd, pc)
-		inode.impl.(*directory).childList.PushBack(child)
-		return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true)
-	}
-	// Open existing file or follow symlink.
-	if mustCreate {
-		return nil, syserror.EEXIST
-	}
-	childInode := childVFSD.Impl().(*dentry).inode
-	if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, err
-		}
-		// rp.Final() may no longer be true since we now need to resolve the
-		// symlink target.
-		goto afterTrailingSymlink
-	}
-	return childInode.open(ctx, rp, childVFSD, opts.Flags, false)
-}
-
-func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
-	if !afterCreate {
-		if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
-			return nil, err
-		}
-	}
-	mnt := rp.Mount()
-	switch impl := i.impl.(type) {
-	case *regularFile:
-		var fd regularFileFD
-		fd.flags = flags
-		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-		if fd.writable {
-			if err := mnt.CheckBeginWrite(); err != nil {
-				return nil, err
-			}
-			// mnt.EndWrite() is called by regularFileFD.Release().
-		}
-		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
-		if flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data = impl.data[:0]
-			atomic.StoreInt64(&impl.dataLen, 0)
-			impl.mu.Unlock()
-		}
-		return &fd.vfsfd, nil
-	case *directory:
-		// Can't open directories writably.
-		if ats&vfs.MayWrite != 0 {
-			return nil, syserror.EISDIR
-		}
-		var fd directoryFD
-		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
-		fd.flags = flags
-		return &fd.vfsfd, nil
-	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
-		return nil, syserror.ELOOP
-	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, vfsd, flags)
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
-	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return "", err
-	}
-	symlink, ok := inode.impl.(*symlink)
-	if !ok {
-		return "", syserror.EINVAL
-	}
-	return symlink.target, nil
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	if rp.Done() {
-		return syserror.ENOENT
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	_, err = checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	// TODO: actually implement RenameAt
-	return syserror.EPERM
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
-	}
-	if !inode.isDir() {
-		return syserror.ENOTDIR
-	}
-	if vfsd.HasChildren() {
-		return syserror.ENOTEMPTY
-	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
-		return err
-	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decRef()
-	return nil
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
-	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return err
-	}
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
-
-// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
-	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	var stat linux.Statx
-	inode.statTo(&stat)
-	return stat, nil
-}
-
-// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
-	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return linux.Statfs{}, err
-	}
-	// TODO: actually implement statfs
-	return linux.Statfs{}, syserror.ENOSYS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
-	}
-	if inode.isDir() {
-		return syserror.EISDIR
-	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
-		return err
-	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decLinksLocked()
-	return nil
-}
-
-// PrependPath implements vfs.FilesystemImpl.PrependPath.
-func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
-}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
deleted file mode 100644
index b7f4853b3..000000000
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"io"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-type regularFile struct {
-	inode inode
-
-	mu   sync.RWMutex
-	data []byte
-	// dataLen is len(data), but accessed using atomic memory operations to
-	// avoid locking in inode.stat().
-	dataLen int64
-}
-
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &regularFile{}
-	file.inode.init(file, fs, creds, mode)
-	file.inode.nlink = 1 // from parent directory
-	return &file.inode
-}
-
-type regularFileFD struct {
-	fileDescription
-
-	// These are immutable.
-	readable bool
-	writable bool
-
-	// off is the file offset. off is accessed using atomic memory operations.
-	// offMu serializes operations that may mutate off.
-	off   int64
-	offMu sync.Mutex
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.RLock()
-	if offset >= int64(len(f.data)) {
-		f.mu.RUnlock()
-		return 0, io.EOF
-	}
-	n, err := dst.CopyOut(ctx, f.data[offset:])
-	f.mu.RUnlock()
-	return int64(n), err
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PRead(ctx, dst, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	srclen := src.NumBytes()
-	if srclen == 0 {
-		return 0, nil
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.Lock()
-	end := offset + srclen
-	if end < offset {
-		// Overflow.
-		f.mu.Unlock()
-		return 0, syserror.EFBIG
-	}
-	if end > f.dataLen {
-		f.data = append(f.data, make([]byte, end-f.dataLen)...)
-		atomic.StoreInt64(&f.dataLen, end)
-	}
-	n, err := src.CopyIn(ctx, f.data[offset:end])
-	f.mu.Unlock()
-	return int64(n), err
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fd.offMu.Lock()
-	defer fd.offMu.Unlock()
-	switch whence {
-	case linux.SEEK_SET:
-		// use offset as specified
-	case linux.SEEK_CUR:
-		offset += fd.off
-	case linux.SEEK_END:
-		offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
-	default:
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	fd.off = offset
-	return offset, nil
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index ade6ac946..1f44b3217 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -6,15 +6,17 @@ package(licenses = ["notice"])
 go_library(
     name = "proc",
     srcs = [
-        "filesystems.go",
+        "filesystem.go",
         "loadavg.go",
         "meminfo.go",
         "mounts.go",
         "net.go",
-        "proc.go",
         "stat.go",
         "sys.go",
         "task.go",
+        "task_files.go",
+        "tasks.go",
+        "tasks_files.go",
         "version.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
@@ -24,8 +26,10 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
+        "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
         "//pkg/sentry/socket",
@@ -34,17 +38,40 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/syserror",
     ],
 )
 
 go_test(
     name = "proc_test",
     size = "small",
-    srcs = ["net_test.go"],
+    srcs = [
+        "boot_test.go",
+        "net_test.go",
+        "tasks_test.go",
+    ],
     embed = [":proc"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/cpuid",
+        "//pkg/fspath",
+        "//pkg/memutil",
+        "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
         "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/sched",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+        "//pkg/sentry/time",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go
new file mode 100644
index 000000000..84a93ee56
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/boot_test.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+
+	// Platforms are plugable.
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+var (
+	platformFlag = flag.String("platform", "ptrace", "specify which platform to use")
+)
+
+// boot initializes a new bare bones kernel for test.
+func boot() (*kernel.Kernel, error) {
+	platformCtr, err := platform.Lookup(*platformFlag)
+	if err != nil {
+		return nil, fmt.Errorf("platform not found: %v", err)
+	}
+	deviceFile, err := platformCtr.OpenDevice()
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+	plat, err := platformCtr.New(deviceFile)
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+
+	k := &kernel.Kernel{
+		Platform: plat,
+	}
+
+	mf, err := createMemoryFile()
+	if err != nil {
+		return nil, err
+	}
+	k.SetMemoryFile(mf)
+
+	// Pass k as the platform since it is savable, unlike the actual platform.
+	vdso, err := loader.PrepareVDSO(nil, k)
+	if err != nil {
+		return nil, fmt.Errorf("creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		ApplicationCores:            uint(runtime.GOMAXPROCS(-1)),
+		FeatureSet:                  cpuid.HostFeatureSet(),
+		Timekeeper:                  tk,
+		RootUserNamespace:           creds.UserNamespace,
+		Vdso:                        vdso,
+		RootUTSNamespace:            kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
+		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
+		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
+	}); err != nil {
+		return nil, fmt.Errorf("initializing kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+
+	// Create mount namespace without root as it's the minimum required to create
+	// the global thread group.
+	mntns, err := fs.NewMountNamespace(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	ls, err := limits.NewLinuxLimitSet()
+	if err != nil {
+		return nil, err
+	}
+	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	k.TestOnly_SetGlobalInit(tg)
+
+	return k, nil
+}
+
+// createTask creates a new bare bones task for tests.
+func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+	k := kernel.KernelFromContext(ctx)
+	config := &kernel.TaskConfig{
+		Kernel:                  k,
+		ThreadGroup:             tc,
+		TaskContext:             &kernel.TaskContext{Name: name},
+		Credentials:             auth.CredentialsFromContext(ctx),
+		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
+		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
+		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
+		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+	}
+	return k.TaskSet().NewTask(config)
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+	const memfileName = "test-memory"
+	memfd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating memfd: %v", err)
+	}
+	memfile := os.NewFile(uintptr(memfd), memfileName)
+	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+	if err != nil {
+		memfile.Close()
+		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+	}
+	return mf, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
new file mode 100644
index 000000000..d09182c77
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -0,0 +1,69 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// procFSType is the factory class for procfs.
+//
+// +stateify savable
+type procFSType struct{}
+
+var _ vfs.FilesystemType = (*procFSType)(nil)
+
+// GetFilesystem implements vfs.FilesystemType.
+func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		return nil, nil, fmt.Errorf("procfs requires a kernel")
+	}
+	pidns := kernel.PIDNamespaceFromContext(ctx)
+	if pidns == nil {
+		return nil, nil, fmt.Errorf("procfs requires a PID namespace")
+	}
+
+	procfs := &kernfs.Filesystem{}
+	procfs.VFSFilesystem().Init(vfsObj, procfs)
+
+	_, dentry := newTasksInode(procfs, k, pidns)
+	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
+}
+
+// dynamicInode is an overfitted interface for common Inodes with
+// dynamicByteSource types used in procfs.
+type dynamicInode interface {
+	kernfs.Inode
+	vfs.DynamicBytesSource
+
+	Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+}
+
+func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	inode.Init(creds, ino, inode, perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
index 9135afef1..5351d86e8 100644
--- a/pkg/sentry/fsimpl/proc/loadavg.go
+++ b/pkg/sentry/fsimpl/proc/loadavg.go
@@ -19,15 +19,17 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 )
 
 // loadavgData backs /proc/loadavg.
 //
 // +stateify savable
-type loadavgData struct{}
+type loadavgData struct {
+	kernfs.DynamicBytesFile
+}
 
-var _ vfs.DynamicBytesSource = (*loadavgData)(nil)
+var _ dynamicInode = (*loadavgData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
index 9a827cd66..cbdd4f3fc 100644
--- a/pkg/sentry/fsimpl/proc/meminfo.go
+++ b/pkg/sentry/fsimpl/proc/meminfo.go
@@ -19,21 +19,23 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
 //
 // +stateify savable
 type meminfoData struct {
+	kernfs.DynamicBytesFile
+
 	// k is the owning Kernel.
 	k *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*meminfoData)(nil)
+var _ dynamicInode = (*meminfoData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
index 720db3828..50894a534 100644
--- a/pkg/sentry/fsimpl/proc/stat.go
+++ b/pkg/sentry/fsimpl/proc/stat.go
@@ -20,8 +20,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 // cpuStats contains the breakdown of CPU time for /proc/stat.
@@ -66,11 +66,13 @@ func (c cpuStats) String() string {
 //
 // +stateify savable
 type statData struct {
+	kernfs.DynamicBytesFile
+
 	// k is the owning Kernel.
 	k *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*statData)(nil)
+var _ dynamicInode = (*statData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index c46e05c3a..11a64c777 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -15,247 +15,176 @@
 package proc
 
 import (
-	"bytes"
-	"fmt"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// mapsCommon is embedded by mapsData and smapsData.
-type mapsCommon struct {
-	t *kernel.Task
-}
-
-// mm gets the kernel task's MemoryManager. No additional reference is taken on
-// mm here. This is safe because MemoryManager.destroy is required to leave the
-// MemoryManager in a state where it's still usable as a DynamicBytesSource.
-func (md *mapsCommon) mm() *mm.MemoryManager {
-	var tmm *mm.MemoryManager
-	md.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			tmm = mm
-		}
-	})
-	return tmm
-}
-
-// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+// taskInode represents the inode for /proc/PID/ directory.
 //
 // +stateify savable
-type mapsData struct {
-	mapsCommon
+type taskInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+
+	task *kernel.Task
 }
 
-var _ vfs.DynamicBytesSource = (*mapsData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	if mm := md.mm(); mm != nil {
-		mm.ReadMapsDataInto(ctx, buf)
+var _ kernfs.Inode = (*taskInode)(nil)
+
+func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry {
+	contents := map[string]*kernfs.Dentry{
+		//"auxv":      newAuxvec(t, msrc),
+		//"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
+		//"comm":      newComm(t, msrc),
+		//"environ":   newExecArgInode(t, msrc, environExecArg),
+		//"exe":       newExe(t, msrc),
+		//"fd":        newFdDir(t, msrc),
+		//"fdinfo":    newFdInfoDir(t, msrc),
+		//"gid_map":   newGIDMap(t, msrc),
+		"io":   newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)),
+		"maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}),
+		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		//"ns":        newNamespaceDir(t, msrc),
+		"smaps":  newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}),
+		"stat":   newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":  newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}),
+		"status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}),
+		//"uid_map":   newUIDMap(t, msrc),
 	}
-	return nil
-}
-
-// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
-//
-// +stateify savable
-type smapsData struct {
-	mapsCommon
-}
-
-var _ vfs.DynamicBytesSource = (*smapsData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	if mm := sd.mm(); mm != nil {
-		mm.ReadSmapsDataInto(ctx, buf)
+	if isThreadGroup {
+		//contents["task"] = p.newSubtasks(t, msrc)
 	}
-	return nil
-}
-
-// +stateify savable
-type taskStatData struct {
-	t *kernel.Task
+	//if len(p.cgroupControllers) > 0 {
+	//	contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
+	//}
 
-	// If tgstats is true, accumulate fault stats (not implemented) and CPU
-	// time across all tasks in t's thread group.
-	tgstats bool
+	taskInode := &taskInode{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
 
-	// pidns is the PID namespace associated with the proc filesystem that
-	// includes the file using this statData.
-	pidns *kernel.PIDNamespace
-}
-
-var _ vfs.DynamicBytesSource = (*taskStatData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
-	fmt.Fprintf(buf, "(%s) ", s.t.Name())
-	fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
-	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
-		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
-	}
-	fmt.Fprintf(buf, "%d ", ppid)
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
-	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
-	fmt.Fprintf(buf, "0 " /* flags */)
-	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
-	var cputime usage.CPUStats
-	if s.tgstats {
-		cputime = s.t.ThreadGroup().CPUStats()
-	} else {
-		cputime = s.t.CPUStats()
-	}
-	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
-	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+	inode := &taskOwnedInode{Inode: taskInode, owner: task}
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
 
-	// itrealvalue. Since kernel 2.6.17, this field is no longer
-	// maintained, and is hard coded as 0.
-	fmt.Fprintf(buf, "0 ")
+	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := taskInode.OrderedChildren.Populate(dentry, contents)
+	taskInode.IncLinks(links)
 
-	// Start time is relative to boot time, expressed in clock ticks.
-	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+	return dentry
+}
 
-	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-		}
-	})
-	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// as the task is still running. When it's dead, another tasks with the same
+// PID could replace it.
+func (i *taskInode) Valid(ctx context.Context) bool {
+	return i.task.ExitState() != kernel.TaskExitDead
+}
 
-	// rsslim.
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+// Open implements kernfs.Inode.
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
 
-	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
-	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
-	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
-	terminationSignal := linux.Signal(0)
-	if s.t == s.t.ThreadGroup().Leader() {
-		terminationSignal = s.t.ThreadGroup().TerminationSignal()
+// SetStat implements kernfs.Inode.
+func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	stat := opts.Stat
+	if stat.Mask&linux.STATX_MODE != 0 {
+		return syserror.EPERM
 	}
-	fmt.Fprintf(buf, "%d ", terminationSignal)
-	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
-	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
-	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
-	fmt.Fprintf(buf, "0\n" /* exit_code */)
-
 	return nil
 }
 
-// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
-//
-// +stateify savable
-type statmData struct {
-	t *kernel.Task
+// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
+// effective user and group.
+type taskOwnedInode struct {
+	kernfs.Inode
+
+	// owner is the task that owns this inode.
+	owner *kernel.Task
 }
 
-var _ vfs.DynamicBytesSource = (*statmData)(nil)
+var _ kernfs.Inode = (*taskOwnedInode)(nil)
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-		}
-	})
+func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), ino, inode, perm)
 
-	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
-	return nil
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
 }
 
-// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
-//
-// +stateify savable
-type statusData struct {
-	t     *kernel.Task
-	pidns *kernel.PIDNamespace
+// Stat implements kernfs.Inode.
+func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
+	stat := i.Inode.Stat(fs)
+	uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+	stat.UID = uint32(uid)
+	stat.GID = uint32(gid)
+	return stat
 }
 
-var _ vfs.DynamicBytesSource = (*statusData)(nil)
+// CheckPermissions implements kernfs.Inode.
+func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := i.Mode()
+	uid, gid := i.getOwner(mode)
+	return vfs.GenericCheckPermissions(
+		creds,
+		ats,
+		mode.FileType() == linux.ModeDirectory,
+		uint16(mode),
+		uid,
+		gid,
+	)
+}
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
-	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
-	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
-	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
-	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
-		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
+	// By default, set the task owner as the file owner.
+	creds := i.owner.Credentials()
+	uid := creds.EffectiveKUID
+	gid := creds.EffectiveKGID
+
+	// Linux doesn't apply dumpability adjustments to world readable/executable
+	// directories so that applications can stat /proc/PID to determine the
+	// effective UID of a process. See fs/proc/base.c:task_dump_owner.
+	if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
+		return uid, gid
 	}
-	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
-	tpid := kernel.ThreadID(0)
-	if tracer := s.t.Tracer(); tracer != nil {
-		tpid = s.pidns.IDOfTask(tracer)
+
+	// If the task is not dumpable, then root (in the namespace preferred)
+	// owns the file.
+	m := getMM(i.owner)
+	if m == nil {
+		return auth.RootKUID, auth.RootKGID
 	}
-	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
-	var fds int
-	var vss, rss, data uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if fdTable := t.FDTable(); fdTable != nil {
-			fds = fdTable.Size()
+	if m.Dumpability() != mm.UserDumpable {
+		uid = auth.RootKUID
+		if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
+			uid = kuid
 		}
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-			data = mm.VirtualDataSize()
+		gid = auth.RootKGID
+		if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
+			gid = kgid
 		}
-	})
-	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
-	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
-	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
-	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
-	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
-	creds := s.t.Credentials()
-	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
-	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
-	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
-	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
-	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
-	return nil
-}
-
-// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
-type ioUsage interface {
-	// IOUsage returns the io usage data.
-	IOUsage() *usage.IO
-}
-
-// +stateify savable
-type ioData struct {
-	ioUsage
+	}
+	return uid, gid
 }
 
-var _ vfs.DynamicBytesSource = (*ioData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	io := usage.IO{}
-	io.Accumulate(i.IOUsage())
-
-	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
-	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
-	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
-	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
-	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
-	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
-	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
-	return nil
+func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
+	if isThreadGroup {
+		return &ioData{ioUsage: t.ThreadGroup()}
+	}
+	return &ioData{ioUsage: t}
 }
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
new file mode 100644
index 000000000..93f0e1aa8
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -0,0 +1,272 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// mm gets the kernel task's MemoryManager. No additional reference is taken on
+// mm here. This is safe because MemoryManager.destroy is required to leave the
+// MemoryManager in a state where it's still usable as a DynamicBytesSource.
+func getMM(task *kernel.Task) *mm.MemoryManager {
+	var tmm *mm.MemoryManager
+	task.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			tmm = mm
+		}
+	})
+	return tmm
+}
+
+// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+//
+// +stateify savable
+type mapsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := getMM(d.task); mm != nil {
+		mm.ReadMapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
+//
+// +stateify savable
+type smapsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*smapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := getMM(d.task); mm != nil {
+		mm.ReadSmapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// +stateify savable
+type taskStatData struct {
+	kernfs.DynamicBytesFile
+
+	t *kernel.Task
+
+	// If tgstats is true, accumulate fault stats (not implemented) and CPU
+	// time across all tasks in t's thread group.
+	tgstats bool
+
+	// pidns is the PID namespace associated with the proc filesystem that
+	// includes the file using this statData.
+	pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*taskStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
+	fmt.Fprintf(buf, "(%s) ", s.t.Name())
+	fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
+	ppid := kernel.ThreadID(0)
+	if parent := s.t.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "%d ", ppid)
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
+	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
+	fmt.Fprintf(buf, "0 " /* flags */)
+	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
+	var cputime usage.CPUStats
+	if s.tgstats {
+		cputime = s.t.ThreadGroup().CPUStats()
+	} else {
+		cputime = s.t.CPUStats()
+	}
+	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
+	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
+	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+
+	// itrealvalue. Since kernel 2.6.17, this field is no longer
+	// maintained, and is hard coded as 0.
+	fmt.Fprintf(buf, "0 ")
+
+	// Start time is relative to boot time, expressed in clock ticks.
+	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+
+	var vss, rss uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+
+	// rsslim.
+	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
+	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
+	terminationSignal := linux.Signal(0)
+	if s.t == s.t.ThreadGroup().Leader() {
+		terminationSignal = s.t.ThreadGroup().TerminationSignal()
+	}
+	fmt.Fprintf(buf, "%d ", terminationSignal)
+	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
+	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
+	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
+	fmt.Fprintf(buf, "0\n" /* exit_code */)
+
+	return nil
+}
+
+// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
+//
+// +stateify savable
+type statmData struct {
+	kernfs.DynamicBytesFile
+
+	t *kernel.Task
+}
+
+var _ dynamicInode = (*statmData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var vss, rss uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+
+	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
+	return nil
+}
+
+// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
+//
+// +stateify savable
+type statusData struct {
+	kernfs.DynamicBytesFile
+
+	t     *kernel.Task
+	pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*statusData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
+	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
+	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
+	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
+	ppid := kernel.ThreadID(0)
+	if parent := s.t.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
+	tpid := kernel.ThreadID(0)
+	if tracer := s.t.Tracer(); tracer != nil {
+		tpid = s.pidns.IDOfTask(tracer)
+	}
+	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
+	var fds int
+	var vss, rss, data uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.Size()
+		}
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+			data = mm.VirtualDataSize()
+		}
+	})
+	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
+	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
+	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
+	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
+	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
+	creds := s.t.Credentials()
+	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
+	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
+	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
+	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
+	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	// We unconditionally report a single NUMA node. See
+	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
+	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
+	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
+	return nil
+}
+
+// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+type ioUsage interface {
+	// IOUsage returns the io usage data.
+	IOUsage() *usage.IO
+}
+
+// +stateify savable
+type ioData struct {
+	kernfs.DynamicBytesFile
+
+	ioUsage
+}
+
+var _ dynamicInode = (*ioData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	io := usage.IO{}
+	io.Accumulate(i.IOUsage())
+
+	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
+	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
+	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
+	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
+	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
+	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
+	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
new file mode 100644
index 000000000..d8f92d52f
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -0,0 +1,218 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	defaultPermission = 0444
+	selfName          = "self"
+	threadSelfName    = "thread-self"
+)
+
+// InoGenerator generates unique inode numbers for a given filesystem.
+type InoGenerator interface {
+	NextIno() uint64
+}
+
+// tasksInode represents the inode for /proc/ directory.
+//
+// +stateify savable
+type tasksInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+
+	inoGen InoGenerator
+	pidns  *kernel.PIDNamespace
+
+	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
+	// Linux. So handle them outside of OrderedChildren.
+	selfSymlink       *vfs.Dentry
+	threadSelfSymlink *vfs.Dentry
+}
+
+var _ kernfs.Inode = (*tasksInode)(nil)
+
+func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) {
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+	contents := map[string]*kernfs.Dentry{
+		//"cpuinfo":     newCPUInfo(ctx, msrc),
+		//"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc),
+		"loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}),
+		"meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}),
+		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"),
+		"stat":    newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}),
+		//"uptime":      newUptime(ctx, msrc),
+		//"version": newVersionData(root, inoGen.NextIno(), k),
+		"version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}),
+	}
+
+	inode := &tasksInode{
+		pidns:             pidns,
+		inoGen:            inoGen,
+		selfSymlink:       newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+		threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+	}
+	inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := inode.OrderedChildren.Populate(dentry, contents)
+	inode.IncLinks(links)
+
+	return inode, dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	// Try to lookup a corresponding task.
+	tid, err := strconv.ParseUint(name, 10, 64)
+	if err != nil {
+		// If it failed to parse, check if it's one of the special handled files.
+		switch name {
+		case selfName:
+			return i.selfSymlink, nil
+		case threadSelfName:
+			return i.threadSelfSymlink, nil
+		}
+		return nil, syserror.ENOENT
+	}
+
+	task := i.pidns.TaskWithID(kernel.ThreadID(tid))
+	if task == nil {
+		return nil, syserror.ENOENT
+	}
+
+	taskDentry := newTaskInode(i.inoGen, task, i.pidns, true)
+	return taskDentry.VFSDentry(), nil
+}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Valid(ctx context.Context) bool {
+	return true
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
+	const FIRST_PROCESS_ENTRY = 256
+
+	// Use maxTaskID to shortcut searches that will result in 0 entries.
+	const maxTaskID = kernel.TasksLimit + 1
+	if offset >= maxTaskID {
+		return offset, nil
+	}
+
+	// According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories
+	// start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by
+	// '/proc/thread-self' and then '/proc/[pid]'.
+	if offset < FIRST_PROCESS_ENTRY {
+		offset = FIRST_PROCESS_ENTRY
+	}
+
+	if offset == FIRST_PROCESS_ENTRY {
+		dirent := vfs.Dirent{
+			Name:    selfName,
+			Type:    linux.DT_LNK,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
+	if offset == FIRST_PROCESS_ENTRY+1 {
+		dirent := vfs.Dirent{
+			Name:    threadSelfName,
+			Type:    linux.DT_LNK,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
+
+	// Collect all tasks that TGIDs are greater than the offset specified. Per
+	// Linux we only include in directory listings if it's the leader. But for
+	// whatever crazy reason, you can still walk to the given node.
+	var tids []int
+	startTid := offset - FIRST_PROCESS_ENTRY - 2
+	for _, tg := range i.pidns.ThreadGroups() {
+		tid := i.pidns.IDOfThreadGroup(tg)
+		if int64(tid) < startTid {
+			continue
+		}
+		if leader := tg.Leader(); leader != nil {
+			tids = append(tids, int(tid))
+		}
+	}
+
+	if len(tids) == 0 {
+		return offset, nil
+	}
+
+	sort.Ints(tids)
+	for _, tid := range tids {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(tid), 10),
+			Type:    linux.DT_DIR,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
+	return maxTaskID, nil
+}
+
+// Open implements kernfs.Inode.
+func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
+	stat := i.InodeAttrs.Stat(vsfs)
+
+	// Add dynamic children to link count.
+	for _, tg := range i.pidns.ThreadGroups() {
+		if leader := tg.Leader(); leader != nil {
+			stat.Nlink++
+		}
+	}
+
+	return stat
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
new file mode 100644
index 000000000..91f30a798
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -0,0 +1,92 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"fmt"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type selfSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*selfSymlink)(nil)
+
+func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+	inode := &selfSymlink{pidns: pidns}
+	inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// Who is reading this link?
+		return "", syserror.EINVAL
+	}
+	tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+	if tgid == 0 {
+		return "", syserror.ENOENT
+	}
+	return strconv.FormatUint(uint64(tgid), 10), nil
+}
+
+type threadSelfSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*threadSelfSymlink)(nil)
+
+func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+	inode := &threadSelfSymlink{pidns: pidns}
+	inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// Who is reading this link?
+		return "", syserror.EINVAL
+	}
+	tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+	tid := s.pidns.IDOfTask(t)
+	if tid == 0 || tgid == 0 {
+		return "", syserror.ENOENT
+	}
+	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
new file mode 100644
index 000000000..ca8c87ec2
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -0,0 +1,555 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"fmt"
+	"math"
+	"path"
+	"strconv"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var (
+	// Next offset 256 by convention. Adds 1 for the next offset.
+	selfLink       = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1}
+	threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1}
+
+	// /proc/[pid] next offset starts at 256+2 (files above), then adds the
+	// PID, and adds 1 for the next offset.
+	proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1}
+	proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1}
+	proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1}
+)
+
+type testIterDirentsCallback struct {
+	dirents []vfs.Dirent
+}
+
+func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool {
+	t.dirents = append(t.dirents, d)
+	return true
+}
+
+func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
+	if got := len(dirs); got < 2 {
+		return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs)
+	}
+	for i, want := range []string{".", ".."} {
+		if got := dirs[i].Name; got != want {
+			return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got)
+		}
+		if got := dirs[i].Type; got != linux.DT_DIR {
+			return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got)
+		}
+	}
+	return dirs[2:], nil
+}
+
+func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
+	wants := map[string]vfs.Dirent{
+		"loadavg":     {Type: linux.DT_REG},
+		"meminfo":     {Type: linux.DT_REG},
+		"mounts":      {Type: linux.DT_LNK},
+		"self":        selfLink,
+		"stat":        {Type: linux.DT_REG},
+		"thread-self": threadSelfLink,
+		"version":     {Type: linux.DT_REG},
+	}
+	return checkFiles(gots, wants)
+}
+
+func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
+	wants := map[string]vfs.Dirent{
+		"io":     {Type: linux.DT_REG},
+		"maps":   {Type: linux.DT_REG},
+		"smaps":  {Type: linux.DT_REG},
+		"stat":   {Type: linux.DT_REG},
+		"statm":  {Type: linux.DT_REG},
+		"status": {Type: linux.DT_REG},
+	}
+	return checkFiles(gots, wants)
+}
+
+func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) {
+	// Go over all files, when there is a match, the file is removed from both
+	// 'gots' and 'wants'. wants is expected to reach 0, as all files must
+	// be present. Remaining files in 'gots', is returned to caller to decide
+	// whether this is valid or not.
+	for i := 0; i < len(gots); i++ {
+		got := gots[i]
+		want, ok := wants[got.Name]
+		if !ok {
+			continue
+		}
+		if want.Type != got.Type {
+			return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got)
+		}
+		if want.NextOff != 0 && want.NextOff != got.NextOff {
+			return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got)
+		}
+
+		delete(wants, got.Name)
+		gots = append(gots[0:i], gots[i+1:]...)
+		i--
+	}
+	if len(wants) != 0 {
+		return gots, fmt.Errorf("not all files were found, missing: %+v", wants)
+	}
+	return gots, nil
+}
+
+func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) {
+	k, err := boot()
+	if err != nil {
+		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
+	}
+	return ctx, vfsObj, mntns.Root(), nil
+}
+
+func TestTasksEmpty(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTasksStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(cb.dirents) != 0 {
+		t.Errorf("found more files than expected: %+v", cb.dirents)
+	}
+}
+
+func TestTasks(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	var tasks []*kernel.Task
+	for i := 0; i < 5; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc)
+		if err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+		tasks = append(tasks, task)
+	}
+
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTasksStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	lastPid := 0
+	for _, d := range cb.dirents {
+		pid, err := strconv.Atoi(d.Name)
+		if err != nil {
+			t.Fatalf("Invalid process directory %q", d.Name)
+		}
+		if lastPid > pid {
+			t.Errorf("pids not in order: %v", cb.dirents)
+		}
+		found := false
+		for _, t := range tasks {
+			if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) {
+				found = true
+			}
+		}
+		if !found {
+			t.Errorf("Additional task ID %d listed: %v", pid, tasks)
+		}
+		// Next offset starts at 256+2 ('self' and 'thread-self'), then adds the
+		// PID, and adds 1 for the next offset.
+		if want := int64(256 + 2 + pid + 1); d.NextOff != want {
+			t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d)
+		}
+	}
+
+	// Test lookup.
+	for _, path := range []string{"/1", "/2"} {
+		fd, err := vfsObj.OpenAt(
+			ctx,
+			auth.CredentialsFromContext(ctx),
+			&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)},
+			&vfs.OpenOptions{},
+		)
+		if err != nil {
+			t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
+		}
+		buf := make([]byte, 1)
+		bufIOSeq := usermem.BytesIOSequence(buf)
+		if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
+			t.Errorf("wrong error reading directory: %v", err)
+		}
+	}
+
+	if _, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")},
+		&vfs.OpenOptions{},
+	); err != syserror.ENOENT {
+		t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err)
+	}
+}
+
+func TestTasksOffset(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	for i := 0; i < 3; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+	}
+
+	for _, tc := range []struct {
+		name   string
+		offset int64
+		wants  map[string]vfs.Dirent
+	}{
+		{
+			name:   "small offset",
+			offset: 100,
+			wants: map[string]vfs.Dirent{
+				"self":        selfLink,
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "offset at start",
+			offset: 256,
+			wants: map[string]vfs.Dirent{
+				"self":        selfLink,
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "skip /proc/self",
+			offset: 257,
+			wants: map[string]vfs.Dirent{
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "skip symlinks",
+			offset: 258,
+			wants: map[string]vfs.Dirent{
+				"1": proc1,
+				"2": proc2,
+				"3": proc3,
+			},
+		},
+		{
+			name:   "skip first process",
+			offset: 260,
+			wants: map[string]vfs.Dirent{
+				"2": proc2,
+				"3": proc3,
+			},
+		},
+		{
+			name:   "last process",
+			offset: 261,
+			wants: map[string]vfs.Dirent{
+				"3": proc3,
+			},
+		},
+		{
+			name:   "after last",
+			offset: 262,
+			wants:  nil,
+		},
+		{
+			name:   "TaskLimit+1",
+			offset: kernel.TasksLimit + 1,
+			wants:  nil,
+		},
+		{
+			name:   "max",
+			offset: math.MaxInt64,
+			wants:  nil,
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			fd, err := vfsObj.OpenAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+				&vfs.OpenOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+			}
+			if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil {
+				t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
+			}
+
+			cb := testIterDirentsCallback{}
+			if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+				t.Fatalf("IterDirents(): %v", err)
+			}
+			if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil {
+				t.Error(err.Error())
+			}
+			if len(cb.dirents) != 0 {
+				t.Errorf("found more files than expected: %+v", cb.dirents)
+			}
+		})
+	}
+}
+
+func TestTask(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	_, err = createTask(ctx, "name", tc)
+	if err != nil {
+		t.Fatalf("CreateTask(): %v", err)
+	}
+
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTaskStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(cb.dirents) != 0 {
+		t.Errorf("found more files than expected: %+v", cb.dirents)
+	}
+}
+
+func TestProcSelf(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	task, err := createTask(ctx, "name", tc)
+	if err != nil {
+		t.Fatalf("CreateTask(): %v", err)
+	}
+
+	fd, err := vfsObj.OpenAt(
+		task,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTaskStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(cb.dirents) != 0 {
+		t.Errorf("found more files than expected: %+v", cb.dirents)
+	}
+}
+
+func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) {
+	t.Logf("Iterating: /proc%s", fd.MappedName(ctx))
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	var err error
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	for _, d := range cb.dirents {
+		childPath := path.Join(fd.MappedName(ctx), d.Name)
+		if d.Type == linux.DT_LNK {
+			link, err := vfsObj.ReadlinkAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+			)
+			if err != nil {
+				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err)
+			} else {
+				t.Logf("Skipping symlink: /proc%s => %s", childPath, link)
+			}
+			continue
+		}
+
+		t.Logf("Opening: /proc%s", childPath)
+		child, err := vfsObj.OpenAt(
+			ctx,
+			auth.CredentialsFromContext(ctx),
+			&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+			&vfs.OpenOptions{},
+		)
+		if err != nil {
+			t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err)
+			continue
+		}
+		stat, err := child.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			t.Errorf("Stat(%v) failed: %v", childPath, err)
+		}
+		if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type {
+			t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type)
+		}
+		if d.Type == linux.DT_DIR {
+			// Found another dir, let's do it again!
+			iterateDir(ctx, t, vfsObj, root, child)
+		}
+	}
+}
+
+// TestTree iterates all directories and stats every file.
+func TestTree(t *testing.T) {
+	uberCtx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(uberCtx)
+	var tasks []*kernel.Task
+	for i := 0; i < 5; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc)
+		if err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+		tasks = append(tasks, task)
+	}
+
+	ctx := tasks[0]
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(uberCtx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+	}
+	iterateDir(ctx, t, vfsObj, root, fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
index e1643d4e0..367f2396b 100644
--- a/pkg/sentry/fsimpl/proc/version.go
+++ b/pkg/sentry/fsimpl/proc/version.go
@@ -19,19 +19,21 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 // versionData implements vfs.DynamicBytesSource for /proc/version.
 //
 // +stateify savable
 type versionData struct {
+	kernfs.DynamicBytesFile
+
 	// k is the owning Kernel.
 	k *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*versionData)(nil)
+var _ dynamicInode = (*versionData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 0cc751eb8..a5b285987 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -1,14 +1,13 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "dentry_list",
     out = "dentry_list.go",
-    package = "memfs",
+    package = "tmpfs",
     prefix = "dentry",
     template = "//pkg/ilist:generic_list",
     types = {
@@ -18,25 +17,34 @@ go_template_instance(
 )
 
 go_library(
-    name = "memfs",
+    name = "tmpfs",
     srcs = [
         "dentry_list.go",
         "directory.go",
         "filesystem.go",
-        "memfs.go",
         "named_pipe.go",
         "regular_file.go",
         "symlink.go",
+        "tmpfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
         "//pkg/fspath",
+        "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/safemem",
+        "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
@@ -48,8 +56,9 @@ go_test(
     size = "small",
     srcs = ["benchmark_test.go"],
     deps = [
-        ":memfs",
+        ":tmpfs",
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
@@ -62,15 +71,20 @@ go_test(
 )
 
 go_test(
-    name = "memfs_test",
+    name = "tmpfs_test",
     size = "small",
-    srcs = ["pipe_test.go"],
-    embed = [":memfs"],
+    srcs = [
+        "pipe_test.go",
+        "regular_file_test.go",
+    ],
+    embed = [":tmpfs"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/contexttest",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 4a7a94a52..d88c83499 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -21,12 +21,13 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -175,8 +176,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 
 			// Create VFS.
 			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -193,9 +196,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -216,7 +219,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -237,7 +240,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -364,8 +367,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 
 			// Create VFS.
 			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -378,9 +383,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			root := mntns.Root()
 			defer root.DecRef()
 			pop := vfs.PathOperation{
-				Root:     root,
-				Start:    root,
-				Pathname: mountPointName,
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(mountPointName),
 			}
 			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 				Mode: 0755,
@@ -394,7 +399,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef()
 			// Create and mount the submount.
-			if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil {
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
@@ -408,9 +413,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -438,7 +443,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -458,7 +463,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 0bd82e480..887ca2619 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
new file mode 100644
index 000000000..26979729e
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -0,0 +1,698 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
+	if err != nil {
+		return nil, err
+	}
+	if nextVFSD == nil {
+		// Since the Dentry tree is the sole source of truth for tmpfs, if it's
+		// not in the Dentry tree, it doesn't exist.
+		return nil, syserror.ENOENT
+	}
+	next := nextVFSD.Impl().(*dentry)
+	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// TODO: symlink traversals update access time
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return next, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	for !rp.Final() {
+		next, err := stepLocked(rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		next, err := stepLocked(rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the child exists we want to return EEXIST immediately instead
+	// of attempting symlink/mount traversal.
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	// In memfs, the only way to cause a dentry to be disowned is by removing
+	// it from the filesystem, so this check is equivalent to checking if
+	// parent has been removed.
+	if parent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	return create(parent, name)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		if d.inode.isDir() {
+			return syserror.EPERM
+		}
+		if d.inode.nlink == 0 {
+			return syserror.ENOENT
+		}
+		if d.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		d.inode.incLinksLocked()
+		child := fs.newDentry(d.inode)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
+		if parent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		parent.inode.incLinksLocked() // from child's ".."
+		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		switch opts.Mode.FileType() {
+		case 0, linux.S_IFREG:
+			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFIFO:
+			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+			// Not yet supported.
+			return syserror.EPERM
+		default:
+			return syserror.EINVAL
+		}
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		// Not yet supported.
+		return nil, syserror.EOPNOTSUPP
+	}
+
+	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
+	// don't need fs.mu for writing.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.mu.RUnlock()
+		d, err := resolveLocked(rp)
+		if err != nil {
+			return nil, err
+		}
+		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
+	}
+
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	start := rp.Start().Impl().(*dentry)
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		// Reject attempts to open directories with O_CREAT.
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
+	}
+afterTrailingSymlink:
+	parent, err := walkParentDirLocked(rp, start)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	child, err := stepLocked(rp, parent)
+	if err == syserror.ENOENT {
+		// Already checked for searchability above; now check for writability.
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return child.open(ctx, rp, opts.Flags, true)
+	}
+	if err != nil {
+		return nil, err
+	}
+	// Do we need to resolve a trailing symlink?
+	if !rp.Done() {
+		start = parent
+		goto afterTrailingSymlink
+	}
+	// Open existing file.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	return child.open(ctx, rp, opts.Flags, false)
+}
+
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if !afterCreate {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+			return nil, err
+		}
+	}
+	mnt := rp.Mount()
+	switch impl := d.inode.impl.(type) {
+	case *regularFile:
+		var fd regularFileFD
+		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+		if fd.writable {
+			if err := mnt.CheckBeginWrite(); err != nil {
+				return nil, err
+			}
+			// mnt.EndWrite() is called by regularFileFD.Release().
+		}
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		if flags&linux.O_TRUNC != 0 {
+			impl.mu.Lock()
+			impl.data.Truncate(0, impl.memFile)
+			atomic.StoreUint64(&impl.size, 0)
+			impl.mu.Unlock()
+		}
+		return &fd.vfsfd, nil
+	case *directory:
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		var fd directoryFD
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		return &fd.vfsfd, nil
+	case *symlink:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	case *namedPipe:
+		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
+	}
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	symlink, ok := d.inode.impl.(*symlink)
+	if !ok {
+		return "", syserror.EINVAL
+	}
+	return symlink.target, nil
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// TODO(b/145974740): Support renameat2 flags.
+		return syserror.EINVAL
+	}
+
+	// Resolve newParent first to verify that it's on this Mount.
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the existing child is a symlink or mount point then we want
+	// to rename over it rather than follow it.
+	renamedVFSD := oldParent.vfsd.Child(oldName)
+	if renamedVFSD == nil {
+		return syserror.ENOENT
+	}
+	renamed := renamedVFSD.Impl().(*dentry)
+	if renamed.inode.isDir() {
+		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			// Writability is needed to change renamed's "..".
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	if replacedVFSD != nil {
+		replaced = replacedVFSD.Impl().(*dentry)
+		if replaced.inode.isDir() {
+			if !renamed.inode.isDir() {
+				return syserror.EISDIR
+			}
+			if replaced.vfsd.HasChildren() {
+				return syserror.ENOTEMPTY
+			}
+		} else {
+			if rp.MustBeDir() {
+				return syserror.ENOTDIR
+			}
+			if renamed.inode.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	} else {
+		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+	}
+	if newParent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+
+	// Linux places this check before some of those above; we do it here for
+	// simplicity, under the assumption that applications are not intentionally
+	// doing noop renames expecting them to succeed where non-noop renames
+	// would fail.
+	if renamedVFSD == replacedVFSD {
+		return nil
+	}
+	vfsObj := rp.VirtualFilesystem()
+	oldParentDir := oldParent.inode.impl.(*directory)
+	newParentDir := newParent.inode.impl.(*directory)
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+		return err
+	}
+	if replaced != nil {
+		newParentDir.childList.Remove(replaced)
+		if replaced.inode.isDir() {
+			newParent.inode.decLinksLocked() // from replaced's ".."
+		}
+		replaced.inode.decLinksLocked()
+	}
+	oldParentDir.childList.Remove(renamed)
+	newParentDir.childList.PushBack(renamed)
+	if renamed.inode.isDir() {
+		oldParent.inode.decLinksLocked()
+		newParent.inode.incLinksLocked()
+	}
+	// TODO: update timestamps and parent directory sizes
+	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if !child.inode.isDir() {
+		return syserror.ENOTDIR
+	}
+	if childVFSD.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	parent.inode.impl.(*directory).childList.Remove(child)
+	parent.inode.decLinksLocked() // from child's ".."
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	// TODO: implement inode.setStat
+	return syserror.EPERM
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	var stat linux.Statx
+	d.inode.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if child.inode.isDir() {
+		return syserror.EISDIR
+	}
+	if !rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	parent.inode.impl.(*directory).childList.Remove(child)
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(b/127675828): support extended attributes
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	// TODO(b/127675828): support extended attributes
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 91cb4b1fc..40bde54de 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 		return nil, err
 	}
 	mnt := rp.Mount()
-	mnt.IncRef()
-	vfsd.IncRef()
-	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 5bf527c80..70b42a6ec 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
 	"bytes"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -38,7 +39,7 @@ func TestSeparateFDs(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	rfdchan := make(chan *vfs.FileDescription)
@@ -76,7 +77,7 @@ func TestNonblockingRead(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
@@ -108,7 +109,7 @@ func TestNonblockingWriteError(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
@@ -126,7 +127,7 @@ func TestSingleFD(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
@@ -151,8 +152,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
@@ -160,10 +163,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	// Create the pipe.
 	root := mntns.Root()
 	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Pathname:           fileName,
-		FollowFinalSymlink: true,
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
 	}
 	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
 	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
@@ -174,7 +176,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}, &vfs.StatOptions{})
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
new file mode 100644
index 000000000..f51e247a7
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -0,0 +1,357 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type regularFile struct {
+	inode inode
+
+	// memFile is a platform.File used to allocate pages to this regularFile.
+	memFile *pgalloc.MemoryFile
+
+	// mu protects the fields below.
+	mu sync.RWMutex
+
+	// data maps offsets into the file to offsets into memFile that store
+	// the file's data.
+	data fsutil.FileRangeSet
+
+	// size is the size of data, but accessed using atomic memory
+	// operations to avoid locking in inode.stat().
+	size uint64
+
+	// seals represents file seals on this inode.
+	seals uint32
+}
+
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+	file := &regularFile{
+		memFile: fs.memFile,
+	}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// These are immutable.
+	readable bool
+	writable bool
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	// offMu serializes operations that may mutate off.
+	off   int64
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {
+	if fd.writable {
+		fd.vfsfd.VirtualDentry().Mount().EndWrite()
+	}
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putRegularFileReadWriter(rw)
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	srclen := src.NumBytes()
+	if srclen == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	end := offset + srclen
+	if end < offset {
+		// Overflow.
+		return 0, syserror.EFBIG
+	}
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := src.CopyInTo(ctx, rw)
+	putRegularFileReadWriter(rw)
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return nil
+}
+
+// regularFileReadWriter implements safemem.Reader and Safemem.Writer.
+type regularFileReadWriter struct {
+	file *regularFile
+
+	// Offset into the file to read/write at. Note that this may be
+	// different from the FD offset if PRead/PWrite is used.
+	off uint64
+}
+
+var regularFileReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &regularFileReadWriter{}
+	},
+}
+
+func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
+	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
+	rw.file = file
+	rw.off = uint64(offset)
+	return rw
+}
+
+func putRegularFileReadWriter(rw *regularFileReadWriter) {
+	rw.file = nil
+	regularFileReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	rw.file.mu.RLock()
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.file.size {
+		rw.file.mu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.file.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Tmpfs holes are zero-filled.
+			gapmr := gap.Range().Intersect(mr)
+			dst := dsts.TakeFirst64(gapmr.Length())
+			n, err := safemem.ZeroSeq(dst)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+	rw.file.mu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	rw.file.mu.Lock()
+
+	// Compute the range to write (overflow-checked).
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	// Check if seals prevent either file growth or all writes.
+	switch {
+	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+		rw.file.mu.Unlock()
+		return 0, syserror.EPERM
+	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+		// When growth is sealed, Linux effectively allows writes which would
+		// normally grow the file to partially succeed up to the current EOF,
+		// rounded down to the page boundary before the EOF.
+		//
+		// This happens because writes (and thus the growth check) for tmpfs
+		// files proceed page-by-page on Linux, and the final write to the page
+		// containing EOF fails, resulting in a partial write up to the start of
+		// that page.
+		//
+		// To emulate this behaviour, artifically truncate the write to the
+		// start of the page containing the current EOF.
+		//
+		// See Linux, mm/filemap.c:generic_perform_write() and
+		// mm/shmem.c:shmem_write_begin().
+		if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
+			end = pgstart
+		}
+		if end <= rw.off {
+			// Truncation would result in no data being written.
+			rw.file.mu.Unlock()
+			return 0, syserror.EPERM
+		}
+	}
+
+	// Page-aligned mr for when we need to allocate memory. RoundUp can't
+	// overflow since end is an int64.
+	pgstartaddr := usermem.Addr(rw.off).RoundDown()
+	pgendaddr, _ := usermem.Addr(end).RoundUp()
+	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += uint64(n)
+			srcs = srcs.DropFirst64(n)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Allocate memory for the write.
+			gapMR := gap.Range().Intersect(pgMR)
+			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Write to that memory as usual.
+			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	// If the write ends beyond the file's previous size, it causes the
+	// file to grow.
+	if rw.off > rw.file.size {
+		atomic.StoreUint64(&rw.file.size, rw.off)
+	}
+
+	rw.file.mu.Unlock()
+	return done, retErr
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
new file mode 100644
index 000000000..3731c5b6f
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -0,0 +1,224 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is not nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+
+	// Create the file that will be write/read.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(filename),
+		FollowFinalSymlink: true,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  0644,
+	})
+	if err != nil {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+	}
+
+	return fd, func() {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+	}, nil
+}
+
+// Test that we can write some data to a file and read it back.`
+func TestSimpleWriteRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "simpleReadWrite")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write.
+	data := []byte("foobarbaz")
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+
+	// Seek back to beginning.
+	if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
+		t.Fatalf("fd.Seek failed: %v", err)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want {
+		t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want)
+	}
+
+	// Read.
+	buf := make([]byte, len(data))
+	n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Read got short read length %d, want %d", n, len(data))
+	}
+	if got, want := string(buf), string(data); got != want {
+		t.Errorf("Read got %q want %s", got, want)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+}
+
+func TestPWrite(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PRead")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill file with 1k 'a's.
+	data := bytes.Repeat([]byte{'a'}, 1000)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Write "gVisor is awesome" at various offsets.
+	buf := []byte("gVisor is awesome")
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+	for _, offset := range offsets {
+		name := fmt.Sprintf("PWrite offset=%d", offset)
+		t.Run(name, func(t *testing.T) {
+			n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{})
+			if err != nil {
+				t.Errorf("fd.PWrite got err %v want nil", err)
+			}
+			if n != int64(len(buf)) {
+				t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf))
+			}
+
+			// Update data to reflect expected file contents.
+			if len(data) < offset+len(buf) {
+				data = append(data, make([]byte, (offset+len(buf))-len(data))...)
+			}
+			copy(data[offset:], buf)
+
+			// Read the whole file and compare with data.
+			readBuf := make([]byte, len(data))
+			n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{})
+			if err != nil {
+				t.Fatalf("fd.PRead failed: %v", err)
+			}
+			if n != int64(len(data)) {
+				t.Errorf("fd.PRead got short read length %d, want %d", n, len(data))
+			}
+			if got, want := string(readBuf), string(data); got != want {
+				t.Errorf("PRead got %q want %s", got, want)
+			}
+
+		})
+	}
+}
+
+func TestPRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PRead")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write 100 sequences of 'gVisor is awesome'.
+	data := bytes.Repeat([]byte("gVisor is awsome"), 100)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Read various sizes from various offsets.
+	sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000}
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+
+	for _, size := range sizes {
+		for _, offset := range offsets {
+			name := fmt.Sprintf("PRead offset=%d size=%d", offset, size)
+			t.Run(name, func(t *testing.T) {
+				var (
+					wantRead []byte
+					wantErr  error
+				)
+				if offset < len(data) {
+					wantRead = data[offset:]
+				} else if size > 0 {
+					wantErr = io.EOF
+				}
+				if offset+size < len(data) {
+					wantRead = wantRead[:size]
+				}
+				buf := make([]byte, size)
+				n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{})
+				if err != wantErr {
+					t.Errorf("fd.PRead got err %v want %v", err, wantErr)
+				}
+				if n != int64(len(wantRead)) {
+					t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead))
+				}
+				if got := string(buf[:n]); got != string(wantRead) {
+					t.Errorf("fd.PRead got %q want %q", got, string(wantRead))
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b2ac2cbeb..5246aca84 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 4cb2a4e0f..7be6faa5b 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -12,29 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package memfs provides a filesystem implementation that behaves like tmpfs:
+// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
 // the Dentry tree is the sole source of truth for the state of the filesystem.
 //
-// memfs is intended primarily to demonstrate filesystem implementation
-// patterns. Real uses cases for an in-memory filesystem should use tmpfs
-// instead.
-//
 // Lock order:
 //
 // filesystem.mu
 //   regularFileFD.offMu
 //     regularFile.mu
 //   inode.mu
-package memfs
+package tmpfs
 
 import (
 	"fmt"
+	"math"
 	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -46,6 +44,9 @@ type FilesystemType struct{}
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
+	// memFile is used to allocate pages to for regular files.
+	memFile *pgalloc.MemoryFile
+
 	// mu serializes changes to the Dentry tree.
 	mu sync.RWMutex
 
@@ -54,7 +55,13 @@ type filesystem struct {
 
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	var fs filesystem
+	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
+	if memFileProvider == nil {
+		panic("MemoryFileProviderFromContext returned nil")
+	}
+	fs := filesystem{
+		memFile: memFileProvider.MemoryFile(),
+	}
 	fs.vfsfs.Init(vfsObj, &fs)
 	root := fs.newDentry(fs.newDirectory(creds, 01777))
 	return &fs.vfsfs, &root.vfsd, nil
@@ -64,12 +71,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 func (fs *filesystem) Release() {
 }
 
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// All filesystem state is in-memory.
-	return nil
-}
-
 // dentry implements vfs.DentryImpl.
 type dentry struct {
 	vfsd vfs.Dentry
@@ -79,11 +80,11 @@ type dentry struct {
 	// immutable.
 	inode *inode
 
-	// memfs doesn't count references on dentries; because the dentry tree is
+	// tmpfs doesn't count references on dentries; because the dentry tree is
 	// the sole source of truth, it is by definition always consistent with the
 	// state of the filesystem. However, it does count references on inodes,
 	// because inode resources are released when all references are dropped.
-	// (memfs doesn't really have resources to release, but we implement
+	// (tmpfs doesn't really have resources to release, but we implement
 	// reference counting because tmpfs regular files will.)
 
 	// dentryEntry (ugh) links dentries into their parent directory.childList.
@@ -137,6 +138,8 @@ type inode struct {
 	impl interface{} // immutable
 }
 
+const maxLinks = math.MaxUint32
+
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
 	i.refs = 1
 	i.mode = uint32(mode)
@@ -147,25 +150,33 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
 	i.impl = impl
 }
 
-// Preconditions: filesystem.mu must be locked for writing.
+// incLinksLocked increments i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
 func (i *inode) incLinksLocked() {
-	if atomic.AddUint32(&i.nlink, 1) <= 1 {
-		panic("memfs.inode.incLinksLocked() called with no existing links")
+	if i.nlink == 0 {
+		panic("tmpfs.inode.incLinksLocked() called with no existing links")
 	}
+	if i.nlink == maxLinks {
+		panic("memfs.inode.incLinksLocked() called with maximum link count")
+	}
+	atomic.AddUint32(&i.nlink, 1)
 }
 
-// Preconditions: filesystem.mu must be locked for writing.
+// decLinksLocked decrements i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
-	if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
-		i.decRef()
-	} else if nlink == ^uint32(0) { // negative overflow
-		panic("memfs.inode.decLinksLocked() called with no existing links")
+	if i.nlink == 0 {
+		panic("tmpfs.inode.decLinksLocked() called with no existing links")
 	}
+	atomic.AddUint32(&i.nlink, ^uint32(0))
 }
 
 func (i *inode) incRef() {
 	if atomic.AddInt64(&i.refs, 1) <= 1 {
-		panic("memfs.inode.incRef() called without holding a reference")
+		panic("tmpfs.inode.incRef() called without holding a reference")
 	}
 }
 
@@ -184,14 +195,14 @@ func (i *inode) tryIncRef() bool {
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
 		// This is unnecessary; it's mostly to simulate what tmpfs would do.
-		if regfile, ok := i.impl.(*regularFile); ok {
-			regfile.mu.Lock()
-			regfile.data = nil
-			atomic.StoreInt64(&regfile.dataLen, 0)
-			regfile.mu.Unlock()
+		if regFile, ok := i.impl.(*regularFile); ok {
+			regFile.mu.Lock()
+			regFile.data.DropAll(regFile.memFile)
+			atomic.StoreUint64(&regFile.size, 0)
+			regFile.mu.Unlock()
 		}
 	} else if refs < 0 {
-		panic("memfs.inode.decRef() called without holding a reference")
+		panic("tmpfs.inode.decRef() called without holding a reference")
 	}
 }
 
@@ -215,7 +226,7 @@ func (i *inode) statTo(stat *linux.Statx) {
 	case *regularFile:
 		stat.Mode |= linux.S_IFREG
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(atomic.LoadInt64(&impl.dataLen))
+		stat.Size = uint64(atomic.LoadUint64(&impl.size))
 		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
 		// a uint64 accessed using atomic memory operations to avoid taking
 		// locks).
@@ -256,13 +267,11 @@ func (i *inode) direntType() uint8 {
 	}
 }
 
-// fileDescription is embedded by memfs implementations of
+// fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	flags uint32 // status flags; immutable
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -273,18 +282,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index bd3fb4c03..8653d2f63 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		mounts.IncRef()
 	}
 
-	tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
 	// Get the root directory from the MountNamespace.
@@ -1191,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup {
 	return k.globalInit
 }
 
+// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
+func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
+	k.globalInit = tg
+}
+
 // ApplicationCores returns the number of CPUs visible to sandboxed
 // applications.
 func (k *Kernel) ApplicationCores() uint {
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 24ea002ba..b14429854 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -15,17 +15,29 @@
 package kernel
 
 import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Restartable sequences, as described in https://lwn.net/Articles/650333/.
+// Restartable sequences.
+//
+// We support two different APIs for restartable sequences.
+//
+//  1. The upstream interface added in v4.18.
+//  2. The interface described in https://lwn.net/Articles/650333/.
+//
+// Throughout this file and other parts of the kernel, the latter is referred
+// to as "old rseq". This interface was never merged upstream, but is supported
+// for a limited set of applications that use it regardless.
 
-// RSEQCriticalRegion describes a restartable sequence critical region.
+// OldRSeqCriticalRegion describes an old rseq critical region.
 //
 // +stateify savable
-type RSEQCriticalRegion struct {
+type OldRSeqCriticalRegion struct {
 	// When a task in this thread group has its CPU preempted (as defined by
 	// platform.ErrContextCPUPreempted) or has a signal delivered to an
 	// application handler while its instruction pointer is in CriticalSection,
@@ -35,86 +47,359 @@ type RSEQCriticalRegion struct {
 	Restart         usermem.Addr
 }
 
-// RSEQAvailable returns true if t supports restartable sequences.
-func (t *Task) RSEQAvailable() bool {
+// RSeqAvailable returns true if t supports (old and new) restartable sequences.
+func (t *Task) RSeqAvailable() bool {
 	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
 }
 
-// RSEQCriticalRegion returns a copy of t's thread group's current restartable
-// sequence.
-func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
-	return *t.tg.rscr.Load().(*RSEQCriticalRegion)
+// SetRSeq registers addr as this thread's rseq structure.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
+	if t.rseqAddr != 0 {
+		if t.rseqAddr != addr {
+			return syserror.EINVAL
+		}
+		if t.rseqSignature != signature {
+			return syserror.EINVAL
+		}
+		return syserror.EBUSY
+	}
+
+	// rseq must be aligned and correctly sized.
+	if addr&(linux.AlignOfRSeq-1) != 0 {
+		return syserror.EINVAL
+	}
+	if length != linux.SizeOfRSeq {
+		return syserror.EINVAL
+	}
+	if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
+		return syserror.EFAULT
+	}
+
+	t.rseqAddr = addr
+	t.rseqSignature = signature
+
+	// Initialize the CPUID.
+	//
+	// Linux implicitly does this on return from userspace, where failure
+	// would cause SIGSEGV.
+	if err := t.rseqUpdateCPU(); err != nil {
+		t.rseqAddr = 0
+		t.rseqSignature = 0
+
+		t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return syserror.EFAULT
+	}
+
+	return nil
 }
 
-// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
+// ClearRSeq unregisters addr as this thread's rseq structure.
 //
-// Preconditions: t.RSEQAvailable() == true.
-func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
+	if t.rseqAddr == 0 {
+		return syserror.EINVAL
+	}
+	if t.rseqAddr != addr {
+		return syserror.EINVAL
+	}
+	if length != linux.SizeOfRSeq {
+		return syserror.EINVAL
+	}
+	if t.rseqSignature != signature {
+		return syserror.EPERM
+	}
+
+	if err := t.rseqClearCPU(); err != nil {
+		return err
+	}
+
+	t.rseqAddr = 0
+	t.rseqSignature = 0
+
+	if t.oldRSeqCPUAddr == 0 {
+		// rseqCPU no longer needed.
+		t.rseqCPU = -1
+	}
+
+	return nil
+}
+
+// OldRSeqCriticalRegion returns a copy of t's thread group's current
+// old restartable sequence.
+func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
+	return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+}
+
+// SetOldRSeqCriticalRegion replaces t's thread group's old restartable
+// sequence.
+//
+// Preconditions: t.RSeqAvailable() == true.
+func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
 	// These checks are somewhat more lenient than in Linux, which (bizarrely)
-	// requires rscr.CriticalSection to be non-empty and rscr.Restart to be
-	// outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
+	// requires r.CriticalSection to be non-empty and r.Restart to be
+	// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
 	// (which disables the critical region).
-	if rscr.CriticalSection.Start == 0 {
-		rscr.CriticalSection.End = 0
-		rscr.Restart = 0
-		t.tg.rscr.Store(&rscr)
+	if r.CriticalSection.Start == 0 {
+		r.CriticalSection.End = 0
+		r.Restart = 0
+		t.tg.oldRSeqCritical.Store(&r)
 		return nil
 	}
-	if rscr.CriticalSection.Start >= rscr.CriticalSection.End {
+	if r.CriticalSection.Start >= r.CriticalSection.End {
 		return syserror.EINVAL
 	}
-	if rscr.CriticalSection.Contains(rscr.Restart) {
+	if r.CriticalSection.Contains(r.Restart) {
 		return syserror.EINVAL
 	}
-	// TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in
-	// the application address range, for consistency with Linux
-	t.tg.rscr.Store(&rscr)
+	// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
+	// the application address range, for consistency with Linux.
+	t.tg.oldRSeqCritical.Store(&r)
 	return nil
 }
 
-// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
-// number.
+// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
+// CPU number.
 //
 // Preconditions: The caller must be running on the task goroutine.
-func (t *Task) RSEQCPUAddr() usermem.Addr {
-	return t.rseqCPUAddr
+func (t *Task) OldRSeqCPUAddr() usermem.Addr {
+	return t.oldRSeqCPUAddr
 }
 
-// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
-// number.
+// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
+// t's CPU number.
 //
-// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
+// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
 // task goroutine. t's AddressSpace must be active.
-func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
-	t.rseqCPUAddr = addr
-	if addr != 0 {
-		t.rseqCPU = int32(hostcpu.GetCPU())
-		if err := t.rseqCopyOutCPU(); err != nil {
-			t.rseqCPUAddr = 0
-			t.rseqCPU = -1
-			return syserror.EINVAL // yes, EINVAL, not err or EFAULT
-		}
-	} else {
-		t.rseqCPU = -1
+func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
+	t.oldRSeqCPUAddr = addr
+
+	// Check that addr is writable.
+	//
+	// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
+	// unfortunate, but unlikely in a correct program.
+	if err := t.rseqUpdateCPU(); err != nil {
+		t.oldRSeqCPUAddr = 0
+		return syserror.EINVAL // yes, EINVAL, not err or EFAULT
 	}
 	return nil
 }
 
 // Preconditions: The caller must be running on the task goroutine. t's
 // AddressSpace must be active.
-func (t *Task) rseqCopyOutCPU() error {
+func (t *Task) rseqUpdateCPU() error {
+	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
+		t.rseqCPU = -1
+		return nil
+	}
+
+	t.rseqCPU = int32(hostcpu.GetCPU())
+
+	// Update both CPUs, even if one fails.
+	rerr := t.rseqCopyOutCPU()
+	oerr := t.oldRSeqCopyOutCPU()
+
+	if rerr != nil {
+		return rerr
+	}
+	return oerr
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) oldRSeqCopyOutCPU() error {
+	if t.oldRSeqCPUAddr == 0 {
+		return nil
+	}
+
 	buf := t.CopyScratchBuffer(4)
 	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
-	_, err := t.CopyOutBytes(t.rseqCPUAddr, buf)
+	_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
+	return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqCopyOutCPU() error {
+	if t.rseqAddr == 0 {
+		return nil
+	}
+
+	buf := t.CopyScratchBuffer(8)
+	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
+	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))     // CPUIDStart
+	usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
+	// N.B. This write is not atomic, but since this occurs on the task
+	// goroutine then as long as userspace uses a single-instruction read
+	// it can't see an invalid value.
+	_, err := t.CopyOutBytes(t.rseqAddr, buf)
+	return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqClearCPU() error {
+	buf := t.CopyScratchBuffer(8)
+	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
+	usermem.ByteOrder.PutUint32(buf, 0)                                   // CPUIDStart
+	usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
+	// N.B. This write is not atomic, but since this occurs on the task
+	// goroutine then as long as userspace uses a single-instruction read
+	// it can't see an invalid value.
+	_, err := t.CopyOutBytes(t.rseqAddr, buf)
 	return err
 }
 
+// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
+//
+// This is a bit complex since both the RSeq and RSeqCriticalSection structs
+// are stored in userspace. So we must:
+//
+// 1. Copy in the address of RSeqCriticalSection from RSeq.
+// 2. Copy in RSeqCriticalSection itself.
+// 3. Validate critical section struct version, address range, abort address.
+// 4. Validate the abort signature (4 bytes preceding abort IP match expected
+//    signature).
+// 5. Clear address of RSeqCriticalSection from RSeq.
+// 6. Finally, conditionally abort.
+//
+// See kernel/rseq.c:rseq_ip_fixup for reference.
+//
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqAddrInterrupt() {
+	if t.rseqAddr == 0 {
+		return
+	}
+
+	critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
+	if !ok {
+		// SetRSeq should validate this.
+		panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
+	}
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		t.Debugf("Only 64-bit rseq supported.")
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	buf := t.CopyScratchBuffer(8)
+	if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
+	if critAddr == 0 {
+		return
+	}
+
+	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
+	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Manually marshal RSeqCriticalSection as this is in the hot path when
+	// rseq is enabled. It must be as fast as possible.
+	//
+	// TODO(b/130243041): Replace with go_marshal.
+	cs := linux.RSeqCriticalSection{
+		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
+		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
+		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
+		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
+		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
+	}
+
+	if cs.Version != 0 {
+		t.Debugf("Unknown version in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	start := usermem.Addr(cs.Start)
+	critRange, ok := start.ToRange(cs.PostCommitOffset)
+	if !ok {
+		t.Debugf("Invalid start and offset in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	abort := usermem.Addr(cs.Abort)
+	if critRange.Contains(abort) {
+		t.Debugf("Abort in critical section in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Verify signature.
+	sigAddr := abort - linux.SizeOfRSeqSignature
+
+	buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
+	if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	sig := usermem.ByteOrder.Uint32(buf)
+	if sig != t.rseqSignature {
+		t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Clear the critical section address.
+	//
+	// NOTE(b/143949567): We don't support any rseq flags, so we always
+	// restart if we are in the critical section, and thus *always* clear
+	// critAddrAddr.
+	if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Finally we can actually decide whether or not to restart.
+	if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
+		return
+	}
+
+	t.Arch().SetIP(uintptr(cs.Abort))
+}
+
 // Preconditions: The caller must be running on the task goroutine.
-func (t *Task) rseqInterrupt() {
-	rscr := t.tg.rscr.Load().(*RSEQCriticalRegion)
-	if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) {
-		t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart)
-		t.Arch().SetIP(uintptr(rscr.Restart))
-		t.Arch().SetRSEQInterruptedIP(ip)
+func (t *Task) oldRSeqInterrupt() {
+	r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+	if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
+		t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
+		t.Arch().SetIP(uintptr(r.Restart))
+		t.Arch().SetOldRSeqInterruptedIP(ip)
 	}
 }
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) rseqInterrupt() {
+	t.rseqAddrInterrupt()
+	t.oldRSeqInterrupt()
+}
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 5bd610f68..19034a21e 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -71,9 +71,20 @@ type Registry struct {
 	mu sync.Mutex `state:"nosave"`
 
 	// shms maps segment ids to segments.
+	//
+	// shms holds all referenced segments, which are removed on the last
+	// DecRef. Thus, it cannot itself hold a reference on the Shm.
+	//
+	// Since removal only occurs after the last (unlocked) DecRef, there
+	// exists a short window during which a Shm still exists in Shm, but is
+	// unreferenced. Users must use TryIncRef to determine if the Shm is
+	// still valid.
 	shms map[ID]*Shm
 
 	// keysToShms maps segment keys to segments.
+	//
+	// Shms in keysToShms are guaranteed to be referenced, as they are
+	// removed by disassociateKey before the last DecRef.
 	keysToShms map[Key]*Shm
 
 	// Sum of the sizes of all existing segments rounded up to page size, in
@@ -95,10 +106,18 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
 }
 
 // FindByID looks up a segment given an ID.
+//
+// FindByID returns a reference on Shm.
 func (r *Registry) FindByID(id ID) *Shm {
 	r.mu.Lock()
 	defer r.mu.Unlock()
-	return r.shms[id]
+	s := r.shms[id]
+	// Take a reference on s. If TryIncRef fails, s has reached the last
+	// DecRef, but hasn't quite been removed from r.shms yet.
+	if s != nil && s.TryIncRef() {
+		return s
+	}
+	return nil
 }
 
 // dissociateKey removes the association between a segment and its key,
@@ -119,6 +138,8 @@ func (r *Registry) dissociateKey(s *Shm) {
 
 // FindOrCreate looks up or creates a segment in the registry. It's functionally
 // analogous to open(2).
+//
+// FindOrCreate returns a reference on Shm.
 func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
 	if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
 		// "A new segment was to be created and size is less than SHMMIN or
@@ -166,6 +187,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
 				return nil, syserror.EEXIST
 			}
 
+			shm.IncRef()
 			return shm, nil
 		}
 
@@ -193,7 +215,14 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
 	// Need to create a new segment.
 	creator := fs.FileOwnerFromContext(ctx)
 	perms := fs.FilePermsFromMode(mode)
-	return r.newShm(ctx, pid, key, creator, perms, size)
+	s, err := r.newShm(ctx, pid, key, creator, perms, size)
+	if err != nil {
+		return nil, err
+	}
+	// The initial reference is held by s itself. Take another to return to
+	// the caller.
+	s.IncRef()
+	return s, nil
 }
 
 // newShm creates a new segment in the registry.
@@ -296,22 +325,26 @@ func (r *Registry) remove(s *Shm) {
 
 // Shm represents a single shared memory segment.
 //
-// Shm segment are backed directly by an allocation from platform
-// memory. Segments are always mapped as a whole, greatly simplifying how
-// mappings are tracked. However note that mremap and munmap calls may cause the
-// vma for a segment to become fragmented; which requires special care when
-// unmapping a segment. See mm/shm.go.
+// Shm segment are backed directly by an allocation from platform memory.
+// Segments are always mapped as a whole, greatly simplifying how mappings are
+// tracked. However note that mremap and munmap calls may cause the vma for a
+// segment to become fragmented; which requires special care when unmapping a
+// segment. See mm/shm.go.
 //
 // Segments persist until they are explicitly marked for destruction via
-// shmctl(SHM_RMID).
+// MarkDestroyed().
 //
 // Shm implements memmap.Mappable and memmap.MappingIdentity.
 //
 // +stateify savable
 type Shm struct {
-	// AtomicRefCount tracks the number of references to this segment from
-	// maps. A segment always holds a reference to itself, until it's marked for
+	// AtomicRefCount tracks the number of references to this segment.
+	//
+	// A segment holds a reference to itself until it is marked for
 	// destruction.
+	//
+	// In addition to direct users, the MemoryManager will hold references
+	// via MappingIdentity.
 	refs.AtomicRefCount
 
 	mfp pgalloc.MemoryFileProvider
@@ -484,9 +517,8 @@ type AttachOpts struct {
 // ConfigureAttach creates an mmap configuration for the segment with the
 // requested attach options.
 //
-// ConfigureAttach returns with a ref on s on success. The caller should drop
-// this once the map is installed. This reference prevents s from being
-// destroyed before the returned configuration is used.
+// Postconditions: The returned MMapOpts are valid only as long as a reference
+// continues to be held on s.
 func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -504,7 +536,6 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac
 		// in the user namespace that governs its IPC namespace." - man shmat(2)
 		return memmap.MMapOpts{}, syserror.EACCES
 	}
-	s.IncRef()
 	return memmap.MMapOpts{
 		Length: s.size,
 		Offset: 0,
@@ -549,10 +580,15 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
 	}
 	creds := auth.CredentialsFromContext(ctx)
 
-	nattach := uint64(s.ReadRefs())
-	// Don't report the self-reference we keep prior to being marked for
-	// destruction. However, also don't report a count of -1 for segments marked
-	// as destroyed, with no mappings.
+	// Use the reference count as a rudimentary count of the number of
+	// attaches. We exclude:
+	//
+	// 1. The reference the caller holds.
+	// 2. The self-reference held by s prior to destruction.
+	//
+	// Note that this may still overcount by including transient references
+	// used in concurrent calls.
+	nattach := uint64(s.ReadRefs()) - 1
 	if !s.pendingDestruction {
 		nattach--
 	}
@@ -620,18 +656,17 @@ func (s *Shm) MarkDestroyed() {
 	s.registry.dissociateKey(s)
 
 	s.mu.Lock()
-	// Only drop the segment's self-reference once, when destruction is
-	// requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a
-	// segment to be destroyed prematurely, potentially with active maps to the
-	// segment's address range. Remaining references are dropped when the
-	// segment is detached or unmaped.
+	defer s.mu.Unlock()
 	if !s.pendingDestruction {
 		s.pendingDestruction = true
-		s.mu.Unlock() // Must release s.mu before calling s.DecRef.
+		// Drop the self-reference so destruction occurs when all
+		// external references are gone.
+		//
+		// N.B. This cannot be the final DecRef, as the caller also
+		// holds a reference.
 		s.DecRef()
 		return
 	}
-	s.mu.Unlock()
 }
 
 // checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index ab0c6c4aa..d25a7903b 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -489,18 +489,43 @@ type Task struct {
 	// netns is protected by mu. netns is owned by the task goroutine.
 	netns bool
 
-	// If rseqPreempted is true, before the next call to p.Switch(), interrupt
-	// RSEQ critical regions as defined by tg.rseq and write the task
-	// goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number
-	// written to rseqCPUAddr.
+	// If rseqPreempted is true, before the next call to p.Switch(),
+	// interrupt rseq critical regions as defined by rseqAddr and
+	// tg.oldRSeqCritical and write the task goroutine's CPU number to
+	// rseqAddr/oldRSeqCPUAddr.
 	//
-	// If rseqCPUAddr is 0, rseqCPU is -1.
+	// We support two ABIs for restartable sequences:
 	//
-	// rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task
-	// goroutine.
+	//  1. The upstream interface added in v4.18,
+	//  2. An "old" interface never merged upstream. In the implementation,
+	//     this is referred to as "old rseq".
+	//
+	// rseqPreempted is exclusive to the task goroutine.
 	rseqPreempted bool `state:"nosave"`
-	rseqCPUAddr   usermem.Addr
-	rseqCPU       int32
+
+	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
+	//
+	// If rseq is unused, rseqCPU is -1 for convenient use in
+	// platform.Context.Switch.
+	//
+	// rseqCPU is exclusive to the task goroutine.
+	rseqCPU int32
+
+	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
+	//
+	// oldRSeqCPUAddr is exclusive to the task goroutine.
+	oldRSeqCPUAddr usermem.Addr
+
+	// rseqAddr is a pointer to the userspace linux.RSeq structure.
+	//
+	// rseqAddr is exclusive to the task goroutine.
+	rseqAddr usermem.Addr
+
+	// rseqSignature is the signature that the rseq abort IP must be signed
+	// with.
+	//
+	// rseqSignature is exclusive to the task goroutine.
+	rseqSignature uint32
 
 	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
 	// implementations that require an intermediate buffer to copy data
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 3eadfedb4..247bd4aba 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -236,14 +236,19 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else if opts.NewPIDNamespace {
 		pidns = pidns.NewChild(userns)
 	}
+
 	tg := t.tg
+	rseqAddr := usermem.Addr(0)
+	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
 		tg.mounts.IncRef()
 		sh := t.tg.signalHandlers
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
 		}
-		tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+		rseqAddr = t.rseqAddr
+		rseqSignature = t.rseqSignature
 	}
 
 	cfg := &TaskConfig{
@@ -260,6 +265,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		RSeqAddr:                rseqAddr,
+		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
 	}
 	if opts.NewThreadGroup {
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 90a6190f1..fa6528386 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -190,9 +190,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.updateRSSLocked()
 	// Restartable sequence state is discarded.
 	t.rseqPreempted = false
-	t.rseqCPUAddr = 0
 	t.rseqCPU = -1
-	t.tg.rscr.Store(&RSEQCriticalRegion{})
+	t.rseqAddr = 0
+	t.rseqSignature = 0
+	t.oldRSeqCPUAddr = 0
+	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	t.tg.pidns.owner.mu.Unlock()
 
 	// Remove FDs with the CloseOnExec flag set.
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index d97f8c189..6357273d3 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -169,12 +169,22 @@ func (*runApp) execute(t *Task) taskRunState {
 	// Apply restartable sequences.
 	if t.rseqPreempted {
 		t.rseqPreempted = false
-		if t.rseqCPUAddr != 0 {
+		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
+			// Linux writes the CPU on every preemption. We only do
+			// so if it changed. Thus we may delay delivery of
+			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
 			cpu := int32(hostcpu.GetCPU())
 			if t.rseqCPU != cpu {
 				t.rseqCPU = cpu
 				if err := t.rseqCopyOutCPU(); err != nil {
-					t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+					t.forceSignal(linux.SIGSEGV, false)
+					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+					// Re-enter the task run loop for signal delivery.
+					return (*runApp)(nil)
+				}
+				if err := t.oldRSeqCopyOutCPU(); err != nil {
+					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
 					t.forceSignal(linux.SIGSEGV, false)
 					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 					// Re-enter the task run loop for signal delivery.
@@ -320,7 +330,7 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextCPUPreempted:
-		// Ensure that RSEQ critical sections are interrupted and per-thread
+		// Ensure that rseq critical sections are interrupted and per-thread
 		// CPU values are updated before the next platform.Context.Switch().
 		t.rseqPreempted = true
 		return (*runApp)(nil)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 3522a4ae5..58af16ee2 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -79,6 +80,13 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
+	RSeqAddr usermem.Addr
+
+	// RSeqSignature is the signature that the rseq abort IP must be signed
+	// with.
+	RSeqSignature uint32
+
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
 }
@@ -126,6 +134,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		ipcns:           cfg.IPCNamespace,
 		abstractSockets: cfg.AbstractSocketNamespace,
 		rseqCPU:         -1,
+		rseqAddr:        cfg.RSeqAddr,
+		rseqSignature:   cfg.RSeqSignature,
 		futexWaiter:     futex.NewWaiter(),
 		containerID:     cfg.ContainerID,
 	}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 72568d296..c0197a563 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -238,8 +238,8 @@ type ThreadGroup struct {
 	// execed is protected by the TaskSet mutex.
 	execed bool
 
-	// rscr is the thread group's RSEQ critical region.
-	rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+	// oldRSeqCritical is the thread group's old rseq critical region.
+	oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"`
 
 	// mounts is the thread group's mount namespace. This does not really
 	// correspond to a "mount namespace" in Linux, but is more like a
@@ -256,35 +256,35 @@ type ThreadGroup struct {
 	tty *TTY
 }
 
-// newThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
-func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
 	tg := &ThreadGroup{
 		threadGroupNode: threadGroupNode{
-			pidns: ns,
+			pidns: pidns,
 		},
 		signalHandlers:    sh,
 		terminationSignal: terminationSignal,
 		ioUsage:           &usage.IO{},
 		limits:            limits,
-		mounts:            mounts,
+		mounts:            mntns,
 	}
 	tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
 	tg.timers = make(map[linux.TimerID]*IntervalTimer)
-	tg.rscr.Store(&RSEQCriticalRegion{})
+	tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	return tg
 }
 
-// saveRscr is invoked by stateify.
-func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
-	return tg.rscr.Load().(*RSEQCriticalRegion)
+// saveOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
+	return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
 }
 
-// loadRscr is invoked by stateify.
-func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
-	tg.rscr.Store(rscr)
+// loadOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) {
+	tg.oldRSeqCritical.Store(r)
 }
 
 // SignalHandlers returns the signal handlers used by tg.
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 8c2246bb4..79610acb7 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -66,8 +66,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer
 	var start usermem.Addr
 
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
 	}
 
@@ -81,7 +79,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer
 	//
 	// Artifically adjust the seqfile handle so we only output vsyscall entry once.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		buf.WriteString(vsyscallMapsEntry)
 	}
 }
@@ -97,8 +94,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile
 		start = *handle.(*usermem.Addr)
 	}
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		vmaAddr := vseg.End()
 		data = append(data, seqfile.SeqData{
 			Buf:    mm.vmaMapsEntryLocked(ctx, vseg),
@@ -116,7 +111,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile
 	//
 	// Artifically adjust the seqfile handle so we only output vsyscall entry once.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		vmaAddr := vsyscallEnd
 		data = append(data, seqfile.SeqData{
 			Buf:    []byte(vsyscallMapsEntry),
@@ -187,15 +181,12 @@ func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffe
 	var start usermem.Addr
 
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
 	}
 
 	// We always emulate vsyscall, so advertise it here. See
 	// ReadMapsSeqFileData for additional commentary.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		buf.WriteString(vsyscallSmapsEntry)
 	}
 }
@@ -211,8 +202,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
 		start = *handle.(*usermem.Addr)
 	}
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		vmaAddr := vseg.End()
 		data = append(data, seqfile.SeqData{
 			Buf:    mm.vmaSmapsEntryLocked(ctx, vseg),
@@ -223,7 +212,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
 	// We always emulate vsyscall, so advertise it here. See
 	// ReadMapsSeqFileData for additional commentary.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		vmaAddr := vsyscallEnd
 		data = append(data, seqfile.SeqData{
 			Buf:    []byte(vsyscallSmapsEntry),
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 5c52d4007..f3afd98da 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -34,6 +34,8 @@ go_library(
         "machine_arm64_unsafe.go",
         "machine_unsafe.go",
         "physical_map.go",
+        "physical_map_amd64.go",
+        "physical_map_arm64.go",
         "virtual_map.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm",
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
index 586e91bb2..91de5dab1 100644
--- a/pkg/sentry/platform/kvm/physical_map.go
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -24,15 +24,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
-const (
-	// reservedMemory is a chunk of physical memory reserved starting at
-	// physical address zero. There are some special pages in this region,
-	// so we just call the whole thing off.
-	//
-	// Other architectures may define this to be zero.
-	reservedMemory = 0x100000000
-)
-
 type region struct {
 	virtual uintptr
 	length  uintptr
@@ -59,8 +50,7 @@ func fillAddressSpace() (excludedRegions []region) {
 	// We can cut vSize in half, because the kernel will be using the top
 	// half and we ignore it while constructing mappings. It's as if we've
 	// already excluded half the possible addresses.
-	vSize := uintptr(1) << ring0.VirtualAddressBits()
-	vSize = vSize >> 1
+	vSize := ring0.UserspaceSize
 
 	// We exclude reservedMemory below from our physical memory size, so it
 	// needs to be dropped here as well. Otherwise, we could end up with
diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/platform/kvm/physical_map_amd64.go
index 0e016bca5..c5adfb577 100644
--- a/pkg/sentry/fsimpl/proc/filesystems.go
+++ b/pkg/sentry/platform/kvm/physical_map_amd64.go
@@ -12,14 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package proc
+package kvm
 
-// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems.
-//
-// +stateify savable
-type filesystemsData struct{}
-
-// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for
-// filesystemsData. We would need to retrive filesystem names from
-// vfs.VirtualFilesystem. Also needs vfs replacement for
-// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
+const (
+	// reservedMemory is a chunk of physical memory reserved starting at
+	// physical address zero. There are some special pages in this region,
+	// so we just call the whole thing off.
+	reservedMemory = 0x100000000
+)
diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/platform/kvm/physical_map_arm64.go
index 31dec36de..4d8561453 100644
--- a/pkg/sentry/fsimpl/proc/proc.go
+++ b/pkg/sentry/platform/kvm/physical_map_arm64.go
@@ -12,5 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package proc implements a partial in-memory file system for procfs.
-package proc
+package kvm
+
+const (
+	reservedMemory = 0
+)
diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s
index 64c718d21..16f9c523e 100644
--- a/pkg/sentry/platform/ptrace/stub_amd64.s
+++ b/pkg/sentry/platform/ptrace/stub_amd64.s
@@ -64,6 +64,8 @@ begin:
 	CMPQ AX, $0
 	JL error
 
+	MOVQ $0, BX
+
 	// SIGSTOP to wait for attach.
 	//
 	// The SYSCALL instruction will be used for future syscall injection by
@@ -73,23 +75,26 @@ begin:
 	MOVQ $SIGSTOP, SI
 	SYSCALL
 
-	// The tracer may "detach" and/or allow code execution here in three cases:
-	//
-	// 1. New (traced) stub threads are explicitly detached by the
-	// goroutine in newSubprocess. However, they are detached while in
-	// group-stop, so they do not execute code here.
-	//
-	// 2. If a tracer thread exits, it implicitly detaches from the stub,
-	// potentially allowing code execution here. However, the Go runtime
-	// never exits individual threads, so this case never occurs.
-	//
-	// 3. subprocess.createStub clones a new stub process that is untraced,
+	// The sentry sets BX to 1 when creating stub process.
+	CMPQ BX, $1
+	JE clone
+
+	// Notify the Sentry that syscall exited.
+done:
+	INT $3
+	// Be paranoid.
+	JMP done
+clone:
+	// subprocess.createStub clones a new stub process that is untraced,
 	// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
 	// ourselves for attach by the tracer.
 	//
 	// R15 has been updated with the expected PPID.
-	JMP begin
+	CMPQ AX, $0
+	JE begin
 
+	// The clone syscall returns a non-zero value.
+	JMP done
 error:
 	// Exit with -errno.
 	MOVQ AX, DI
diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s
index 2c5e4d5cb..6162df02a 100644
--- a/pkg/sentry/platform/ptrace/stub_arm64.s
+++ b/pkg/sentry/platform/ptrace/stub_arm64.s
@@ -59,6 +59,8 @@ begin:
 	CMP $0x0, R0
 	BLT error
 
+	MOVD $0, R9
+
 	// SIGSTOP to wait for attach.
 	//
 	// The SYSCALL instruction will be used for future syscall injection by
@@ -66,22 +68,26 @@ begin:
 	MOVD $SYS_KILL, R8
 	MOVD $SIGSTOP, R1
 	SVC
-	// The tracer may "detach" and/or allow code execution here in three cases:
-	//
-	// 1. New (traced) stub threads are explicitly detached by the
-	// goroutine in newSubprocess. However, they are detached while in
-	// group-stop, so they do not execute code here.
-	//
-	// 2. If a tracer thread exits, it implicitly detaches from the stub,
-	// potentially allowing code execution here. However, the Go runtime
-	// never exits individual threads, so this case never occurs.
-	//
-	// 3. subprocess.createStub clones a new stub process that is untraced,
+
+	// The sentry sets R9 to 1 when creating stub process.
+	CMP $1, R9
+	BEQ clone
+
+done:
+	// Notify the Sentry that syscall exited.
+	BRK $3
+	B done // Be paranoid.
+clone:
+	// subprocess.createStub clones a new stub process that is untraced,
 	// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
 	// ourselves for attach by the tracer.
 	//
 	// R7 has been updated with the expected PPID.
-	B begin
+	CMP $0, R0
+	BEQ begin
+
+	// The clone system call returned a non-zero value.
+	B done
 
 error:
 	// Exit with -errno.
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index ddb1f41e3..20244fd95 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -21,6 +21,7 @@ import (
 	"sync"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -429,13 +430,15 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 	}
 
 	for {
-		// Execute the syscall instruction.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+		// Execute the syscall instruction. The task has to stop on the
+		// trap instruction which is right after the syscall
+		// instruction.
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
 		sig := t.wait(stopped)
-		if sig == (syscallEvent | syscall.SIGTRAP) {
+		if sig == syscall.SIGTRAP {
 			// Reached syscall-enter-stop.
 			break
 		} else {
@@ -447,18 +450,6 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 		}
 	}
 
-	// Complete the actual system call.
-	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
-		panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
-	}
-
-	// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
-	// between syscall-enter-stop and syscall-exit-stop; it happens *after*
-	// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
-	if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
-		t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
-	}
-
 	// Grab registers.
 	if err := t.getRegs(regs); err != nil {
 		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
@@ -541,14 +532,14 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 		if isSingleStepping(regs) {
 			if _, _, errno := syscall.RawSyscall6(
 				syscall.SYS_PTRACE,
-				syscall.PTRACE_SYSEMU_SINGLESTEP,
+				unix.PTRACE_SYSEMU_SINGLESTEP,
 				uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 				panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
 			}
 		} else {
 			if _, _, errno := syscall.RawSyscall6(
 				syscall.SYS_PTRACE,
-				syscall.PTRACE_SYSEMU,
+				unix.PTRACE_SYSEMU,
 				uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 				panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
 			}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 606dc2b1d..e99798c56 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -141,9 +141,11 @@ func (t *thread) adjustInitRegsRip() {
 	t.initRegs.Rip -= initRegsRipAdjustment
 }
 
-// Pass the expected PPID to the child via R15 when creating stub process
+// Pass the expected PPID to the child via R15 when creating stub process.
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.R15 = uint64(ppid)
+	// Rbx has to be set to 1 when creating stub process.
+	initregs.Rbx = 1
 }
 
 // patchSignalInfo patches the signal info to account for hitting the seccomp
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 62a686ee7..7b975137f 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -127,6 +127,8 @@ func (t *thread) adjustInitRegsRip() {
 // Pass the expected PPID to the child via X7 when creating stub process
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.Regs[7] = uint64(ppid)
+	// R9 has to be set to 1 when creating stub process.
+	initregs.Regs[9] = 1
 }
 
 // patchSignalInfo patches the signal info to account for hitting the seccomp
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index cf13ea5e4..74968dfdf 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -54,7 +54,7 @@ func probeSeccomp() bool {
 
 	for {
 		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index f1af18265..87f4552b5 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -71,6 +71,7 @@ go_library(
         "lib_amd64.go",
         "lib_amd64.s",
         "lib_arm64.go",
+        "lib_arm64.s",
         "ring0.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0",
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index fbfbd9bab..dc0eeec01 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -73,6 +73,9 @@ type CPUArchState struct {
 
 	// application context pointer.
 	appAddr uintptr
+
+	// lazyVFP is the value of cpacr_el1.
+	lazyVFP uintptr
 }
 
 // ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 29c475882..64e9c0845 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -31,6 +31,11 @@
 #define RSV_REG 	R18_PLATFORM
 #define RSV_REG_APP 	R9
 
+#define FPEN_NOTRAP 	0x3
+#define FPEN_SHIFT 	20
+
+#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
+
 #define REGISTERS_SAVE(reg, offset) \
   MOVD R0, offset+PTRACE_R0(reg); \
   MOVD R1, offset+PTRACE_R1(reg); \
@@ -279,6 +284,16 @@
 #define IRQ_DISABLE \
 	MSR $2, DAIFClr;
 
+#define VFP_ENABLE \
+	MOVD $FPEN_ENABLE, R0; \
+	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
+	ISB $15;
+
+#define VFP_DISABLE \
+	MOVD $0x0, R0; \
+	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
+	ISB $15;
+
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -318,6 +333,11 @@ TEXT ·Halt(SB),NOSPLIT,$0
 	BNE mmio_exit
 	MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
 mmio_exit:
+	// Disable fpsimd.
+	WORD $0xd5381041 // MRS CPACR_EL1, R1
+	MOVD R1, CPU_LAZY_VFP(RSV_REG)
+	VFP_DISABLE
+
 	// MMIO_EXIT.
 	MOVD $0, R9
 	MOVD R0, 0xffff000000001000(R9)
@@ -337,6 +357,73 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 #define STACK_FRAME_SIZE 16
 
 TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
+	// Step1, save sentry context into memory.
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
+	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+
+	WORD $0xd5384003    //    MRS SPSR_EL1, R3
+	MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG)
+	MOVD R30, CPU_REGISTERS+PTRACE_PC(RSV_REG)
+	MOVD RSP, R3
+	MOVD R3, CPU_REGISTERS+PTRACE_SP(RSV_REG)
+
+	MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3
+
+	// Step2, save SP_EL1, PSTATE into kernel temporary stack.
+	// switch to temporary stack.
+	LOAD_KERNEL_STACK(RSV_REG)
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+
+	SUB $STACK_FRAME_SIZE, RSP, RSP
+	MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R11
+	MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R12
+	STP (R11, R12), 16*0(RSP)
+
+	MOVD CPU_REGISTERS+PTRACE_R11(RSV_REG), R11
+	MOVD CPU_REGISTERS+PTRACE_R12(RSV_REG), R12
+
+	// Step3, test user pagetable.
+	// If user pagetable is empty, trapped in el1_ia.
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+	SWITCH_TO_APP_PAGETABLE(RSV_REG)
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+	SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+
+	// If pagetable is not empty, recovery kernel temporary stack.
+	ADD $STACK_FRAME_SIZE, RSP, RSP
+
+	// Step4, load app context pointer.
+	MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP
+
+	// Step5, prepare the environment for container application.
+	// set sp_el0.
+	MOVD PTRACE_SP(RSV_REG_APP), R1
+	WORD $0xd5184101        //MSR R1, SP_EL0
+	// set pc.
+	MOVD PTRACE_PC(RSV_REG_APP), R1
+	MSR R1, ELR_EL1
+	// set pstate.
+	MOVD PTRACE_PSTATE(RSV_REG_APP), R1
+	WORD $0xd5184001  //MSR R1, SPSR_EL1
+
+	// RSV_REG & RSV_REG_APP will be loaded at the end.
+	REGISTERS_LOAD(RSV_REG_APP, 0)
+
+	// switch to user pagetable.
+	MOVD PTRACE_R18(RSV_REG_APP), RSV_REG
+	MOVD PTRACE_R9(RSV_REG_APP), RSV_REG_APP
+
+	SUB $STACK_FRAME_SIZE, RSP, RSP
+	STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+
+	SWITCH_TO_APP_PAGETABLE(RSV_REG)
+
+	LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
+	ADD $STACK_FRAME_SIZE, RSP, RSP
+
 	ERET()
 
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
@@ -382,6 +469,8 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
 	BEQ el1_svc
 	CMP $ESR_ELx_EC_BREAKPT_CUR, R24
 	BGE el1_dbg
+	CMP $ESR_ELx_EC_FP_ASIMD, R24
+	BEQ el1_fpsimd_acc
 	B el1_invalid
 
 el1_da:
@@ -402,6 +491,10 @@ el1_svc:
 el1_dbg:
 	B ·Shutdown(SB)
 
+el1_fpsimd_acc:
+	VFP_ENABLE
+	B ·kernelExitToEl1(SB)  // Resume.
+
 el1_invalid:
 	B ·Shutdown(SB)
 
@@ -528,6 +621,10 @@ TEXT ·Vectors(SB),NOSPLIT,$0
 	B ·El0_error_invalid(SB)
 	nop31Instructions()
 
+	// The exception-vector-table is required to be 11-bits aligned.
+	// Please see Linux source code as reference: arch/arm64/kernel/entry.s.
+	// For gvisor, I defined it as 4K in length, filled the 2nd 2K part with NOPs.
+	// So that, I can safely move the 1st 2K part into the address with 11-bits alignment.
 	WORD $0xd503201f	//nop
 	nop31Instructions()
 	WORD $0xd503201f
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 900ee6380..8bcfe1032 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -16,10 +16,24 @@
 
 package ring0
 
-// LoadFloatingPoint loads floating point state by the most efficient mechanism
-// available (set by Init).
-var LoadFloatingPoint func(*byte)
+// CPACREL1 returns the value of the CPACR_EL1 register.
+func CPACREL1() (value uintptr)
 
-// SaveFloatingPoint saves floating point state by the most efficient mechanism
-// available (set by Init).
-var SaveFloatingPoint func(*byte)
+// FPCR returns the value of FPCR register.
+func FPCR() (value uintptr)
+
+// SetFPCR writes the FPCR value.
+func SetFPCR(value uintptr)
+
+// FPSR returns the value of FPSR register.
+func FPSR() (value uintptr)
+
+// SetFPSR writes the FPSR value.
+func SetFPSR(value uintptr)
+
+// SaveVRegs saves V0-V31 registers.
+// V0-V31: 32 128-bit registers for floating point and simd.
+func SaveVRegs(*byte)
+
+// LoadVRegs loads V0-V31 registers.
+func LoadVRegs(*byte)
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
new file mode 100644
index 000000000..0e6a6235b
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -0,0 +1,121 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+TEXT ·CPACREL1(SB),NOSPLIT,$0-8
+	WORD $0xd5381041 	// MRS CPACR_EL1, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·GetFPCR(SB),NOSPLIT,$0-8
+	WORD $0xd53b4201    	// MRS NZCV, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·GetFPSR(SB),NOSPLIT,$0-8
+	WORD $0xd53b4421   	// MRS FPSR, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·SetFPCR(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	WORD $0xd51b4201  	// MSR R1, NZCV
+	RET
+
+TEXT ·SetFPSR(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	WORD $0xd51b4421   	// MSR R1, FPSR
+	RET
+
+TEXT ·SaveVRegs(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R0
+
+	// Skip aarch64_ctx, fpsr, fpcr.
+	FMOVD F0, 16*1(R0)
+	FMOVD F1, 16*2(R0)
+	FMOVD F2, 16*3(R0)
+	FMOVD F3, 16*4(R0)
+	FMOVD F4, 16*5(R0)
+	FMOVD F5, 16*6(R0)
+	FMOVD F6, 16*7(R0)
+	FMOVD F7, 16*8(R0)
+	FMOVD F8, 16*9(R0)
+	FMOVD F9, 16*10(R0)
+	FMOVD F10, 16*11(R0)
+	FMOVD F11, 16*12(R0)
+	FMOVD F12, 16*13(R0)
+	FMOVD F13, 16*14(R0)
+	FMOVD F14, 16*15(R0)
+	FMOVD F15, 16*16(R0)
+	FMOVD F16, 16*17(R0)
+	FMOVD F17, 16*18(R0)
+	FMOVD F18, 16*19(R0)
+	FMOVD F19, 16*20(R0)
+	FMOVD F20, 16*21(R0)
+	FMOVD F21, 16*22(R0)
+	FMOVD F22, 16*23(R0)
+	FMOVD F23, 16*24(R0)
+	FMOVD F24, 16*25(R0)
+	FMOVD F25, 16*26(R0)
+	FMOVD F26, 16*27(R0)
+	FMOVD F27, 16*28(R0)
+	FMOVD F28, 16*29(R0)
+	FMOVD F29, 16*30(R0)
+	FMOVD F30, 16*31(R0)
+	FMOVD F31, 16*32(R0)
+	ISB $15
+
+	RET
+
+TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R0
+
+	// Skip aarch64_ctx, fpsr, fpcr.
+	FMOVD 16*1(R0), F0
+	FMOVD 16*2(R0), F1
+	FMOVD 16*3(R0), F2
+	FMOVD 16*4(R0), F3
+	FMOVD 16*5(R0), F4
+	FMOVD 16*6(R0), F5
+	FMOVD 16*7(R0), F6
+	FMOVD 16*8(R0), F7
+	FMOVD 16*9(R0), F8
+	FMOVD 16*10(R0), F9
+	FMOVD 16*11(R0), F10
+	FMOVD 16*12(R0), F11
+	FMOVD 16*13(R0), F12
+	FMOVD 16*14(R0), F13
+	FMOVD 16*15(R0), F14
+	FMOVD 16*16(R0), F15
+	FMOVD 16*17(R0), F16
+	FMOVD 16*18(R0), F17
+	FMOVD 16*19(R0), F18
+	FMOVD 16*20(R0), F19
+	FMOVD 16*21(R0), F20
+	FMOVD 16*22(R0), F21
+	FMOVD 16*23(R0), F22
+	FMOVD 16*24(R0), F23
+	FMOVD 16*25(R0), F24
+	FMOVD 16*26(R0), F25
+	FMOVD 16*27(R0), F26
+	FMOVD 16*28(R0), F27
+	FMOVD 16*29(R0), F28
+	FMOVD 16*30(R0), F29
+	FMOVD 16*31(R0), F30
+	FMOVD 16*32(R0), F31
+	ISB $15
+
+	RET
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index d7aa1c7cc..cd2a65f97 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -39,6 +39,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define CPU_TTBR0_APP        0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_VECTOR_CODE      0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_APP_ADDR         0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_LAZY_VFP         0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer())
 
 	fmt.Fprintf(w, "\n// Bits.\n")
 	fmt.Fprintf(w, "#define _KERNEL_FLAGS        0x%02x\n", KernelFlagsSet)
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index af1a4e95f..4301b697c 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -471,6 +471,9 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_IP:
 			switch h.Type {
 			case linux.IP_TOS:
+				if length < linux.SizeOfControlMessageTOS {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
 				cmsgs.IP.HasTOS = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += AlignUp(length, width)
@@ -481,6 +484,9 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_IPV6:
 			switch h.Type {
 			case linux.IPV6_TCLASS:
+				if length < linux.SizeOfControlMessageTClass {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
 				cmsgs.IP.HasTClass = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
 				i += AlignUp(length, width)
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 79589e3c8..136821963 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -22,7 +22,6 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/netlink/port",
         "//pkg/sentry/socket/unix",
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 4a1b87a9a..d2e3644a6 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -29,7 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
@@ -500,29 +499,29 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
 	trunc := flags&linux.MSG_TRUNC != 0
 
 	r := unix.EndpointReader{
+		Ctx:      t,
 		Endpoint: s.ep,
 		Peek:     flags&linux.MSG_PEEK != 0,
 	}
 
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
 	// If MSG_TRUNC is set with a zero byte destination then we still need
 	// to read the message and discard it, or in the case where MSG_PEEK is
 	// set, leave it be. In both cases the full message length must be
-	// returned. However, the memory manager for the destination will not read
-	// the endpoint if the destination is zero length.
-	//
-	// In order for the endpoint to be read when the destination size is zero,
-	// we must cause a read of the endpoint by using a separate fake zero
-	// length block sequence and calling the EndpointReader directly.
+	// returned.
 	if trunc && dst.Addrs.NumBytes() == 0 {
-		// Perform a read to a zero byte block sequence. We can ignore the
-		// original destination since it was zero bytes. The length returned by
-		// ReadToBlocks is ignored and we return the full message length to comply
-		// with MSG_TRUNC.
-		_, err := r.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(make([]byte, 0))))
-		return int(r.MsgSize), linux.MSG_TRUNC, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
 	}
 
-	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
+	if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
 		var mflags int
 		if n < int64(r.MsgSize) {
 			mflags |= linux.MSG_TRUNC
@@ -540,7 +539,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
 	defer s.EventUnregister(&e)
 
 	for {
-		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+		if n, err := doRead(); err != syserror.ErrWouldBlock {
 			var mflags int
 			if n < int64(r.MsgSize) {
 				mflags |= linux.MSG_TRUNC
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 140851c17..764f11a6b 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -222,17 +222,25 @@ type commonEndpoint interface {
 	// transport.Endpoint.SetSockOpt.
 	SetSockOpt(interface{}) *tcpip.Error
 
+	// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
+	// transport.Endpoint.SetSockOptBool.
+	SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
+
 	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
 	// transport.Endpoint.SetSockOptInt.
-	SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
 
 	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
 	// transport.Endpoint.GetSockOpt.
 	GetSockOpt(interface{}) *tcpip.Error
 
+	// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
+	// transport.Endpoint.GetSockOpt.
+	GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
+
 	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
 	// transport.Endpoint.GetSockOpt.
-	GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
+	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
 }
 
 // SocketOperations encapsulates all the state needed to represent a network stack
@@ -977,13 +985,23 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		if len(v) == 0 {
+		if v == 0 {
 			return []byte{}, nil
 		}
 		if outLen < linux.IFNAMSIZ {
 			return nil, syserr.ErrInvalidArgument
 		}
-		return append([]byte(v), 0), nil
+		s := t.NetworkContext()
+		if s == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		nic, ok := s.Interfaces()[int32(v)]
+		if !ok {
+			// The NICID no longer indicates a valid interface, probably because that
+			// interface was removed.
+			return nil, syserr.ErrUnknownDevice
+		}
+		return append([]byte(nic.Name), 0), nil
 
 	case linux.SO_BROADCAST:
 		if outLen < sizeOfInt32 {
@@ -1213,12 +1231,15 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.V6OnlyOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.V6OnlyOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		var o uint32
+		if v {
+			o = 1
+		}
+		return int32(o), nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1427,7 +1448,20 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		if n == -1 {
 			n = len(optVal)
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(optVal[:n])))
+		name := string(optVal[:n])
+		if name == "" {
+			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0)))
+		}
+		s := t.NetworkContext()
+		if s == nil {
+			return syserr.ErrNoDevice
+		}
+		for nicID, nic := range s.Interfaces() {
+			if nic.Name == name {
+				return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID)))
+			}
+		}
+		return syserr.ErrUnknownDevice
 
 	case linux.SO_BROADCAST:
 		if len(optVal) < sizeOfInt32 {
@@ -1621,7 +1655,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.V6OnlyOption, v != 0))
 
 	case linux.IPV6_ADD_MEMBERSHIP,
 		linux.IPV6_DROP_MEMBERSHIP,
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 2ec1a662d..2447f24ef 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -83,6 +83,19 @@ type EndpointReader struct {
 	ControlTrunc bool
 }
 
+// Truncate calls RecvMsg on the endpoint without writing to a destination.
+func (r *EndpointReader) Truncate() error {
+	// Ignore bytes read since it will always be zero.
+	_, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, r.Creds, r.NumRights, r.Peek, r.From)
+	r.Control = c
+	r.ControlTrunc = ct
+	r.MsgSize = ms
+	if err != nil {
+		return err.ToError()
+	}
+	return nil
+}
+
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 	return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) {
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 529a7a7a9..37c7ac3c1 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -175,17 +175,25 @@ type Endpoint interface {
 	// types.
 	SetSockOpt(opt interface{}) *tcpip.Error
 
+	// SetSockOptBool sets a socket option for simple cases when a value has
+	// the int type.
+	SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
+
 	// SetSockOptInt sets a socket option for simple cases when a value has
 	// the int type.
-	SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
 
 	// GetSockOpt gets a socket option. opt should be a pointer to one of the
 	// tcpip.*Option types.
 	GetSockOpt(opt interface{}) *tcpip.Error
 
+	// GetSockOptBool gets a socket option for simple cases when a return
+	// value has the int type.
+	GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
+
 	// GetSockOptInt gets a socket option for simple cases when a return
 	// value has the int type.
-	GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
+	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
 
 	// State returns the current state of the socket, as represented by Linux in
 	// procfs.
@@ -851,11 +859,19 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
-func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	return nil
 }
 
-func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	return nil
+}
+
+func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
+func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 885758054..91effe89a 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -544,8 +544,27 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	if senderRequested {
 		r.From = &tcpip.FullAddress{}
 	}
+
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
+	// If MSG_TRUNC is set with a zero byte destination then we still need
+	// to read the message and discard it, or in the case where MSG_PEEK is
+	// set, leave it be. In both cases the full message length must be
+	// returned.
+	if trunc && dst.Addrs.NumBytes() == 0 {
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
+
+	}
+
 	var total int64
-	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
+	if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
 		var from linux.SockAddr
 		var fromLen uint32
 		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
@@ -580,7 +599,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	defer s.EventUnregister(&e)
 
 	for {
-		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+		if n, err := doRead(); err != syserror.ErrWouldBlock {
 			var from linux.SockAddr
 			var fromLen uint32
 			if r.From != nil {
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index 9fa2f0e16..1e823b685 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -378,5 +378,7 @@ func init() {
 		syscallTable{
 			os:       abi.Linux,
 			arch:     arch.AMD64,
-			syscalls: linuxAMD64})
+			syscalls: linuxAMD64,
+		},
+	)
 }
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
index dea309fda..c77d418e6 100644
--- a/pkg/sentry/strace/select.go
+++ b/pkg/sentry/strace/select.go
@@ -36,6 +36,9 @@ func fdsFromSet(t *kernel.Task, set []byte) []int {
 }
 
 func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string {
+	if nfds < 0 {
+		return fmt.Sprintf("%#x (negative nfds)", addr)
+	}
 	if addr == 0 {
 		return "null"
 	}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 6766ba587..a76975cee 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -30,6 +30,7 @@ go_library(
         "sys_random.go",
         "sys_read.go",
         "sys_rlimit.go",
+        "sys_rseq.go",
         "sys_rusage.go",
         "sys_sched.go",
         "sys_seccomp.go",
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 272ae9991..479c5f6ff 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -377,7 +377,7 @@ var AMD64 = &kernel.SyscallTable{
 		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		332: syscalls.Supported("statx", Statx),
 		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
-		334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+		334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
 
 		// Linux skips ahead to syscall 424 to sync numbers between arches.
 		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 3b584eed9..d3f61f5e8 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -307,7 +307,7 @@ var ARM64 = &kernel.SyscallTable{
 		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		291: syscalls.Supported("statx", Statx),
 		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
-		293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+		293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
 
 		// Linux skips ahead to syscall 424 to sync numbers between arches.
 		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b9bd25464..bde17a767 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -226,6 +226,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if mask == 0 {
 			return 0, nil, syserror.EINVAL
 		}
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
 		n, err := t.Futex().Wake(t, addr, private, mask, val)
 		return uintptr(n), nil, err
 
@@ -242,6 +247,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	case linux.FUTEX_WAKE_OP:
 		op := uint32(val3)
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
 		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
 		return uintptr(n), nil, err
 
diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go
new file mode 100644
index 000000000..90db10ea6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rseq.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// RSeq implements syscall rseq(2).
+func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Uint()
+	flags := args[2].Int()
+	signature := args[3].Uint()
+
+	if !t.RSeqAvailable() {
+		// Event for applications that want rseq on a configuration
+		// that doesn't support them.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+
+	switch flags {
+	case 0:
+		// Register.
+		return 0, nil, t.SetRSeq(addr, length, signature)
+	case linux.RSEQ_FLAG_UNREGISTER:
+		return 0, nil, t.ClearRSeq(addr, length, signature)
+	default:
+		// Unknown flag.
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index d57ffb3a1..4a8bc24a2 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -39,10 +39,13 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	if err != nil {
 		return 0, nil, err
 	}
+	defer segment.DecRef()
 	return uintptr(segment.ID), nil, nil
 }
 
 // findSegment retrives a shm segment by the given id.
+//
+// findSegment returns a reference on Shm.
 func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) {
 	r := t.IPCNamespace().ShmRegistry()
 	segment := r.FindByID(id)
@@ -63,6 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if err != nil {
 		return 0, nil, syserror.EINVAL
 	}
+	defer segment.DecRef()
 
 	opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
 		Execute:  flag&linux.SHM_EXEC == linux.SHM_EXEC,
@@ -72,7 +76,6 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if err != nil {
 		return 0, nil, err
 	}
-	defer segment.DecRef()
 	addr, err = t.MemoryManager().MMap(t, opts)
 	return uintptr(addr), nil, err
 }
@@ -105,6 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		if err != nil {
 			return 0, nil, syserror.EINVAL
 		}
+		defer segment.DecRef()
 
 		stat, err := segment.IPCStat(t)
 		if err == nil {
@@ -128,6 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	if err != nil {
 		return 0, nil, syserror.EINVAL
 	}
+	defer segment.DecRef()
 
 	switch cmd {
 	case linux.IPC_SET:
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index e3e554b88..4c6aa04a1 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -9,6 +9,7 @@ go_library(
         "context.go",
         "debug.go",
         "dentry.go",
+        "device.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 40f4c1d09..1bc9c4a38 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -85,12 +85,12 @@ type Dentry struct {
 	// mounts is accessed using atomic memory operations.
 	mounts uint32
 
-	// mu synchronizes disowning and mounting over this Dentry.
-	mu sync.Mutex
-
 	// children are child Dentries.
 	children map[string]*Dentry
 
+	// mu synchronizes disowning and mounting over this Dentry.
+	mu sync.Mutex
+
 	// impl is the DentryImpl associated with this Dentry. impl is immutable.
 	// This should be the last field in Dentry.
 	impl DentryImpl
@@ -199,6 +199,18 @@ func (d *Dentry) HasChildren() bool {
 	return len(d.children) != 0
 }
 
+// Children returns a map containing all of d's children.
+func (d *Dentry) Children() map[string]*Dentry {
+	if !d.HasChildren() {
+		return nil
+	}
+	m := make(map[string]*Dentry)
+	for name, child := range d.children {
+		m[name] = child
+	}
+	return m
+}
+
 // InsertChild makes child a child of d with the given name.
 //
 // InsertChild is a mutator of d and child.
@@ -222,6 +234,18 @@ func (d *Dentry) InsertChild(child *Dentry, name string) {
 	child.name = name
 }
 
+// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either
+// d2's parent or an ancestor of d2's parent.
+func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
+	for d2.parent != nil {
+		if d2.parent == d {
+			return true
+		}
+		d2 = d2.parent
+	}
+	return false
+}
+
 // PrepareDeleteDentry must be called before attempting to delete the file
 // represented by d. If PrepareDeleteDentry succeeds, the caller must call
 // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
@@ -271,21 +295,6 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
 	}
 }
 
-// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
-// appropriate for in-memory filesystems that don't need to ensure that some
-// external state change succeeds before committing the deletion.
-//
-// DeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
-	if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
-		return err
-	}
-	vfs.CommitDeleteDentry(d)
-	return nil
-}
-
 // ForceDeleteDentry causes d to become disowned. It should only be used in
 // cases where VFS has no ability to stop the deletion (e.g. d represents the
 // local state of a file on a remote filesystem on which the file has already
@@ -314,7 +323,7 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
 // Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem.
+// Dentry from the same Filesystem. from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
 	if checkInvariants {
 		if from.parent == nil {
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..cb672e36f
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,100 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device.
+type DeviceKind uint32
+
+const (
+	// BlockDevice indicates a block device.
+	BlockDevice DeviceKind = iota
+
+	// CharDevice indicates a character device.
+	CharDevice
+)
+
+// String implements fmt.Stringer.String.
+func (kind DeviceKind) String() string {
+	switch kind {
+	case BlockDevice:
+		return "block"
+	case CharDevice:
+		return "character"
+	default:
+		return fmt.Sprintf("invalid device kind %d", kind)
+	}
+}
+
+type devTuple struct {
+	kind  DeviceKind
+	major uint32
+	minor uint32
+}
+
+// A Device backs device special files.
+type Device interface {
+	// Open returns a FileDescription representing this device.
+	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+type registeredDevice struct {
+	dev  Device
+	opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice().
+type RegisterDeviceOptions struct {
+	// GroupName is the name shown for this device registration in
+	// /proc/devices. If GroupName is empty, this registration will not be
+	// shown in /proc/devices.
+	GroupName string
+}
+
+// RegisterDevice registers the given Device in vfs with the given major and
+// minor device numbers.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+	tup := devTuple{kind, major, minor}
+	vfs.devicesMu.Lock()
+	defer vfs.devicesMu.Unlock()
+	if existing, ok := vfs.devices[tup]; ok {
+		return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+	}
+	vfs.devices[tup] = &registeredDevice{
+		dev:  dev,
+		opts: *opts,
+	}
+	return nil
+}
+
+// OpenDeviceSpecialFile returns a FileDescription representing the given
+// device.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+	tup := devTuple{kind, major, minor}
+	vfs.devicesMu.RLock()
+	defer vfs.devicesMu.RUnlock()
+	rd, ok := vfs.devices[tup]
+	if !ok {
+		return nil, syserror.ENXIO
+	}
+	return rd.dev.Open(ctx, mnt, d, *opts)
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 6575afd16..6afe280bc 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -20,8 +20,10 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -38,49 +40,58 @@ type FileDescription struct {
 	// operations.
 	refs int64
 
+	// statusFlags contains status flags, "initialized by open(2) and possibly
+	// modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
+	// memory operations.
+	statusFlags uint32
+
 	// vd is the filesystem location at which this FileDescription was opened.
 	// A reference is held on vd. vd is immutable.
 	vd VirtualDentry
 
+	opts FileDescriptionOptions
+
 	// impl is the FileDescriptionImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in FileDescription.
 	impl FileDescriptionImpl
 }
 
-// Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller.
-func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
+// FileDescriptionOptions contains options to FileDescription.Init().
+type FileDescriptionOptions struct {
+	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
+	// usually only the case if O_DIRECT would actually have an effect.
+	AllowDirectIO bool
+
+	// If UseDentryMetadata is true, calls to FileDescription methods that
+	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
+	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+	// the corresponding FilesystemImpl methods instead of the corresponding
+	// FileDescriptionImpl methods.
+	//
+	// UseDentryMetadata is intended for file descriptions that are implemented
+	// outside of individual filesystems, such as pipes, sockets, and device
+	// special files. FileDescriptions for which UseDentryMetadata is true may
+	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
+	// implementations of FileDescriptionImpl methods that should not be
+	// called.
+	UseDentryMetadata bool
+}
+
+// Init must be called before first use of fd. It takes references on mnt and
+// d. statusFlags is the initial file description status flags, which is
+// usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
 	fd.refs = 1
+	fd.statusFlags = statusFlags | linux.O_LARGEFILE
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
 	}
+	fd.vd.IncRef()
+	fd.opts = *opts
 	fd.impl = impl
 }
 
-// Impl returns the FileDescriptionImpl associated with fd.
-func (fd *FileDescription) Impl() FileDescriptionImpl {
-	return fd.impl
-}
-
-// Mount returns the mount on which fd was opened. It does not take a reference
-// on the returned Mount.
-func (fd *FileDescription) Mount() *Mount {
-	return fd.vd.mount
-}
-
-// Dentry returns the dentry at which fd was opened. It does not take a
-// reference on the returned Dentry.
-func (fd *FileDescription) Dentry() *Dentry {
-	return fd.vd.dentry
-}
-
-// VirtualDentry returns the location at which fd was opened. It does not take
-// a reference on the returned VirtualDentry.
-func (fd *FileDescription) VirtualDentry() VirtualDentry {
-	return fd.vd
-}
-
 // IncRef increments fd's reference count.
 func (fd *FileDescription) IncRef() {
 	atomic.AddInt64(&fd.refs, 1)
@@ -112,6 +123,82 @@ func (fd *FileDescription) DecRef() {
 	}
 }
 
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+	return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+	return fd.vd.dentry
+}
+
+// VirtualDentry returns the location at which fd was opened. It does not take
+// a reference on the returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+	return fd.vd
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags() uint32 {
+	return atomic.LoadUint32(&fd.statusFlags)
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
+	// Compare Linux's fs/fcntl.c:setfl().
+	oldFlags := fd.StatusFlags()
+	// Linux documents this check as "O_APPEND cannot be cleared if the file is
+	// marked as append-only and the file is open for write", which would make
+	// sense. However, the check as actually implemented seems to be "O_APPEND
+	// cannot be changed if the file is marked as append-only".
+	if (flags^oldFlags)&linux.O_APPEND != 0 {
+		stat, err := fd.Stat(ctx, StatOptions{
+			// There is no mask bit for stx_attributes.
+			Mask: 0,
+			// Linux just reads inode::i_flags directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
+			return syserror.EPERM
+		}
+	}
+	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
+		stat, err := fd.Stat(ctx, StatOptions{
+			Mask: linux.STATX_UID,
+			// Linux's inode_owner_or_capable() just reads inode::i_uid
+			// directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if stat.Mask&linux.STATX_UID == 0 {
+			return syserror.EPERM
+		}
+		if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
+			return syserror.EPERM
+		}
+	}
+	if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
+		return syserror.EINVAL
+	}
+	// TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+	return nil
+}
+
+// Impl returns the FileDescriptionImpl associated with fd.
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+	return fd.impl
+}
+
 // FileDescriptionImpl contains implementation details for an FileDescription.
 // Implementations of FileDescriptionImpl should contain their associated
 // FileDescription by value as their first field.
@@ -120,6 +207,8 @@ func (fd *FileDescription) DecRef() {
 // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
 // auth.KGID respectively).
 //
+// All methods may return errors not specified.
+//
 // FileDescriptionImpl is analogous to Linux's struct file_operations.
 type FileDescriptionImpl interface {
 	// Release is called when the associated FileDescription reaches zero
@@ -131,14 +220,6 @@ type FileDescriptionImpl interface {
 	// prevent the file descriptor from being closed.
 	OnClose(ctx context.Context) error
 
-	// StatusFlags returns file description status flags, as for
-	// fcntl(F_GETFL).
-	StatusFlags(ctx context.Context) (uint32, error)
-
-	// SetStatusFlags sets file description status flags, as for
-	// fcntl(F_SETFL).
-	SetStatusFlags(ctx context.Context, flags uint32) error
-
 	// Stat returns metadata for the file represented by the FileDescription.
 	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
 
@@ -156,6 +237,10 @@ type FileDescriptionImpl interface {
 	// PRead reads from the file into dst, starting at the given offset, and
 	// returns the number of bytes read. PRead is permitted to return partial
 	// reads with a nil error.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -165,6 +250,10 @@ type FileDescriptionImpl interface {
 	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
 	// with Regular File Operations" requires that all operations that may
 	// mutate the FileDescription offset are serialized.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
 	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
 
 	// PWrite writes src to the file, starting at the given offset, and returns
@@ -174,6 +263,11 @@ type FileDescriptionImpl interface {
 	// As in Linux (but not POSIX), if O_APPEND is in effect for the
 	// FileDescription, PWrite should ignore the offset and append data to the
 	// end of the file.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PWrite returns
+	// EOPNOTSUPP.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -183,6 +277,10 @@ type FileDescriptionImpl interface {
 	// PWrite that uses a FileDescription offset, to make it possible for
 	// remote filesystems to implement O_APPEND correctly (i.e. atomically with
 	// respect to writers outside the scope of VFS).
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
 	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
 
 	// IterDirents invokes cb on each entry in the directory represented by the
@@ -212,7 +310,21 @@ type FileDescriptionImpl interface {
 	// Ioctl implements the ioctl(2) syscall.
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
-	// TODO: extended attributes; file locking
+	// Listxattr returns all extended attribute names for the file.
+	Listxattr(ctx context.Context) ([]string, error)
+
+	// Getxattr returns the value associated with the given extended attribute
+	// for the file.
+	Getxattr(ctx context.Context, name string) (string, error)
+
+	// Setxattr changes the value associated with the given extended attribute
+	// for the file.
+	Setxattr(ctx context.Context, opts SetxattrOptions) error
+
+	// Removexattr removes the given extended attribute from the file.
+	Removexattr(ctx context.Context, name string) error
+
+	// TODO: file locking
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
@@ -249,31 +361,49 @@ func (fd *FileDescription) OnClose(ctx context.Context) error {
 	return fd.impl.OnClose(ctx)
 }
 
-// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	flags, err := fd.impl.StatusFlags(ctx)
-	flags |= linux.O_LARGEFILE
-	return flags, err
-}
-
-// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return fd.impl.SetStatusFlags(ctx, flags)
-}
-
 // Stat returns metadata for the file represented by fd.
 func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return stat, err
+	}
 	return fd.impl.Stat(ctx, opts)
 }
 
 // SetStat updates metadata for the file represented by fd.
 func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.SetStat(ctx, opts)
 }
 
 // StatFS returns metadata for the filesystem containing the file represented
 // by fd.
 func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+		vfsObj.putResolvingPath(rp)
+		return statfs, err
+	}
 	return fd.impl.StatFS(ctx)
 }
 
@@ -329,6 +459,78 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 	return fd.impl.Ioctl(ctx, uio, args)
 }
 
+// Listxattr returns all extended attribute names for the file represented by
+// fd.
+func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+		vfsObj.putResolvingPath(rp)
+		return names, err
+	}
+	names, err := fd.impl.Listxattr(ctx)
+	if err == syserror.ENOTSUP {
+		// Linux doesn't actually return ENOTSUP in this case; instead,
+		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+		// subsystem to return security extended attributes, which by default
+		// don't exist.
+		return nil, nil
+	}
+	return names, err
+}
+
+// Getxattr returns the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		vfsObj.putResolvingPath(rp)
+		return val, err
+	}
+	return fd.impl.Getxattr(ctx, name)
+}
+
+// Setxattr changes the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
+	return fd.impl.Setxattr(ctx, opts)
+}
+
+// Removexattr removes the given extended attribute from the file represented
+// by fd.
+func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
+	return fd.impl.Removexattr(ctx, name)
+}
+
 // SyncFS instructs the filesystem containing fd to execute the semantics of
 // syncfs(2).
 func (fd *FileDescription) SyncFS(ctx context.Context) error {
@@ -347,7 +549,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string {
 
 // DeviceID implements memmap.MappingIdentity.DeviceID.
 func (fd *FileDescription) DeviceID() uint64 {
-	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+	stat, err := fd.Stat(context.Background(), StatOptions{
 		// There is no STATX_DEV; we assume that Stat will return it if it's
 		// available regardless of mask.
 		Mask: 0,
@@ -363,7 +565,7 @@ func (fd *FileDescription) DeviceID() uint64 {
 
 // InodeID implements memmap.MappingIdentity.InodeID.
 func (fd *FileDescription) InodeID() uint64 {
-	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+	stat, err := fd.Stat(context.Background(), StatOptions{
 		Mask: linux.STATX_INO,
 		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
 		Sync: linux.AT_STATX_DONT_SYNC,
@@ -376,5 +578,5 @@ func (fd *FileDescription) InodeID() uint64 {
 
 // Msync implements memmap.MappingIdentity.Msync.
 func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
-	return fd.impl.Sync(ctx)
+	return fd.Sync(ctx)
 }
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index aae023254..66eb57bc2 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 	return 0, syserror.ENOTTY
 }
 
+// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// inode_operations::listxattr == NULL in Linux.
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+	// This isn't exactly accurate; see FileDescription.Listxattr.
+	return nil, syserror.ENOTSUP
+}
+
+// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+	return "", syserror.ENOTSUP
+}
+
+// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	return syserror.ENOTSUP
+}
+
+// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+	return syserror.ENOTSUP
+}
+
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
@@ -152,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
 	return 0, syserror.EISDIR
 }
 
+// DentryMetadataFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
+// true to obtain implementations of Stat and SetStat that panic.
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
 // DynamicBytesFileDescriptionImpl may be embedded by implementations of
 // FileDescriptionImpl that represent read-only regular files whose contents
 // are backed by a bytes.Buffer that is regenerated when necessary, consistent
@@ -174,6 +214,17 @@ type DynamicBytesSource interface {
 	Generate(ctx context.Context, buf *bytes.Buffer) error
 }
 
+// StaticData implements DynamicBytesSource over a static string.
+type StaticData struct {
+	Data string
+}
+
+// Generate implements DynamicBytesSource.
+func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(s.Data)
+	return nil
+}
+
 // SetDataSource must be called exactly once on fd before first use.
 func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
 	fd.data = data
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index ac7799296..9ed58512f 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -48,7 +48,7 @@ type genCountFD struct {
 
 func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
 	var fd genCountFD
-	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
 	fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
 	return &fd.vfsfd
 }
@@ -89,7 +89,7 @@ func TestGenCountFD(t *testing.T) {
 	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := New() // vfs.New()
-	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
+	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create testfs root mount: %v", err)
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 8011eba3f..ea78f555b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -108,6 +108,24 @@ func (fs *Filesystem) DecRef() {
 // (responsible for actually implementing the operation) isn't known until path
 // resolution is complete.
 //
+// Unless otherwise specified, FilesystemImpl methods are responsible for
+// performing permission checks. In many cases, vfs package functions in
+// permissions.go may be used to help perform these checks.
+//
+// When multiple specified error conditions apply to a given method call, the
+// implementation may return any applicable errno unless otherwise specified,
+// but returning the earliest error specified is preferable to maximize
+// compatibility with Linux.
+//
+// All methods may return errors not specified, notably including:
+//
+// - ENOENT if a required path component does not exist.
+//
+// - ENOTDIR if an intermediate path component is not a directory.
+//
+// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
+// Mount.CheckBeginWrite(), permission-checking functions, etc.)
+//
 // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
 // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
 // and auth.KGID respectively).
@@ -130,46 +148,223 @@ type FilesystemImpl interface {
 	// GetDentryAt does not correspond directly to a Linux syscall; it is used
 	// in the implementation of:
 	//
-	// - Syscalls that need to resolve two paths: rename(), renameat(),
-	// renameat2(), link(), linkat().
+	// - Syscalls that need to resolve two paths: link(), linkat().
 	//
 	// - Syscalls that need to refer to a filesystem position outside the
 	// context of a file description: chdir(), fchdir(), chroot(), mount(),
 	// umount().
 	GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
 
+	// GetParentDentryAt returns a Dentry representing the directory at the
+	// second-to-last path component in rp. (Note that, despite the name, this
+	// is not necessarily the parent directory of the file at rp, since the
+	// last path component in rp may be "." or "..".) A reference is taken on
+	// the returned Dentry.
+	//
+	// GetParentDentryAt does not correspond directly to a Linux syscall; it is
+	// used in the implementation of the rename() family of syscalls, which
+	// must resolve the parent directories of two paths.
+	//
+	// Preconditions: !rp.Done().
+	//
+	// Postconditions: If GetParentDentryAt returns a nil error, then
+	// rp.Final(). If GetParentDentryAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)
+
 	// LinkAt creates a hard link at rp representing the same file as vd. It
 	// does not take ownership of references on vd.
 	//
-	// The implementation is responsible for checking that vd.Mount() ==
-	// rp.Mount(), and that vd does not represent a directory.
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", LinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, LinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), LinkAt returns ENOENT.
+	//
+	// - If the directory in which the link would be created has been removed
+	// by RmdirAt or RenameAt, LinkAt returns ENOENT.
+	//
+	// - If rp.Mount != vd.Mount(), LinkAt returns EXDEV.
+	//
+	// - If vd represents a directory, LinkAt returns EPERM.
+	//
+	// - If vd represents a file for which all existing links have been
+	// removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
+	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
+	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If LinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
 
 	// MkdirAt creates a directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MkdirAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MkdirAt returns EEXIST.
+	//
+	// - If the directory in which the new directory would be created has been
+	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MkdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
 
 	// MknodAt creates a regular file, device special file, or named pipe at
 	// rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MknodAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MknodAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), MknodAt returns ENOENT.
+	//
+	// - If the directory in which the file would be created has been removed
+	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MknodAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
 
 	// OpenAt returns an FileDescription providing access to the file at rp. A
 	// reference is taken on the returned FileDescription.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
+	// the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
+	// features are silently ignored, consistently with Linux's open*(2).)
 	OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
 
 	// ReadlinkAt returns the target of the symbolic link at rp.
+	//
+	// Errors:
+	//
+	// - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
 	ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
 
-	// RenameAt renames the Dentry represented by vd to rp. It does not take
-	// ownership of references on vd.
+	// RenameAt renames the file named oldName in directory oldParentVD to rp.
+	// It does not take ownership of references on oldParentVD.
+	//
+	// Errors [1]:
+	//
+	// - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags
+	// contains RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags does
+	// not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
+	//
+	// - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV.
+	//
+	// - If the renamed file is not a directory, and opts.MustBeDir is true,
+	// RenameAt returns ENOTDIR.
+	//
+	// - If renaming would replace an existing file and opts.Flags contains
+	// RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If there is no existing file at rp and opts.Flags contains
+	// RENAME_EXCHANGE, RenameAt returns ENOENT.
+	//
+	// - If there is an existing non-directory file at rp, and rp.MustBeDir()
+	// is true, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, opts.Flags does not contain
+	// RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
+	// (This check is not subsumed by the check for directory replacement below
+	// since it applies even if there is no file to replace.)
+	//
+	// - If the renamed file is a directory, and the new parent directory of
+	// the renamed file is either the renamed directory or a descendant
+	// subdirectory of the renamed directory, RenameAt returns EINVAL.
 	//
-	// The implementation is responsible for checking that vd.Mount() ==
-	// rp.Mount().
-	RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
+	// - If renaming would exchange the renamed file with an ancestor directory
+	// of the renamed file, RenameAt returns EINVAL.
+	//
+	// - If renaming would replace an ancestor directory of the renamed file,
+	// RenameAt returns ENOTEMPTY. (This check would be subsumed by the
+	// non-empty directory check below; however, this check takes place before
+	// the self-rename check.)
+	//
+	// - If the renamed file would replace or exchange with itself (i.e. the
+	// source and destination paths resolve to the same file), RenameAt returns
+	// nil, skipping the checks described below.
+	//
+	// - If the source or destination directory is not writable by the provider
+	// of rp.Credentials(), RenameAt returns EACCES.
+	//
+	// - If the renamed file is a directory, and renaming would replace a
+	// non-directory file, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, and renaming would replace a
+	// directory, RenameAt returns EISDIR.
+	//
+	// - If the new parent directory of the renamed file has been removed by
+	// RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
+	//
+	// - If the renamed file is a directory, it is not writable by the
+	// provider of rp.Credentials(), and the source and destination parent
+	// directories are different, RenameAt returns EACCES. (This is nominally
+	// required to change the ".." entry in the renamed directory.)
+	//
+	// - If renaming would replace a non-empty directory, RenameAt returns
+	// ENOTEMPTY.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+	//
+	// Postconditions: If RenameAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	//
+	// [1] "The worst of all namespace operations - renaming directory.
+	// "Perverted" doesn't even start to describe it. Somebody in UCB had a
+	// heck of a trip..." - fs/namei.c:vfs_rename()
+	RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error
 
 	// RmdirAt removes the directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is ".", RmdirAt returns EINVAL.
+	//
+	// - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
+	//
+	// - If no file exists at rp, RmdirAt returns ENOENT.
+	//
+	// - If the file at rp exists but is not a directory, RmdirAt returns
+	// ENOTDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If RmdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	RmdirAt(ctx context.Context, rp *ResolvingPath) error
 
 	// SetStatAt updates metadata for the file at the given path.
+	//
+	// Errors:
+	//
+	// - If opts specifies unsupported options, SetStatAt returns EINVAL.
 	SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
 
 	// StatAt returns metadata for the file at rp.
@@ -181,11 +376,82 @@ type FilesystemImpl interface {
 	StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
 
 	// SymlinkAt creates a symbolic link at rp referring to the given target.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", SymlinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, SymlinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), SymlinkAt returns ENOENT.
+	//
+	// - If the directory in which the symbolic link would be created has been
+	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If SymlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
 
-	// UnlinkAt removes the non-directory file at rp.
+	// UnlinkAt removes the file at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", UnlinkAt returns
+	// EISDIR.
+	//
+	// - If no file exists at rp, UnlinkAt returns ENOENT.
+	//
+	// - If rp.MustBeDir(), and the file at rp exists and is not a directory,
+	// UnlinkAt returns ENOTDIR.
+	//
+	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If UnlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
+	// ListxattrAt returns all extended attribute names for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// ListxattrAt returns nil. (See FileDescription.Listxattr for an
+	// explanation.)
+	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+
+	// GetxattrAt returns the value associated with the given extended
+	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, GetxattrAt
+	// returns ENOTSUP.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+
+	// SetxattrAt changes the value associated with the given extended
+	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, SetxattrAt
+	// returns ENOTSUP.
+	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+
+	// RemovexattrAt removes the given extended attribute from the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// RemovexattrAt returns ENOTSUP.
+	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
 	//
 	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
@@ -208,7 +474,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: extended attributes; inotify_add_watch(); bind()
+	// TODO: inotify_add_watch(); bind()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c335e206d..023301780 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -15,6 +15,7 @@
 package vfs
 
 import (
+	"bytes"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -43,28 +44,70 @@ type GetFilesystemOptions struct {
 	InternalData interface{}
 }
 
+type registeredFilesystemType struct {
+	fsType FilesystemType
+	opts   RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystem().
+type RegisterFilesystemTypeOptions struct {
+	// If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+	// for which MountOptions.InternalMount == false to use this filesystem
+	// type.
+	AllowUserMount bool
+
+	// If AllowUserList is true, make this filesystem type visible in
+	// /proc/filesystems.
+	AllowUserList bool
+
+	// If RequiresDevice is true, indicate that mounting this filesystem
+	// requires a block device as the mount source in /proc/filesystems.
+	RequiresDevice bool
+}
+
 // RegisterFilesystemType registers the given FilesystemType in vfs with the
 // given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
 	vfs.fsTypesMu.Lock()
 	defer vfs.fsTypesMu.Unlock()
 	if existing, ok := vfs.fsTypes[name]; ok {
-		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+	}
+	vfs.fsTypes[name] = &registeredFilesystemType{
+		fsType: fsType,
+		opts:   *opts,
 	}
-	vfs.fsTypes[name] = fsType
 	return nil
 }
 
 // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
 // panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
-	if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+	if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
 		panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
 	}
 }
 
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
 	vfs.fsTypesMu.RLock()
 	defer vfs.fsTypesMu.RUnlock()
 	return vfs.fsTypes[name]
 }
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+	vfs.fsTypesMu.RLock()
+	defer vfs.fsTypesMu.RUnlock()
+	for name, rft := range vfs.fsTypes {
+		if !rft.opts.AllowUserList {
+			continue
+		}
+		var nodev string
+		if !rft.opts.RequiresDevice {
+			nodev = "nodev"
+		}
+		fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+	}
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index ec23ab0dd..00177b371 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -112,11 +112,11 @@ type MountNamespace struct {
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
-	fsType := vfs.getFilesystemType(fsTypeName)
-	if fsType == nil {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return nil, err
 	}
@@ -136,11 +136,14 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 
 // MountAt creates and mounts a Filesystem configured by the given arguments.
 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
-	fsType := vfs.getFilesystemType(fsTypeName)
-	if fsType == nil {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
+	if !opts.InternalMount && !rft.opts.AllowUserMount {
+		return syserror.ENODEV
+	}
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3ecbc8fc1..b7774bf28 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -50,6 +50,10 @@ type MknodOptions struct {
 type MountOptions struct {
 	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
 	GetFilesystemOptions GetFilesystemOptions
+
+	// If InternalMount is true, allow the use of filesystem types for which
+	// RegisterFilesystemTypeOptions.AllowUserMount == false.
+	InternalMount bool
 }
 
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
@@ -83,6 +87,9 @@ type ReadOptions struct {
 type RenameOptions struct {
 	// Flags contains flags as specified for renameat2(2).
 	Flags uint32
+
+	// If MustBeDir is true, the renamed file must be a directory.
+	MustBeDir bool
 }
 
 // SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
@@ -101,6 +108,20 @@ type SetStatOptions struct {
 	Stat linux.Statx
 }
 
+// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
+// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
+// FileDescriptionImpl.Setxattr().
+type SetxattrOptions struct {
+	// Name is the name of the extended attribute being mutated.
+	Name string
+
+	// Value is the extended attribute's new value.
+	Value string
+
+	// Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
+	Flags uint32
+}
+
 // StatOptions contains options to VirtualFilesystem.StatAt(),
 // FilesystemImpl.StatAt(), FileDescription.Stat(), and
 // FileDescriptionImpl.Stat().
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 621f5a6f8..f0641d314 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -85,11 +85,11 @@ func init() {
 // so error "constants" are really mutable vars, necessitating somewhat
 // expensive interface object comparisons.
 
-type resolveMountRootError struct{}
+type resolveMountRootOrJumpError struct{}
 
 // Error implements error.Error.
-func (resolveMountRootError) Error() string {
-	return "resolving mount root"
+func (resolveMountRootOrJumpError) Error() string {
+	return "resolving mount root or jump"
 }
 
 type resolveMountPointError struct{}
@@ -112,30 +112,26 @@ var resolvingPathPool = sync.Pool{
 	},
 }
 
-func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
-	path, err := fspath.Parse(pop.Pathname)
-	if err != nil {
-		return nil, err
-	}
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
 	rp := resolvingPathPool.Get().(*ResolvingPath)
 	rp.vfs = vfs
 	rp.root = pop.Root
 	rp.mount = pop.Start.mount
 	rp.start = pop.Start.dentry
-	rp.pit = path.Begin
+	rp.pit = pop.Path.Begin
 	rp.flags = 0
 	if pop.FollowFinalSymlink {
 		rp.flags |= rpflagsFollowFinalSymlink
 	}
-	rp.mustBeDir = path.Dir
-	rp.mustBeDirOrig = path.Dir
+	rp.mustBeDir = pop.Path.Dir
+	rp.mustBeDirOrig = pop.Path.Dir
 	rp.symlinks = 0
 	rp.curPart = 0
 	rp.numOrigParts = 1
 	rp.creds = creds
-	rp.parts[0] = path.Begin
-	rp.origParts[0] = path.Begin
-	return rp, nil
+	rp.parts[0] = pop.Path.Begin
+	rp.origParts[0] = pop.Path.Begin
+	return rp
 }
 
 func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
@@ -274,7 +270,7 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
 			// ... of non-root mount.
 			rp.nextMount = vd.mount
 			rp.nextStart = vd.dentry
-			return nil, resolveMountRootError{}
+			return nil, resolveMountRootOrJumpError{}
 		}
 		// ... of root mount.
 		parent = d
@@ -345,29 +341,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool {
 // symlink target and returns nil. Otherwise it returns a non-nil error.
 //
 // Preconditions: !rp.Done().
+//
+// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
 func (rp *ResolvingPath) HandleSymlink(target string) error {
 	if rp.symlinks >= linux.MaxSymlinkTraversals {
 		return syserror.ELOOP
 	}
-	targetPath, err := fspath.Parse(target)
-	if err != nil {
-		return err
+	if len(target) == 0 {
+		return syserror.ENOENT
 	}
 	rp.symlinks++
+	targetPath := fspath.Parse(target)
 	if targetPath.Absolute {
 		rp.absSymlinkTarget = targetPath
 		return resolveAbsSymlinkError{}
 	}
-	if !targetPath.Begin.Ok() {
-		panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
-	}
 	// Consume the path component that represented the symlink.
 	rp.Advance()
 	// Prepend the symlink target to the relative path.
+	if checkInvariants {
+		if !targetPath.HasComponents() {
+			panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
+		}
+	}
 	rp.relpathPrepend(targetPath)
 	return nil
 }
 
+// Preconditions: path.HasComponents().
 func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
 	if rp.pit.Ok() {
 		rp.parts[rp.curPart] = rp.pit
@@ -385,11 +386,32 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
 	}
 }
 
+// HandleJump is called when the current path component is a "magic" link to
+// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem
+// method should continue path traversal, HandleMagicSymlink updates the path
+// component stream to reflect the magic link target and returns nil. Otherwise
+// it returns a non-nil error.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
+	if rp.symlinks >= linux.MaxSymlinkTraversals {
+		return syserror.ELOOP
+	}
+	rp.symlinks++
+	// Consume the path component that represented the magic link.
+	rp.Advance()
+	// Unconditionally return a resolveMountRootOrJumpError, even if the Mount
+	// isn't changing, to force restarting at the new Dentry.
+	target.IncRef()
+	rp.nextMount = target.mount
+	rp.nextStart = target.dentry
+	return resolveMountRootOrJumpError{}
+}
+
 func (rp *ResolvingPath) handleError(err error) bool {
 	switch err.(type) {
-	case resolveMountRootError:
-		// Switch to the new Mount. We hold references on the Mount and Dentry
-		// (from VFS.getMountpointAt()).
+	case resolveMountRootOrJumpError:
+		// Switch to the new Mount. We hold references on the Mount and Dentry.
 		rp.decRefStartAndMount()
 		rp.mount = rp.nextMount
 		rp.start = rp.nextStart
@@ -407,9 +429,8 @@ func (rp *ResolvingPath) handleError(err error) bool {
 		return true
 
 	case resolveMountPointError:
-		// Switch to the new Mount. We hold a reference on the Mount (from
-		// VFS.getMountAt()), but borrow the reference on the mount root from
-		// the Mount.
+		// Switch to the new Mount. We hold a reference on the Mount, but
+		// borrow the reference on the mount root from the Mount.
 		rp.decRefStartAndMount()
 		rp.mount = rp.nextMount
 		rp.start = rp.nextMount.root
@@ -447,6 +468,17 @@ func (rp *ResolvingPath) handleError(err error) bool {
 	}
 }
 
+// canHandleError returns true if err is an error returned by rp.Resolve*()
+// that rp.handleError() may attempt to handle.
+func (rp *ResolvingPath) canHandleError(err error) bool {
+	switch err.(type) {
+	case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError:
+		return true
+	default:
+		return false
+	}
+}
+
 // MustBeDir returns true if the file traversed by rp must be a directory.
 func (rp *ResolvingPath) MustBeDir() bool {
 	return rp.mustBeDir
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 7a1d9e383..ee5c8b9e2 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -57,6 +57,11 @@ func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath,
 	return nil, syserror.EPERM
 }
 
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+	return nil, syserror.EPERM
+}
+
 // LinkAt implements FilesystemImpl.LinkAt.
 func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
 	return syserror.EPERM
@@ -83,7 +88,7 @@ func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (
 }
 
 // RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
+func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
 	return syserror.EPERM
 }
 
@@ -117,6 +122,26 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
 	return syserror.EPERM
 }
 
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+	return nil, syserror.EPERM
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+	return "", syserror.EPERM
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+	return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+	return syserror.EPERM
+}
+
 // PrependPath implements FilesystemImpl.PrependPath.
 func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
 	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 7262b0d0a..ea2db7031 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -28,9 +28,11 @@
 package vfs
 
 import (
+	"fmt"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -73,23 +75,29 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// devices contains all registered Devices. devices is protected by
+	// devicesMu.
+	devicesMu sync.RWMutex
+	devices   map[devTuple]*registeredDevice
+
+	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+	// fsTypesMu.
+	fsTypesMu sync.RWMutex
+	fsTypes   map[string]*registeredFilesystemType
+
 	// filesystems contains all Filesystems. filesystems is protected by
 	// filesystemsMu.
 	filesystemsMu sync.Mutex
 	filesystems   map[*Filesystem]struct{}
-
-	// fsTypes contains all FilesystemTypes that are usable in the
-	// VirtualFilesystem. fsTypes is protected by fsTypesMu.
-	fsTypesMu sync.RWMutex
-	fsTypes   map[string]FilesystemType
 }
 
 // New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		devices:     make(map[devTuple]*registeredDevice),
+		fsTypes:     make(map[string]*registeredFilesystemType),
 		filesystems: make(map[*Filesystem]struct{}),
-		fsTypes:     make(map[string]FilesystemType),
 	}
 	vfs.mounts.Init()
 	return vfs
@@ -111,11 +119,11 @@ type PathOperation struct {
 	// are borrowed from the provider of the PathOperation (i.e. the caller of
 	// the VFS method to which the PathOperation was passed).
 	//
-	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
+	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
 	Start VirtualDentry
 
 	// Path is the pathname traversed by this operation.
-	Pathname string
+	Path fspath.Path
 
 	// If FollowFinalSymlink is true, and the Dentry traversed by the final
 	// path component represents a symbolic link, the symbolic link should be
@@ -126,10 +134,7 @@ type PathOperation struct {
 // GetDentryAt returns a VirtualDentry representing the given path, at which a
 // file must exist. A reference is taken on the returned VirtualDentry.
 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return VirtualDentry{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
 		if err == nil {
@@ -148,6 +153,33 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
 	}
 }
 
+// Preconditions: pop.Path.Begin.Ok().
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
+		if err == nil {
+			parentVD := VirtualDentry{
+				mount:  rp.mount,
+				dentry: parent,
+			}
+			rp.mount.IncRef()
+			name := rp.Component()
+			vfs.putResolvingPath(rp)
+			return parentVD, name, nil
+		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, "", err
+		}
+	}
+}
+
 // LinkAt creates a hard link at newpop representing the existing file at
 // oldpop.
 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
@@ -155,21 +187,36 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 	if err != nil {
 		return err
 	}
-	rp, err := vfs.getResolvingPath(creds, newpop)
-	if err != nil {
+
+	if !newpop.Path.Begin.Ok() {
 		oldVD.DecRef()
-		return err
+		if newpop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if newpop.FollowFinalSymlink {
+		oldVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
 	for {
 		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
 		if err == nil {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldVD.DecRef()
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldVD.DecRef()
 			return err
 		}
 	}
@@ -177,19 +224,32 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 
 // MkdirAt creates a directory at the given path.
 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
 	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
 	// also honored." - mkdir(2)
 	opts.Mode &= 0777 | linux.S_ISVTX
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -200,16 +260,29 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 // MknodAt creates a file of the given mode at the given path. It returns an
 // error from the syserror package.
 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
+		if err != nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
-		// Handle mount traversals.
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -259,10 +332,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 	if opts.Flags&linux.O_NOFOLLOW != 0 {
 		pop.FollowFinalSymlink = false
 	}
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	if opts.Flags&linux.O_DIRECTORY != 0 {
 		rp.mustBeDir = true
 		rp.mustBeDirOrig = true
@@ -282,10 +352,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 
 // ReadlinkAt returns the target of the symbolic link at the given path.
 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return "", err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
 		if err == nil {
@@ -301,25 +368,59 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden
 
 // RenameAt renames the file at oldpop to newpop.
 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
-	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
-	if err != nil {
-		return err
+	if !oldpop.Path.Begin.Ok() {
+		if oldpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if oldpop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
+		return syserror.EINVAL
 	}
-	rp, err := vfs.getResolvingPath(creds, newpop)
+
+	oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
 	if err != nil {
-		oldVD.DecRef()
 		return err
 	}
+	if oldName == "." || oldName == ".." {
+		oldParentVD.DecRef()
+		return syserror.EBUSY
+	}
+
+	if !newpop.Path.Begin.Ok() {
+		oldParentVD.DecRef()
+		if newpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if newpop.FollowFinalSymlink {
+		oldParentVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
+	renameOpts := *opts
+	if oldpop.Path.Dir {
+		renameOpts.MustBeDir = true
+	}
 	for {
-		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts)
+		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
 		if err == nil {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldParentVD.DecRef()
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldParentVD.DecRef()
 			return err
 		}
 	}
@@ -327,16 +428,29 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 
 // RmdirAt removes the directory at the given path.
 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -346,10 +460,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
 
 // SetStatAt changes metadata for the file at the given path.
 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
 		if err == nil {
@@ -365,10 +476,7 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent
 
 // StatAt returns metadata for the file at the given path.
 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statx{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
 		if err == nil {
@@ -385,10 +493,7 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential
 // StatFSAt returns metadata for the filesystem containing the file at the
 // given path.
 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statfs{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
 		if err == nil {
@@ -404,16 +509,29 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
 
 // SymlinkAt creates a symbolic link at the given path with the given target.
 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
 	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -423,16 +541,104 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 
 // UnlinkAt deletes the non-directory file at the given path.
 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
 	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// ListxattrAt returns all extended attribute names for the file at the given
+// path.
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return names, nil
+		}
+		if err == syserror.ENOTSUP {
+			// Linux doesn't actually return ENOTSUP in this case; instead,
+			// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+			// subsystem to return security extended attributes, which by
+			// default don't exist.
+			vfs.putResolvingPath(rp)
+			return nil, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// GetxattrAt returns the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return val, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// SetxattrAt changes the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RemovexattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err