156 files changed, 7417 insertions, 1561 deletions
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 3f9772b87..c35faeb4c 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -56,15 +56,10 @@ type ExecArgs struct {
 
 	// MountNamespace is the mount namespace to execute the new process in.
 	// A reference on MountNamespace must be held for the lifetime of the
-	// ExecArgs. If MountNamespace is nil, it will default to the kernel's
-	// root MountNamespace.
+	// ExecArgs. If MountNamespace is nil, it will default to the init
+	// process's MountNamespace.
 	MountNamespace *fs.MountNamespace
 
-	// Root defines the root directory for the new process. A reference on
-	// Root must be held for the lifetime of the ExecArgs. If Root is nil,
-	// it will default to the VFS root.
-	Root *fs.Dirent
-
 	// WorkingDirectory defines the working directory for the new process.
 	WorkingDirectory string `json:"wd"`
 
@@ -155,7 +150,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		Envv:                    args.Envv,
 		WorkingDirectory:        args.WorkingDirectory,
 		MountNamespace:          args.MountNamespace,
-		Root:                    args.Root,
 		Credentials:             creds,
 		FDTable:                 fdTable,
 		Umask:                   0022,
@@ -167,11 +161,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		ContainerID:             args.ContainerID,
 		PIDNamespace:            args.PIDNamespace,
 	}
-	if initArgs.Root != nil {
-		// initArgs must hold a reference on Root, which will be
-		// donated to the new process in CreateProcess.
-		initArgs.Root.IncRef()
-	}
 	if initArgs.MountNamespace != nil {
 		// initArgs must hold a reference on MountNamespace, which will
 		// be donated to the new process in CreateProcess.
@@ -184,7 +173,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		paths := fs.GetPath(initArgs.Envv)
 		mns := initArgs.MountNamespace
 		if mns == nil {
-			mns = proc.Kernel.RootMountNamespace()
+			mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
 		}
 		f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
 		if err != nil {
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index 9fc6a5bc2..4f3d6410e 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 {
 	}
 }
 
+// ToDirentType converts an InodeType to a linux dirent type field.
+func ToDirentType(nodeType InodeType) uint8 {
+	switch nodeType {
+	case RegularFile, SpecialFile:
+		return linux.DT_REG
+	case Symlink:
+		return linux.DT_LNK
+	case Directory, SpecialDirectory:
+		return linux.DT_DIR
+	case Pipe:
+		return linux.DT_FIFO
+	case CharacterDevice:
+		return linux.DT_CHR
+	case BlockDevice:
+		return linux.DT_BLK
+	case Socket:
+		return linux.DT_SOCK
+	default:
+		return linux.DT_UNKNOWN
+	}
+}
+
+// ToInodeType converts a linux file type to InodeType. It panics on unknown types.
+func ToInodeType(linuxFileType linux.FileMode) InodeType {
+	switch linuxFileType {
+	case linux.ModeRegular:
+		return RegularFile
+	case linux.ModeDirectory:
+		return Directory
+	case linux.ModeSymlink:
+		return Symlink
+	case linux.ModeNamedPipe:
+		return Pipe
+	case linux.ModeCharacterDevice:
+		return CharacterDevice
+	case linux.ModeBlockDevice:
+		return BlockDevice
+	case linux.ModeSocket:
+		return Socket
+	default:
+		panic(fmt.Sprintf("unknown file mode: %d", linuxFileType))
+	}
+}
+
 // StableAttr contains Inode attributes that will be stable throughout the
 // lifetime of the Inode.
 //
diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go
index 51b4c7ee1..dd427de5d 100644
--- a/pkg/sentry/fs/context.go
+++ b/pkg/sentry/fs/context.go
@@ -112,3 +112,27 @@ func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter {
 	}
 	return nil
 }
+
+type rootContext struct {
+	context.Context
+	root *Dirent
+}
+
+// WithRoot returns a copy of ctx with the given root.
+func WithRoot(ctx context.Context, root *Dirent) context.Context {
+	return &rootContext{
+		Context: ctx,
+		root:    root,
+	}
+}
+
+// Value implements Context.Value. For CtxRoot, it takes a reference on the returned root.
+func (rc rootContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxRoot:
+		rc.root.IncRef()
+		return rc.root
+	default:
+		return rc.Context.Value(key)
+	}
+}
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
deleted file mode 100644
index 2c15875f5..000000000
--- a/pkg/sentry/fs/ext/BUILD
+++ /dev/null
@@ -1,54 +0,0 @@
-package(licenses = ["notice"])
-
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
-
-go_library(
-    name = "ext",
-    srcs = [
-        "dentry.go",
-        "ext.go",
-        "filesystem.go",
-        "inode.go",
-        "utils.go",
-    ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext",
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/binary",
-        "//pkg/sentry/context",
-        "//pkg/sentry/fs/ext/disklayout",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "ext_test",
-    size = "small",
-    srcs = [
-        "ext_test.go",
-        "extent_test.go",
-    ],
-    data = [
-        "//pkg/sentry/fs/ext:assets/bigfile.txt",
-        "//pkg/sentry/fs/ext:assets/file.txt",
-        "//pkg/sentry/fs/ext:assets/tiny.ext2",
-        "//pkg/sentry/fs/ext:assets/tiny.ext3",
-        "//pkg/sentry/fs/ext:assets/tiny.ext4",
-    ],
-    embed = [":ext"],
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/binary",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/fs/ext/disklayout",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/vfs",
-        "//runsc/test/testutil",
-        "@com_github_google_go-cmp//cmp:go_default_library",
-        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
-    ],
-)
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
deleted file mode 100644
index 10e235fb1..000000000
--- a/pkg/sentry/fs/ext/ext.go
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package ext implements readonly ext(2/3/4) filesystems.
-package ext
-
-import (
-	"errors"
-	"fmt"
-	"io"
-	"os"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// filesystemType implements vfs.FilesystemType.
-type filesystemType struct{}
-
-// Compiles only if filesystemType implements vfs.FilesystemType.
-var _ vfs.FilesystemType = (*filesystemType)(nil)
-
-// getDeviceFd returns the read seeker to the underlying device.
-// Currently there are two ways of mounting an ext(2/3/4) fs:
-//   1. Specify a mount with our internal special MountType in the OCI spec.
-//   2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
-	if opts.InternalData == nil {
-		// User mount call.
-		// TODO(b/134676337): Open the device specified by `source` and return that.
-		panic("unimplemented")
-	}
-
-	// NewFilesystem call originated from within the sentry.
-	fd, ok := opts.InternalData.(uintptr)
-	if !ok {
-		return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
-	}
-
-	// We do not close this file because that would close the underlying device
-	// file descriptor (which is required for reading the fs from disk).
-	// TODO(b/134676337): Use pkg/fd instead.
-	deviceFile := os.NewFile(fd, source)
-	if deviceFile == nil {
-		return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
-	}
-
-	return deviceFile, nil
-}
-
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	dev, err := getDeviceFd(source, opts)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
-	fs.vfsfs.Init(&fs)
-	fs.sb, err = readSuperBlock(dev)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
-		// mount(2) specifies that EINVAL should be returned if the superblock is
-		// invalid.
-		return nil, nil, syserror.EINVAL
-	}
-
-	fs.bgs, err = readBlockGroups(dev, fs.sb)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	rootInode, err := fs.getOrCreateInode(disklayout.RootDirInode)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
-}
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
deleted file mode 100644
index ee7f7907c..000000000
--- a/pkg/sentry/fs/ext/ext_test.go
+++ /dev/null
@@ -1,407 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
-	"fmt"
-	"os"
-	"path"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-const (
-	assetsDir = "pkg/sentry/fs/ext/assets"
-)
-
-var (
-	ext2ImagePath = path.Join(assetsDir, "tiny.ext2")
-	ext3ImagePath = path.Join(assetsDir, "tiny.ext3")
-	ext4ImagePath = path.Join(assetsDir, "tiny.ext4")
-)
-
-func beginning(_ uint64) uint64 {
-	return 0
-}
-
-func middle(i uint64) uint64 {
-	return i / 2
-}
-
-func end(i uint64) uint64 {
-	return i
-}
-
-// setUp opens imagePath as an ext Filesystem and returns all necessary
-// elements required to run tests. If error is non-nil, it also returns a tear
-// down function which must be called after the test is run for clean up.
-func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) {
-	localImagePath, err := testutil.FindFile(imagePath)
-	if err != nil {
-		return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
-	}
-
-	f, err := os.Open(localImagePath)
-	if err != nil {
-		return nil, nil, nil, nil, err
-	}
-
-	// Mount the ext4 fs and retrieve the inode structure for the file.
-	mockCtx := contexttest.Context(t)
-	fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
-	if err != nil {
-		f.Close()
-		return nil, nil, nil, nil, err
-	}
-
-	tearDown := func() {
-		if err := f.Close(); err != nil {
-			t.Fatalf("tearDown failed: %v", err)
-		}
-	}
-	return mockCtx, fs, d, tearDown, nil
-}
-
-// TestRootDir tests that the root directory inode is correctly initialized and
-// returned from setUp.
-func TestRootDir(t *testing.T) {
-	type inodeProps struct {
-		Mode      linux.FileMode
-		UID       auth.KUID
-		GID       auth.KGID
-		Size      uint64
-		InodeSize uint16
-		Links     uint16
-		Flags     disklayout.InodeFlags
-	}
-
-	type rootDirTest struct {
-		name      string
-		image     string
-		wantInode inodeProps
-	}
-
-	tests := []rootDirTest{
-		{
-			name:  "ext4 root dir",
-			image: ext4ImagePath,
-			wantInode: inodeProps{
-				Mode:      linux.ModeDirectory | 0755,
-				Size:      0x400,
-				InodeSize: 0x80,
-				Links:     3,
-				Flags:     disklayout.InodeFlags{Extents: true},
-			},
-		},
-		{
-			name:  "ext3 root dir",
-			image: ext3ImagePath,
-			wantInode: inodeProps{
-				Mode:      linux.ModeDirectory | 0755,
-				Size:      0x400,
-				InodeSize: 0x80,
-				Links:     3,
-			},
-		},
-		{
-			name:  "ext2 root dir",
-			image: ext2ImagePath,
-			wantInode: inodeProps{
-				Mode:      linux.ModeDirectory | 0755,
-				Size:      0x400,
-				InodeSize: 0x80,
-				Links:     3,
-			},
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			_, _, vfsd, tearDown, err := setUp(t, test.image)
-			if err != nil {
-				t.Fatalf("setUp failed: %v", err)
-			}
-			defer tearDown()
-
-			d, ok := vfsd.Impl().(*dentry)
-			if !ok {
-				t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl())
-			}
-
-			// Offload inode contents into local structs for comparison.
-			gotInode := inodeProps{
-				Mode:      d.inode.diskInode.Mode(),
-				UID:       d.inode.diskInode.UID(),
-				GID:       d.inode.diskInode.GID(),
-				Size:      d.inode.diskInode.Size(),
-				InodeSize: d.inode.diskInode.InodeSize(),
-				Links:     d.inode.diskInode.LinksCount(),
-				Flags:     d.inode.diskInode.Flags(),
-			}
-
-			if diff := cmp.Diff(gotInode, test.wantInode); diff != "" {
-				t.Errorf("inode mismatch (-want +got):\n%s", diff)
-			}
-		})
-	}
-}
-
-// TestFilesystemInit tests that the filesystem superblock and block group
-// descriptors are correctly read in and initialized.
-func TestFilesystemInit(t *testing.T) {
-	// sb only contains the immutable properties of the superblock.
-	type sb struct {
-		InodesCount      uint32
-		BlocksCount      uint64
-		MaxMountCount    uint16
-		FirstDataBlock   uint32
-		BlockSize        uint64
-		BlocksPerGroup   uint32
-		ClusterSize      uint64
-		ClustersPerGroup uint32
-		InodeSize        uint16
-		InodesPerGroup   uint32
-		BgDescSize       uint16
-		Magic            uint16
-		Revision         disklayout.SbRevision
-		CompatFeatures   disklayout.CompatFeatures
-		IncompatFeatures disklayout.IncompatFeatures
-		RoCompatFeatures disklayout.RoCompatFeatures
-	}
-
-	// bg only contains the immutable properties of the block group descriptor.
-	type bg struct {
-		InodeTable      uint64
-		BlockBitmap     uint64
-		InodeBitmap     uint64
-		ExclusionBitmap uint64
-		Flags           disklayout.BGFlags
-	}
-
-	type fsInitTest struct {
-		name    string
-		image   string
-		wantSb  sb
-		wantBgs []bg
-	}
-
-	tests := []fsInitTest{
-		{
-			name:  "ext4 filesystem init",
-			image: ext4ImagePath,
-			wantSb: sb{
-				InodesCount:      0x10,
-				BlocksCount:      0x40,
-				MaxMountCount:    0xffff,
-				FirstDataBlock:   0x1,
-				BlockSize:        0x400,
-				BlocksPerGroup:   0x2000,
-				ClusterSize:      0x400,
-				ClustersPerGroup: 0x2000,
-				InodeSize:        0x80,
-				InodesPerGroup:   0x10,
-				BgDescSize:       0x40,
-				Magic:            linux.EXT_SUPER_MAGIC,
-				Revision:         disklayout.DynamicRev,
-				CompatFeatures: disklayout.CompatFeatures{
-					ExtAttr:     true,
-					ResizeInode: true,
-					DirIndex:    true,
-				},
-				IncompatFeatures: disklayout.IncompatFeatures{
-					DirentFileType: true,
-					Extents:        true,
-					Is64Bit:        true,
-					FlexBg:         true,
-				},
-				RoCompatFeatures: disklayout.RoCompatFeatures{
-					Sparse:       true,
-					LargeFile:    true,
-					HugeFile:     true,
-					DirNlink:     true,
-					ExtraIsize:   true,
-					MetadataCsum: true,
-				},
-			},
-			wantBgs: []bg{
-				{
-					InodeTable:  0x23,
-					BlockBitmap: 0x3,
-					InodeBitmap: 0x13,
-					Flags: disklayout.BGFlags{
-						InodeZeroed: true,
-					},
-				},
-			},
-		},
-		{
-			name:  "ext3 filesystem init",
-			image: ext3ImagePath,
-			wantSb: sb{
-				InodesCount:      0x10,
-				BlocksCount:      0x40,
-				MaxMountCount:    0xffff,
-				FirstDataBlock:   0x1,
-				BlockSize:        0x400,
-				BlocksPerGroup:   0x2000,
-				ClusterSize:      0x400,
-				ClustersPerGroup: 0x2000,
-				InodeSize:        0x80,
-				InodesPerGroup:   0x10,
-				BgDescSize:       0x20,
-				Magic:            linux.EXT_SUPER_MAGIC,
-				Revision:         disklayout.DynamicRev,
-				CompatFeatures: disklayout.CompatFeatures{
-					ExtAttr:     true,
-					ResizeInode: true,
-					DirIndex:    true,
-				},
-				IncompatFeatures: disklayout.IncompatFeatures{
-					DirentFileType: true,
-				},
-				RoCompatFeatures: disklayout.RoCompatFeatures{
-					Sparse:    true,
-					LargeFile: true,
-				},
-			},
-			wantBgs: []bg{
-				{
-					InodeTable:  0x5,
-					BlockBitmap: 0x3,
-					InodeBitmap: 0x4,
-					Flags: disklayout.BGFlags{
-						InodeZeroed: true,
-					},
-				},
-			},
-		},
-		{
-			name:  "ext2 filesystem init",
-			image: ext2ImagePath,
-			wantSb: sb{
-				InodesCount:      0x10,
-				BlocksCount:      0x40,
-				MaxMountCount:    0xffff,
-				FirstDataBlock:   0x1,
-				BlockSize:        0x400,
-				BlocksPerGroup:   0x2000,
-				ClusterSize:      0x400,
-				ClustersPerGroup: 0x2000,
-				InodeSize:        0x80,
-				InodesPerGroup:   0x10,
-				BgDescSize:       0x20,
-				Magic:            linux.EXT_SUPER_MAGIC,
-				Revision:         disklayout.DynamicRev,
-				CompatFeatures: disklayout.CompatFeatures{
-					ExtAttr:     true,
-					ResizeInode: true,
-					DirIndex:    true,
-				},
-				IncompatFeatures: disklayout.IncompatFeatures{
-					DirentFileType: true,
-				},
-				RoCompatFeatures: disklayout.RoCompatFeatures{
-					Sparse:    true,
-					LargeFile: true,
-				},
-			},
-			wantBgs: []bg{
-				{
-					InodeTable:  0x5,
-					BlockBitmap: 0x3,
-					InodeBitmap: 0x4,
-					Flags: disklayout.BGFlags{
-						InodeZeroed: true,
-					},
-				},
-			},
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			_, vfsfs, _, tearDown, err := setUp(t, test.image)
-			if err != nil {
-				t.Fatalf("setUp failed: %v", err)
-			}
-			defer tearDown()
-
-			fs, ok := vfsfs.Impl().(*filesystem)
-			if !ok {
-				t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl())
-			}
-
-			// Offload superblock and block group descriptors contents into
-			// local structs for comparison.
-			totalFreeInodes := uint32(0)
-			totalFreeBlocks := uint64(0)
-			gotSb := sb{
-				InodesCount:      fs.sb.InodesCount(),
-				BlocksCount:      fs.sb.BlocksCount(),
-				MaxMountCount:    fs.sb.MaxMountCount(),
-				FirstDataBlock:   fs.sb.FirstDataBlock(),
-				BlockSize:        fs.sb.BlockSize(),
-				BlocksPerGroup:   fs.sb.BlocksPerGroup(),
-				ClusterSize:      fs.sb.ClusterSize(),
-				ClustersPerGroup: fs.sb.ClustersPerGroup(),
-				InodeSize:        fs.sb.InodeSize(),
-				InodesPerGroup:   fs.sb.InodesPerGroup(),
-				BgDescSize:       fs.sb.BgDescSize(),
-				Magic:            fs.sb.Magic(),
-				Revision:         fs.sb.Revision(),
-				CompatFeatures:   fs.sb.CompatibleFeatures(),
-				IncompatFeatures: fs.sb.IncompatibleFeatures(),
-				RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(),
-			}
-			gotNumBgs := len(fs.bgs)
-			gotBgs := make([]bg, gotNumBgs)
-			for i := 0; i < gotNumBgs; i++ {
-				gotBgs[i].InodeTable = fs.bgs[i].InodeTable()
-				gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap()
-				gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap()
-				gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap()
-				gotBgs[i].Flags = fs.bgs[i].Flags()
-
-				totalFreeInodes += fs.bgs[i].FreeInodesCount()
-				totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount())
-			}
-
-			if diff := cmp.Diff(gotSb, test.wantSb); diff != "" {
-				t.Errorf("superblock mismatch (-want +got):\n%s", diff)
-			}
-
-			if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" {
-				t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff)
-			}
-
-			if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" {
-				t.Errorf("total free inodes mismatch (-want +got):\n%s", diff)
-			}
-
-			if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" {
-				t.Errorf("total free blocks mismatch (-want +got):\n%s", diff)
-			}
-		})
-	}
-}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
deleted file mode 100644
index 7150e75a5..000000000
--- a/pkg/sentry/fs/ext/filesystem.go
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
-	"io"
-	"sync"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// filesystem implements vfs.FilesystemImpl.
-type filesystem struct {
-	// TODO(b/134676337): Remove when all methods have been implemented.
-	vfs.FilesystemImpl
-
-	vfsfs vfs.Filesystem
-
-	// mu serializes changes to the Dentry tree and the usage of the read seeker.
-	mu sync.Mutex
-
-	// dev is the ReadSeeker for the underlying fs device. It is protected by mu.
-	//
-	// The ext filesystems aim to maximize locality, i.e. place all the data
-	// blocks of a file close together. On a spinning disk, locality reduces the
-	// amount of movement of the head hence speeding up IO operations. On an SSD
-	// there are no moving parts but locality increases the size of each transer
-	// request. Hence, having mutual exclusion on the read seeker while reading a
-	// file *should* help in achieving the intended performance gains.
-	//
-	// Note: This synchronization was not coupled with the ReadSeeker itself
-	// because we want to synchronize across read/seek operations for the
-	// performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
-	dev io.ReadSeeker
-
-	// inodeCache maps absolute inode numbers to the corresponding Inode struct.
-	// Inodes should be removed from this once their reference count hits 0.
-	//
-	// Protected by mu because every addition and removal from this corresponds to
-	// a change in the dentry tree.
-	inodeCache map[uint32]*inode
-
-	// sb represents the filesystem superblock. Immutable after initialization.
-	sb disklayout.SuperBlock
-
-	// bgs represents all the block group descriptors for the filesystem.
-	// Immutable after initialization.
-	bgs []disklayout.BlockGroup
-}
-
-// Compiles only if filesystem implements vfs.FilesystemImpl.
-var _ vfs.FilesystemImpl = (*filesystem)(nil)
-
-// getOrCreateInode gets the inode corresponding to the inode number passed in.
-// It creates a new one with the given inode number if one does not exist.
-//
-// Preconditions: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) {
-	if in, ok := fs.inodeCache[inodeNum]; ok {
-		return in, nil
-	}
-
-	in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum)
-	if err != nil {
-		return nil, err
-	}
-
-	fs.inodeCache[inodeNum] = in
-	return in, nil
-}
-
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
-}
-
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// This is a readonly filesystem for now.
-	return nil
-}
-
-// The vfs.FilesystemImpl functions below return EROFS because their respective
-// man pages say that EROFS must be returned if the path resolves to a file on
-// a read-only filesystem.
-
-// TODO(b/134676337): Implement path traversal and return EROFS only if the
-// path resolves to a Dentry within ext fs.
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	return syserror.EROFS
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return syserror.EROFS
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	return syserror.EROFS
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	return syserror.EROFS
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
-	return syserror.EROFS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return syserror.EROFS
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	return syserror.EROFS
-}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
deleted file mode 100644
index df1ea0bda..000000000
--- a/pkg/sentry/fs/ext/inode.go
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
-	"io"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// inode represents an ext inode.
-type inode struct {
-	// refs is a reference count. refs is accessed using atomic memory operations.
-	refs int64
-
-	// inodeNum is the inode number of this inode on disk. This is used to
-	// identify inodes within the ext filesystem.
-	inodeNum uint32
-
-	// diskInode gives us access to the inode struct on disk. Immutable.
-	diskInode disklayout.Inode
-
-	// root is the root extent node. This lives in the 60 byte diskInode.Blocks().
-	// Immutable. Nil if the inode does not use extents.
-	root *disklayout.ExtentNode
-}
-
-// incRef increments the inode ref count.
-func (in *inode) incRef() {
-	atomic.AddInt64(&in.refs, 1)
-}
-
-// tryIncRef tries to increment the ref count. Returns true if successful.
-func (in *inode) tryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&in.refs)
-		if refs == 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
-// decRef decrements the inode ref count and releases the inode resources if
-// the ref count hits 0.
-//
-// Preconditions: Must have locked fs.mu.
-func (in *inode) decRef(fs *filesystem) {
-	if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
-		delete(fs.inodeCache, in.inodeNum)
-	} else if refs < 0 {
-		panic("ext.inode.decRef() called without holding a reference")
-	}
-}
-
-// newInode is the inode constructor. Reads the inode off disk. Identifies
-// inodes based on the absolute inode number on disk.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
-	if inodeNum == 0 {
-		panic("inode number 0 on ext filesystems is not possible")
-	}
-
-	in := &inode{refs: 1, inodeNum: inodeNum}
-	inodeRecordSize := sb.InodeSize()
-	if inodeRecordSize == disklayout.OldInodeSize {
-		in.diskInode = &disklayout.InodeOld{}
-	} else {
-		in.diskInode = &disklayout.InodeNew{}
-	}
-
-	// Calculate where the inode is actually placed.
-	inodesPerGrp := sb.InodesPerGroup()
-	blkSize := sb.BlockSize()
-	inodeTableOff := bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize
-	inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp))
-
-	// Read it from disk and figure out which type of inode this is.
-	if err := readFromDisk(dev, int64(inodeOff), in.diskInode); err != nil {
-		return nil, err
-	}
-
-	if in.diskInode.Flags().Extents {
-		in.buildExtTree(dev, blkSize)
-	}
-
-	return in, nil
-}
-
-// getBGNum returns the block group number that a given inode belongs to.
-func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
-	return (inodeNum - 1) / inodesPerGrp
-}
-
-// getBGOff returns the offset at which the given inode lives in the block
-// group's inode table, i.e. the index of the inode in the inode table.
-func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 {
-	return (inodeNum - 1) % inodesPerGrp
-}
-
-// buildExtTree builds the extent tree by reading it from disk by doing
-// running a simple DFS. It first reads the root node from the inode struct in
-// memory. Then it recursively builds the rest of the tree by reading it off
-// disk.
-//
-// Preconditions:
-//   - Must hold the mutex of the filesystem containing dev.
-//   - Inode flag InExtents must be set.
-func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
-	rootNodeData := in.diskInode.Data()
-
-	var rootHeader disklayout.ExtentHeader
-	binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &rootHeader)
-
-	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
-	if rootHeader.NumEntries > 4 {
-		// read(2) specifies that EINVAL should be returned if the file is unsuitable
-		// for reading.
-		return syserror.EINVAL
-	}
-
-	rootEntries := make([]disklayout.ExtentEntryPair, rootHeader.NumEntries)
-	for i, off := uint16(0), disklayout.ExtentStructsSize; i < rootHeader.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
-		var curEntry disklayout.ExtentEntry
-		if rootHeader.Height == 0 {
-			// Leaf node.
-			curEntry = &disklayout.Extent{}
-		} else {
-			// Internal node.
-			curEntry = &disklayout.ExtentIdx{}
-		}
-		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
-		rootEntries[i].Entry = curEntry
-	}
-
-	// If this node is internal, perform DFS.
-	if rootHeader.Height > 0 {
-		for i := uint16(0); i < rootHeader.NumEntries; i++ {
-			var err error
-			if rootEntries[i].Node, err = buildExtTreeFromDisk(dev, rootEntries[i].Entry, blkSize); err != nil {
-				return err
-			}
-		}
-	}
-
-	in.root = &disklayout.ExtentNode{rootHeader, rootEntries}
-	return nil
-}
-
-// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
-// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
-// by the ExtentEntry.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
-	var header disklayout.ExtentHeader
-	off := entry.PhysicalBlock() * blkSize
-	if err := readFromDisk(dev, int64(off), &header); err != nil {
-		return nil, err
-	}
-
-	entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
-	for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
-		var curEntry disklayout.ExtentEntry
-		if header.Height == 0 {
-			// Leaf node.
-			curEntry = &disklayout.Extent{}
-		} else {
-			// Internal node.
-			curEntry = &disklayout.ExtentIdx{}
-		}
-
-		if err := readFromDisk(dev, int64(off), curEntry); err != nil {
-			return nil, err
-		}
-		entries[i].Entry = curEntry
-	}
-
-	// If this node is internal, perform DFS.
-	if header.Height > 0 {
-		for i := uint16(0); i < header.NumEntries; i++ {
-			var err error
-			entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize)
-			if err != nil {
-				return nil, err
-			}
-		}
-	}
-
-	return &disklayout.ExtentNode{header, entries}, nil
-}
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 5a0a67eab..669ffcb75 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -87,7 +87,7 @@ func (p *pipeOperations) init() error {
 		log.Warningf("pipe: cannot stat fd %d: %v", p.file.FD(), err)
 		return syscall.EINVAL
 	}
-	if s.Mode&syscall.S_IFIFO != syscall.S_IFIFO {
+	if (s.Mode & syscall.S_IFMT) != syscall.S_IFIFO {
 		log.Warningf("pipe: cannot load fd %d as pipe, file type: %o", p.file.FD(), s.Mode)
 		return syscall.EINVAL
 	}
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index ed62049a9..20cb9a367 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -66,10 +66,8 @@ type CachingInodeOperations struct {
 	// mfp is used to allocate memory that caches backingFile's contents.
 	mfp pgalloc.MemoryFileProvider
 
-	// forcePageCache indicates the sentry page cache should be used regardless
-	// of whether the platform supports host mapped I/O or not. This must not be
-	// modified after inode creation.
-	forcePageCache bool
+	// opts contains options. opts is immutable.
+	opts CachingInodeOperationsOptions
 
 	attrMu sync.Mutex `state:"nosave"`
 
@@ -116,6 +114,20 @@ type CachingInodeOperations struct {
 	refs frameRefSet
 }
 
+// CachingInodeOperationsOptions configures a CachingInodeOperations.
+//
+// +stateify savable
+type CachingInodeOperationsOptions struct {
+	// If ForcePageCache is true, use the sentry page cache even if a host file
+	// descriptor is available.
+	ForcePageCache bool
+
+	// If LimitHostFDTranslation is true, apply maxFillRange() constraints to
+	// host file descriptor mappings returned by
+	// CachingInodeOperations.Translate().
+	LimitHostFDTranslation bool
+}
+
 // CachedFileObject is a file that may require caching.
 type CachedFileObject interface {
 	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
@@ -159,7 +171,7 @@ type CachedFileObject interface {
 
 // NewCachingInodeOperations returns a new CachingInodeOperations backed by
 // a CachedFileObject and its initial unstable attributes.
-func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations {
+func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
 	if mfp == nil {
 		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
@@ -167,7 +179,7 @@ func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject
 	return &CachingInodeOperations{
 		backingFile:    backingFile,
 		mfp:            mfp,
-		forcePageCache: forcePageCache,
+		opts:           opts,
 		attr:           uattr,
 		hostFileMapper: NewHostFileMapper(),
 	}
@@ -568,21 +580,30 @@ type inodeReadWriter struct {
 
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	mem := rw.c.mfp.MemoryFile()
+	fillCache := !rw.c.useHostPageCache() && mem.ShouldCacheEvictable()
+
 	// Hot path. Avoid defers.
-	rw.c.dataMu.RLock()
+	var unlock func()
+	if fillCache {
+		rw.c.dataMu.Lock()
+		unlock = rw.c.dataMu.Unlock
+	} else {
+		rw.c.dataMu.RLock()
+		unlock = rw.c.dataMu.RUnlock
+	}
 
 	// Compute the range to read.
 	if rw.offset >= rw.c.attr.Size {
-		rw.c.dataMu.RUnlock()
+		unlock()
 		return 0, io.EOF
 	}
 	end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size)
 	if end == rw.offset { // dsts.NumBytes() == 0?
-		rw.c.dataMu.RUnlock()
+		unlock()
 		return 0, nil
 	}
 
-	mem := rw.c.mfp.MemoryFile()
 	var done uint64
 	seg, gap := rw.c.cache.Find(uint64(rw.offset))
 	for rw.offset < end {
@@ -592,7 +613,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 			// Get internal mappings from the cache.
 			ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
 			if err != nil {
-				rw.c.dataMu.RUnlock()
+				unlock()
 				return done, err
 			}
 
@@ -602,7 +623,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 			rw.offset += int64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.c.dataMu.RUnlock()
+				unlock()
 				return done, err
 			}
 
@@ -610,27 +631,48 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 			seg, gap = seg.NextNonEmpty()
 
 		case gap.Ok():
-			// Read directly from the backing file.
-			gapmr := gap.Range().Intersect(mr)
-			dst := dsts.TakeFirst64(gapmr.Length())
-			n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start)
-			done += n
-			rw.offset += int64(n)
-			dsts = dsts.DropFirst64(n)
-			// Partial reads are fine. But we must stop reading.
-			if n != dst.NumBytes() || err != nil {
-				rw.c.dataMu.RUnlock()
-				return done, err
+			gapMR := gap.Range().Intersect(mr)
+			if fillCache {
+				// Read into the cache, then re-enter the loop to read from the
+				// cache.
+				reqMR := memmap.MappableRange{
+					Start: uint64(usermem.Addr(gapMR.Start).RoundDown()),
+					End:   fs.OffsetPageEnd(int64(gapMR.End)),
+				}
+				optMR := gap.Range()
+				err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
+				mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
+				seg, gap = rw.c.cache.Find(uint64(rw.offset))
+				if !seg.Ok() {
+					unlock()
+					return done, err
+				}
+				// err might have occurred in part of gap.Range() outside
+				// gapMR. Forget about it for now; if the error matters and
+				// persists, we'll run into it again in a later iteration of
+				// this loop.
+			} else {
+				// Read directly from the backing file.
+				dst := dsts.TakeFirst64(gapMR.Length())
+				n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapMR.Start)
+				done += n
+				rw.offset += int64(n)
+				dsts = dsts.DropFirst64(n)
+				// Partial reads are fine. But we must stop reading.
+				if n != dst.NumBytes() || err != nil {
+					unlock()
+					return done, err
+				}
+
+				// Continue.
+				seg, gap = gap.NextSegment(), FileRangeGapIterator{}
 			}
 
-			// Continue.
-			seg, gap = gap.NextSegment(), FileRangeGapIterator{}
-
 		default:
 			break
 		}
 	}
-	rw.c.dataMu.RUnlock()
+	unlock()
 	return done, nil
 }
 
@@ -700,7 +742,10 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
 			seg, gap = seg.NextNonEmpty()
 
 		case gap.Ok() && gap.Start() < mr.End:
-			// Write directly to the backing file.
+			// Write directly to the backing file. At present, we never fill
+			// the cache when writing, since doing so can convert small writes
+			// into inefficient read-modify-write cycles, and we have no
+			// mechanism for detecting or avoiding this.
 			gapmr := gap.Range().Intersect(mr)
 			src := srcs.TakeFirst64(gapmr.Length())
 			n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start)
@@ -730,7 +775,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
 // and memory mappings, and false if c.cache may contain data cached from
 // c.backingFile.
 func (c *CachingInodeOperations) useHostPageCache() bool {
-	return !c.forcePageCache && c.backingFile.FD() >= 0
+	return !c.opts.ForcePageCache && c.backingFile.FD() >= 0
 }
 
 // AddMapping implements memmap.Mappable.AddMapping.
@@ -802,11 +847,15 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp
 func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
 	// Hot path. Avoid defer.
 	if c.useHostPageCache() {
+		mr := optional
+		if c.opts.LimitHostFDTranslation {
+			mr = maxFillRange(required, optional)
+		}
 		return []memmap.Translation{
 			{
-				Source: optional,
+				Source: mr,
 				File:   c,
-				Offset: optional.Start,
+				Offset: mr.Start,
 				Perms:  usermem.AnyAccess,
 			},
 		}, nil
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
index dc19255ed..eb5730c35 100644
--- a/pkg/sentry/fs/fsutil/inode_cached_test.go
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -61,7 +61,7 @@ func TestSetPermissions(t *testing.T) {
 	uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
 		Perms: fs.FilePermsFromMode(0444),
 	})
-	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	perms := fs.FilePermsFromMode(0777)
@@ -150,7 +150,7 @@ func TestSetTimestamps(t *testing.T) {
 				ModificationTime: epoch,
 				StatusChangeTime: epoch,
 			}
-			iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+			iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
 			defer iops.Release()
 
 			if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil {
@@ -188,7 +188,7 @@ func TestTruncate(t *testing.T) {
 	uattr := fs.UnstableAttr{
 		Size: 0,
 	}
-	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	if err := iops.Truncate(ctx, nil, uattr.Size); err != nil {
@@ -280,7 +280,7 @@ func TestRead(t *testing.T) {
 	uattr := fs.UnstableAttr{
 		Size: int64(len(buf)),
 	}
-	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	// Expect the cache to be initially empty.
@@ -336,7 +336,7 @@ func TestWrite(t *testing.T) {
 	uattr := fs.UnstableAttr{
 		Size: int64(len(buf)),
 	}
-	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	// Expect the cache to be initially empty.
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index 69999dc28..8f8ab5d29 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -54,6 +54,10 @@ const (
 	// sandbox using files backed by the gofer. If set to false, unix sockets
 	// cannot be bound to gofer files without an overlay on top.
 	privateUnixSocketKey = "privateunixsocket"
+
+	// If present, sets CachingInodeOperationsOptions.LimitHostFDTranslation to
+	// true.
+	limitHostFDTranslationKey = "limit_host_fd_translation"
 )
 
 // defaultAname is the default attach name.
@@ -134,12 +138,13 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
 
 // opts are parsed 9p mount options.
 type opts struct {
-	fd                int
-	aname             string
-	policy            cachePolicy
-	msize             uint32
-	version           string
-	privateunixsocket bool
+	fd                     int
+	aname                  string
+	policy                 cachePolicy
+	msize                  uint32
+	version                string
+	privateunixsocket      bool
+	limitHostFDTranslation bool
 }
 
 // options parses mount(2) data into structured options.
@@ -237,6 +242,11 @@ func options(data string) (opts, error) {
 		delete(options, privateUnixSocketKey)
 	}
 
+	if _, ok := options[limitHostFDTranslationKey]; ok {
+		o.limitHostFDTranslation = true
+		delete(options, limitHostFDTranslationKey)
+	}
+
 	// Fail to attach if the caller wanted us to do something that we
 	// don't support.
 	if len(options) > 0 {
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 69d08a627..50da865c1 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -117,6 +117,11 @@ type session struct {
 	// Flags provided to the mount.
 	superBlockFlags fs.MountSourceFlags `state:"wait"`
 
+	// limitHostFDTranslation is the value used for
+	// CachingInodeOperationsOptions.LimitHostFDTranslation for all
+	// CachingInodeOperations created by the session.
+	limitHostFDTranslation bool
+
 	// connID is a unique identifier for the session connection.
 	connID string `state:"wait"`
 
@@ -218,8 +223,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p
 
 	uattr := unstable(ctx, valid, attr, s.mounter, s.client)
 	return sattr, &inodeOperations{
-		fileState:       fileState,
-		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, s.superBlockFlags.ForcePageCache),
+		fileState: fileState,
+		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
+			ForcePageCache:         s.superBlockFlags.ForcePageCache,
+			LimitHostFDTranslation: s.limitHostFDTranslation,
+		}),
 	}
 }
 
@@ -242,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 
 	// Construct the session.
 	s := session{
-		connID:          dev,
-		msize:           o.msize,
-		version:         o.version,
-		cachePolicy:     o.policy,
-		aname:           o.aname,
-		superBlockFlags: superBlockFlags,
-		mounter:         mounter,
+		connID:                 dev,
+		msize:                  o.msize,
+		version:                o.version,
+		cachePolicy:            o.policy,
+		aname:                  o.aname,
+		superBlockFlags:        superBlockFlags,
+		limitHostFDTranslation: o.limitHostFDTranslation,
+		mounter:                mounter,
 	}
 	s.EnableLeakCheck("gofer.session")
 
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 679d8321a..894ab01f0 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -200,8 +200,10 @@ func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool,
 	// Build the fs.InodeOperations.
 	uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
 	iops := &inodeOperations{
-		fileState:       fileState,
-		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache),
+		fileState: fileState,
+		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
+			ForcePageCache: msrc.Flags.ForcePageCache,
+		}),
 	}
 
 	// Return the fs.Inode.
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 44c4ee5f2..2392787cb 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -65,7 +65,7 @@ type ConnectedEndpoint struct {
 	// GetSockOpt and message splitting/rejection in SendMsg, but do not
 	// prevent lots of small messages from filling the real send buffer
 	// size on the host.
-	sndbuf int `state:"nosave"`
+	sndbuf int64 `state:"nosave"`
 
 	// mu protects the fields below.
 	mu sync.RWMutex `state:"nosave"`
@@ -107,7 +107,7 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
 	}
 
 	c.stype = linux.SockType(stype)
-	c.sndbuf = sndbuf
+	c.sndbuf = int64(sndbuf)
 
 	return nil
 }
@@ -202,7 +202,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error)
 }
 
 // Send implements transport.ConnectedEndpoint.Send.
-func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
 	c.mu.RLock()
 	defer c.mu.RUnlock()
 
@@ -279,7 +279,7 @@ func (c *ConnectedEndpoint) EventUpdate() {
 }
 
 // Recv implements transport.Receiver.Recv.
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
 	c.mu.RLock()
 	defer c.mu.RUnlock()
 
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
index 05d7c79ad..af6955675 100644
--- a/pkg/sentry/fs/host/socket_iovec.go
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -55,19 +55,19 @@ func copyFromMulti(dst []byte, src [][]byte) {
 //
 // If intermediate != nil, iovecs references intermediate rather than bufs and
 // the caller must copy to/from bufs as necessary.
-func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) {
+func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []syscall.Iovec, intermediate []byte, err error) {
 	var iovsRequired int
 	for _, b := range bufs {
-		length += uintptr(len(b))
+		length += int64(len(b))
 		if len(b) > 0 {
 			iovsRequired++
 		}
 	}
 
 	stopLen := length
-	if length > uintptr(maxlen) {
+	if length > maxlen {
 		if truncate {
-			stopLen = uintptr(maxlen)
+			stopLen = maxlen
 			err = syserror.EAGAIN
 		} else {
 			return 0, nil, nil, syserror.EMSGSIZE
@@ -85,7 +85,7 @@ func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovec
 		}}, b, err
 	}
 
-	var total uintptr
+	var total int64
 	iovecs = make([]syscall.Iovec, 0, iovsRequired)
 	for i := range bufs {
 		l := len(bufs[i])
@@ -93,9 +93,9 @@ func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovec
 			continue
 		}
 
-		stop := l
-		if total+uintptr(stop) > stopLen {
-			stop = int(stopLen - total)
+		stop := int64(l)
+		if total+stop > stopLen {
+			stop = stopLen - total
 		}
 
 		iovecs = append(iovecs, syscall.Iovec{
@@ -103,7 +103,7 @@ func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovec
 			Len:  uint64(stop),
 		})
 
-		total += uintptr(stop)
+		total += stop
 		if total >= stopLen {
 			break
 		}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index e57be0506..f3bbed7ea 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -23,7 +23,7 @@ import (
 //
 // If the total length of bufs is > maxlen, fdReadVec will do a partial read
 // and err will indicate why the message was truncated.
-func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, controlTrunc bool, err error) {
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) {
 	flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
 	if peek {
 		flags |= syscall.MSG_PEEK
@@ -48,11 +48,12 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re
 		msg.Iovlen = uint64(len(iovecs))
 	}
 
-	n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+	rawN, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
 	if e != 0 {
 		// N.B. prioritize the syscall error over the buildIovec error.
 		return 0, 0, 0, false, e
 	}
+	n := int64(rawN)
 
 	// Copy data back to bufs.
 	if intermediate != nil {
@@ -72,7 +73,7 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re
 //
 // If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
 // partial write and err will indicate why the message was truncated.
-func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) {
+func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) {
 	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
 	if err != nil && len(iovecs) == 0 {
 		// No partial write to do, return error immediately.
@@ -96,5 +97,5 @@ func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uint
 		return 0, length, e
 	}
 
-	return n, length, err
+	return int64(n), length, err
 }
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 693ffc760..ac0398bd9 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -171,8 +171,6 @@ type MountNamespace struct {
 // NewMountNamespace returns a new MountNamespace, with the provided node at the
 // root, and the given cache size. A root must always be provided.
 func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
-	creds := auth.CredentialsFromContext(ctx)
-
 	// Set the root dirent and id on the root mount. The reference returned from
 	// NewDirent will be donated to the MountNamespace constructed below.
 	d := NewDirent(ctx, root, "/")
@@ -181,6 +179,7 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error
 		d: newRootMount(1, d),
 	}
 
+	creds := auth.CredentialsFromContext(ctx)
 	mns := MountNamespace{
 		userns:  creds.UserNamespace,
 		root:    d,
@@ -219,6 +218,13 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() {
 		}
 	}
 
+	if mns.root == nil {
+		// No root? This MountNamespace must have already been destroyed.
+		// This can happen when a Save is triggered while a process is
+		// exiting. There is nothing to flush.
+		return
+	}
+
 	// Flush root's MountSource references.
 	mns.root.Inode.MountSource.FlushDirentRefs()
 }
@@ -249,6 +255,10 @@ func (mns *MountNamespace) destroy() {
 	// Drop reference on the root.
 	mns.root.DecRef()
 
+	// Ensure that root cannot be accessed via this MountNamespace any
+	// more.
+	mns.root = nil
+
 	// Wait for asynchronous work (queued by dropping Dirent references
 	// above) to complete before destroying this MountNamespace.
 	AsyncBarrier()
@@ -678,7 +688,7 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s
 	return "", syserror.ENOENT
 }
 
-// GetPath returns the PATH as a slice of strings given the environemnt
+// GetPath returns the PATH as a slice of strings given the environment
 // variables.
 func GetPath(env []string) []string {
 	const prefix = "PATH="
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6b839685b..9adb23608 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -348,7 +348,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 		// Field: local_adddress.
 		var localAddr linux.SockAddrInet
 		if local, _, err := sops.GetSockName(t); err == nil {
-			localAddr = local.(linux.SockAddrInet)
+			localAddr = *local.(*linux.SockAddrInet)
 		}
 		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
 		fmt.Fprintf(&buf, "%08X:%04X ",
@@ -358,7 +358,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 		// Field: rem_address.
 		var remoteAddr linux.SockAddrInet
 		if remote, _, err := sops.GetPeerName(t); err == nil {
-			remoteAddr = remote.(linux.SockAddrInet)
+			remoteAddr = *remote.(*linux.SockAddrInet)
 		}
 		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
 		fmt.Fprintf(&buf, "%08X:%04X ",
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index f3e984c24..78e082b8e 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -53,7 +53,6 @@ type Dir struct {
 	fsutil.InodeGenericChecker `state:"nosave"`
 	fsutil.InodeIsDirAllocate  `state:"nosave"`
 	fsutil.InodeIsDirTruncate  `state:"nosave"`
-	fsutil.InodeNoopRelease    `state:"nosave"`
 	fsutil.InodeNoopWriteOut   `state:"nosave"`
 	fsutil.InodeNotMappable    `state:"nosave"`
 	fsutil.InodeNotSocket      `state:"nosave"`
@@ -84,7 +83,8 @@ type Dir struct {
 
 var _ fs.InodeOperations = (*Dir)(nil)
 
-// NewDir returns a new Dir with the given contents and attributes.
+// NewDir returns a new Dir with the given contents and attributes. A reference
+// on each fs.Inode in the `contents` map will be donated to this Dir.
 func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) *Dir {
 	d := &Dir{
 		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.RAMFS_MAGIC),
@@ -138,7 +138,7 @@ func (d *Dir) addChildLocked(ctx context.Context, name string, inode *fs.Inode)
 	d.NotifyModificationAndStatusChange(ctx)
 }
 
-// AddChild adds a child to this dir.
+// AddChild adds a child to this dir, inheriting its reference.
 func (d *Dir) AddChild(ctx context.Context, name string, inode *fs.Inode) {
 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -172,7 +172,9 @@ func (d *Dir) Children() ([]string, map[string]fs.DentAttr) {
 	return namesCopy, entriesCopy
 }
 
-// removeChildLocked attempts to remove an entry from this directory.
+// removeChildLocked attempts to remove an entry from this directory. It
+// returns the removed fs.Inode along with its reference, which callers are
+// responsible for decrementing.
 func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) {
 	inode, ok := d.children[name]
 	if !ok {
@@ -253,7 +255,8 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err
 	return nil
 }
 
-// Lookup loads an inode at p into a Dirent.
+// Lookup loads an inode at p into a Dirent. It returns the fs.Dirent along
+// with a reference.
 func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) {
 	if len(p) > linux.NAME_MAX {
 		return nil, syserror.ENAMETOOLONG
@@ -408,6 +411,16 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol
 	return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName, replacement)
 }
 
+// Release implements fs.InodeOperations.Release.
+func (d *Dir) Release(_ context.Context) {
+	// Drop references on all children.
+	d.mu.Lock()
+	for _, i := range d.children {
+		i.DecRef()
+	}
+	d.mu.Unlock()
+}
+
 // dirFileOperations implements fs.FileOperations for a ramfs directory.
 //
 // +stateify savable
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 0f4497cd6..159fb7c08 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -56,7 +56,6 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent
 type Dir struct {
 	fsutil.InodeGenericChecker `state:"nosave"`
 	fsutil.InodeIsDirTruncate  `state:"nosave"`
-	fsutil.InodeNoopRelease    `state:"nosave"`
 	fsutil.InodeNoopWriteOut   `state:"nosave"`
 	fsutil.InodeNotMappable    `state:"nosave"`
 	fsutil.InodeNotSocket      `state:"nosave"`
@@ -252,6 +251,11 @@ func (d *Dir) Allocate(ctx context.Context, node *fs.Inode, offset, length int64
 	return d.ramfsDir.Allocate(ctx, node, offset, length)
 }
 
+// Release implements fs.InodeOperations.Release.
+func (d *Dir) Release(ctx context.Context) {
+	d.ramfsDir.Release(ctx)
+}
+
 // Symlink is a symlink.
 //
 // +stateify savable
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5e9327aec..291164986 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -23,6 +23,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 1d128532b..2f639c823 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -129,6 +129,9 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
 
 // Release implements fs.InodeOperations.Release.
 func (d *dirInodeOperations) Release(ctx context.Context) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
 	d.master.DecRef()
 	if len(d.slaves) != 0 {
 		panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 92ec1ca18..19b7557d5 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -172,6 +172,19 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io userme
 		return 0, mf.t.ld.windowSize(ctx, io, args)
 	case linux.TIOCSWINSZ:
 		return 0, mf.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
@@ -185,8 +198,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		linux.TCSETS,
 		linux.TCSETSW,
 		linux.TCSETSF,
-		linux.TIOCGPGRP,
-		linux.TIOCSPGRP,
 		linux.TIOCGWINSZ,
 		linux.TIOCSWINSZ,
 		linux.TIOCSETD,
@@ -200,8 +211,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		linux.TIOCEXCL,
 		linux.TIOCNXCL,
 		linux.TIOCGEXCL,
-		linux.TIOCNOTTY,
-		linux.TIOCSCTTY,
 		linux.TIOCGSID,
 		linux.TIOCGETD,
 		linux.TIOCVHANGUP,
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index e30266404..944c4ada1 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -152,9 +152,16 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		// TODO(b/129283598): Implement once we have support for job
-		// control.
-		return 0, nil
+		return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index b7cecb2ed..ff8138820 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,7 +17,10 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
 // Terminal is a pseudoterminal.
@@ -26,23 +29,100 @@ import (
 type Terminal struct {
 	refs.AtomicRefCount
 
-	// n is the terminal index.
+	// n is the terminal index. It is immutable.
 	n uint32
 
-	// d is the containing directory.
+	// d is the containing directory. It is immutable.
 	d *dirInodeOperations
 
-	// ld is the line discipline of the terminal.
+	// ld is the line discipline of the terminal. It is immutable.
 	ld *lineDiscipline
+
+	// masterKTTY contains the controlling process of the master end of
+	// this terminal. This field is immutable.
+	masterKTTY *kernel.TTY
+
+	// slaveKTTY contains the controlling process of the slave end of this
+	// terminal. This field is immutable.
+	slaveKTTY *kernel.TTY
 }
 
 func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
 	termios := linux.DefaultSlaveTermios
 	t := Terminal{
-		d:  d,
-		n:  n,
-		ld: newLineDiscipline(termios),
+		d:          d,
+		n:          n,
+		ld:         newLineDiscipline(termios),
+		masterKTTY: &kernel.TTY{},
+		slaveKTTY:  &kernel.TTY{},
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
 }
+
+// setControllingTTY installs the selected end of tm (master if isMaster, else
+// slave) as the controlling terminal of the caller's thread group.
+func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	caller := kernel.TaskFromContext(ctx)
+	if caller == nil {
+		panic("setControllingTTY must be called from a task context")
+	}
+	tg := caller.ThreadGroup()
+	return tg.SetControllingTTY(tm.tty(isMaster), args[2].Int())
+}
+
+// releaseControllingTTY detaches the selected end of tm (master if isMaster,
+// else slave) as the controlling terminal of the caller's thread group.
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	caller := kernel.TaskFromContext(ctx)
+	if caller == nil {
+		panic("releaseControllingTTY must be called from a task context")
+	}
+	tg := caller.ThreadGroup()
+	return tg.ReleaseControllingTTY(tm.tty(isMaster))
+}
+
+// foregroundProcessGroup looks up the foreground process group of the selected
+// end of tm and copies its ID out to the user pointer in args[2].
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	caller := kernel.TaskFromContext(ctx)
+	if caller == nil {
+		panic("foregroundProcessGroup must be called from a task context")
+	}
+
+	pgid, err := caller.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
+	if err != nil {
+		return 0, err
+	}
+
+	// The result is delivered through *arg as a 32-bit integer.
+	opts := usermem.IOOpts{AddressSpaceActive: true}
+	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(pgid), opts)
+	return 0, err
+}
+
+// setForegroundProcessGroup sets tm's foreground process group.
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setForegroundProcessGroup must be called from a task context")
+	}
+
+	// Read in the process group ID from the user pointer in args[2].
+	var pgid int32
+	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		return 0, err
+	}
+
+	ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
+	return uintptr(ret), err
+}
+
+func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
+	if !isMaster {
+		return tm.slaveKTTY // Slave end.
+	}
+	return tm.masterKTTY // Master end.
+}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
new file mode 100644
index 000000000..a41101339
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -0,0 +1,86 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+go_template_instance(
+    name = "dirent_list",
+    out = "dirent_list.go",
+    package = "ext",
+    prefix = "dirent",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dirent",
+        "Linker": "*dirent",
+    },
+)
+
+go_library(
+    name = "ext",
+    srcs = [
+        "block_map_file.go",
+        "dentry.go",
+        "directory.go",
+        "dirent_list.go",
+        "ext.go",
+        "extent_file.go",
+        "file_description.go",
+        "filesystem.go",
+        "inode.go",
+        "regular_file.go",
+        "symlink.go",
+        "utils.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/fd",
+        "//pkg/log",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fsimpl/ext/disklayout",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/safemem",
+        "//pkg/sentry/syscalls/linux",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "ext_test",
+    size = "small",
+    srcs = [
+        "block_map_test.go",
+        "ext_test.go",
+        "extent_test.go",
+    ],
+    data = [
+        "//pkg/sentry/fsimpl/ext:assets/bigfile.txt",
+        "//pkg/sentry/fsimpl/ext:assets/file.txt",
+        "//pkg/sentry/fsimpl/ext:assets/tiny.ext2",
+        "//pkg/sentry/fsimpl/ext:assets/tiny.ext3",
+        "//pkg/sentry/fsimpl/ext:assets/tiny.ext4",
+    ],
+    embed = [":ext"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fsimpl/ext/disklayout",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//runsc/test/testutil",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/ext/README.md b/pkg/sentry/fsimpl/ext/README.md
new file mode 100644
index 000000000..af00cfda8
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/README.md
@@ -0,0 +1,117 @@
+## EXT(2/3/4) File System
+
+This is a filesystem driver which supports ext2, ext3 and ext4 filesystems.
+Linux has specialized drivers for each variant but none which supports all. This
+library takes advantage of ext's backward compatibility and understands the
+internal organization of on-disk structures to support all variants.
+
+This driver implementation diverges from the Linux implementations in being more
+forgiving about versioning. For instance, if a filesystem contains both extent
+based inodes and classical block map based inodes, this driver will not complain
+and interpret them both correctly, whereas in Linux this would be an issue. This
+blurs the line between the three ext fs variants.
+
+Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has
+been superseded by ext4 by large performance gains. Thus it is recommended to
+upgrade older filesystem images to ext4 using e2fsprogs for better performance.
+
+### Read Only
+
+This driver currently only allows read only operations. A lot of the design
+decisions are based on this feature. There are plans to implement write (the
+process for which is documented in the future work section).
+
+### Performance
+
+One of the biggest wins about this driver is that it directly talks to the
+underlying block device (or whatever persistent storage is being used), instead
+of making expensive RPCs to a gofer.
+
+Another advantage is that ext fs supports fast concurrent reads. Currently the
+device is represented using an `io.ReaderAt` which allows for concurrent reads.
+All reads are directly passed to the device driver which intelligently serves
+the read requests in the optimal order. There is no congestion due to locking
+while reading in the filesystem level.
+
+Reads are optimized further in the way file data is transferred over to user
+memory. Ext fs directly copies over file data from disk into user memory with no
+additional allocations on the way. We can only get faster by preloading file
+data into memory (see future work section).
+
+The internal structures used to represent files, inodes and file descriptors use
+a lot of inheritance. With the level of indirection that an interface adds with
+an internal pointer, it can quickly fragment a structure across memory. As this
+runs alongside a full-blown kernel (which is memory intensive), having a
+fragmented struct might hurt performance. Hence these internal structures,
+though interfaced, are tightly packed in memory using the same inheritance
+pattern that pkg/sentry/vfs uses. The pkg/sentry/fsimpl/ext/disklayout package
+makes an exception to this pattern for reasons documented in the package.
+
+### Security
+
+This driver also intends to help sandbox the container better by reducing the
+surface of the host kernel that the application touches. It prevents the
+application from exploiting vulnerabilities in the host filesystem driver. All
+`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` which are directly
+passed to the device driver in the kernel. Hence this reduces the surface for
+attack.
+
+The application can not affect any host filesystems other than the one passed
+via block device by the user.
+
+### Future Work
+
+#### Write
+
+To support write operations we would need to modify the block device underneath.
+Currently, the driver does not modify the device at all, not even for updating
+the access times for reads. Modifying the filesystem incorrectly can corrupt it
+and render it unreadable for other correct ext(x) drivers. Hence caution must be
+maintained while modifying metadata structures.
+
+Ext4 specifically is built for performance and has added a lot of complexity as
+to how metadata structures are modified. For instance, files that are organized
+via an extent tree which must be balanced and file data blocks must be placed in
+the same extent as much as possible to increase locality. Such properties must
+be maintained while modifying the tree.
+
+Ext filesystems boast a lot about locality, which plays a big role in them being
+performant. The block allocation algorithm in Linux does a good job in keeping
+related data together. This behavior must be maintained as much as possible,
+else we might end up degrading the filesystem performance over time.
+
+Ext4 also supports a wide variety of features which are specialized for varying
+use cases. Implementing all of them can get difficult very quickly.
+
+Ext(x) checksums all its metadata structures to check for corruption, so
+modification of any metadata struct must correspond with re-checksumming the
+struct. Linux filesystem drivers also order on-disk updates intelligently to not
+corrupt the filesystem and also remain performant. The in-memory metadata
+structures must be kept in sync with what is on disk.
+
+There is also replication of some important structures across the filesystem.
+All replicas must be updated when their original copy is updated. There is also
+provisioning for snapshotting which must be kept in mind, although it should not
+affect this implementation unless we allow users to create filesystem snapshots.
+
+Ext4 also introduced journaling (jbd2). The journal must be updated
+appropriately.
+
+#### Performance
+
+To improve performance we should implement a buffer cache, and optionally, read
+ahead for small files. While doing so we must also keep in mind the memory usage
+and have a reasonable cap on how much file data we want to hold in memory.
+
+#### Features
+
+Our current implementation will work with most ext4 filesystems for read-only
+purposes. However, the following features are not supported yet:
+
+-   Journal
+-   Snapshotting
+-   Extended Attributes
+-   Hash Tree Directories
+-   Meta Block Groups
+-   Multiple Mount Protection
+-   Bigalloc
diff --git a/pkg/sentry/fs/ext/assets/README.md b/pkg/sentry/fsimpl/ext/assets/README.md
index 6f1e81b3a..6f1e81b3a 100644
--- a/pkg/sentry/fs/ext/assets/README.md
+++ b/pkg/sentry/fsimpl/ext/assets/README.md
diff --git a/pkg/sentry/fs/ext/assets/bigfile.txt b/pkg/sentry/fsimpl/ext/assets/bigfile.txt
index 3857cf516..3857cf516 100644
--- a/pkg/sentry/fs/ext/assets/bigfile.txt
+++ b/pkg/sentry/fsimpl/ext/assets/bigfile.txt
diff --git a/pkg/sentry/fs/ext/assets/file.txt b/pkg/sentry/fsimpl/ext/assets/file.txt
index 980a0d5f1..980a0d5f1 100644
--- a/pkg/sentry/fs/ext/assets/file.txt
+++ b/pkg/sentry/fsimpl/ext/assets/file.txt
diff --git a/pkg/sentry/fs/ext/assets/symlink.txt b/pkg/sentry/fsimpl/ext/assets/symlink.txt
index 4c330738c..4c330738c 120000
--- a/pkg/sentry/fs/ext/assets/symlink.txt
+++ b/pkg/sentry/fsimpl/ext/assets/symlink.txt
diff --git a/pkg/sentry/fs/ext/assets/tiny.ext2 b/pkg/sentry/fsimpl/ext/assets/tiny.ext2
index 381ade9bf..381ade9bf 100644
--- a/pkg/sentry/fs/ext/assets/tiny.ext2
+++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext2
diff --git a/pkg/sentry/fs/ext/assets/tiny.ext3 b/pkg/sentry/fsimpl/ext/assets/tiny.ext3
index 0e97a324c..0e97a324c 100644
--- a/pkg/sentry/fs/ext/assets/tiny.ext3
+++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext3
diff --git a/pkg/sentry/fs/ext/assets/tiny.ext4 b/pkg/sentry/fsimpl/ext/assets/tiny.ext4
index a6859736d..a6859736d 100644
--- a/pkg/sentry/fs/ext/assets/tiny.ext4
+++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext4
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
new file mode 100644
index 000000000..9fddb4c4c
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_stateify:defs.bzl", "go_test")
+
+package(licenses = ["notice"])
+
+go_test(
+    name = "benchmark_test",
+    size = "small",
+    srcs = ["benchmark_test.go"],
+    deps = [
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fsimpl/ext",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
new file mode 100644
index 000000000..10a8083a0
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -0,0 +1,193 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// These benchmarks emulate memfs benchmarks. Ext4 images must be created
+// before this benchmark is run using the `make_deep_ext4.sh` script at
+// /tmp/image-{depth}.ext4 for all the depths tested below.
+package benchmark_test
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const filename = "file.txt"
+
+// setUp opens imagePath as an ext Filesystem and returns all necessary
+// elements required to run tests. If error is nil, it also returns a tear
+// down function which must be called after the test is run for clean up.
+func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
+	f, err := os.Open(imagePath)
+	if err != nil {
+		return nil, nil, nil, nil, err
+	}
+
+	ctx := contexttest.Context(b)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create VFS.
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	if err != nil {
+		f.Close()
+		return nil, nil, nil, nil, err
+	}
+
+	root := mntns.Root()
+
+	tearDown := func() {
+		root.DecRef()
+
+		if err := f.Close(); err != nil {
+			b.Fatalf("tearDown failed: %v", err)
+		}
+	}
+	return ctx, vfsObj, &root, tearDown, nil
+}
+
+// mount mounts the extfs image at imagePath on the location named by pop. It
+// returns a tear down function which must be called after the test is run.
+func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vfs.PathOperation) func() {
+	b.Helper()
+
+	f, err := os.Open(imagePath)
+	if err != nil {
+		b.Fatalf("could not open image at %s: %v", imagePath, err)
+	}
+
+	ctx := contexttest.Context(b)
+	creds := auth.CredentialsFromContext(ctx)
+
+	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+		b.Fatalf("failed to mount extfs submount: %v", err)
+	}
+	return func() {
+		if err := f.Close(); err != nil {
+			b.Fatalf("tearDown failed: %v", err)
+		}
+	}
+}
+
+// BenchmarkVFS2Ext4fsStat emulates BenchmarkVFS2MemfsStat.
+func BenchmarkVFS2Ext4fsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", depth))
+			if err != nil {
+				b.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			creds := auth.CredentialsFromContext(ctx)
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+			for i := 1; i <= depth; i++ {
+				filePathBuilder.WriteString(fmt.Sprintf("%d", i))
+				filePathBuilder.WriteByte('/')
+			}
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               *root,
+					Start:              *root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Size > 0 {
+					b.Fatalf("got wrong file size (%d)", stat.Size)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2ExtfsMountStat emulates BenchmarkVFS2MemfsMountStat.
+func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			// Create root extfs with depth 1 so we can mount extfs again at /1/.
+			ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", 1))
+			if err != nil {
+				b.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			creds := auth.CredentialsFromContext(ctx)
+			mountPointName := "/1/"
+			pop := vfs.PathOperation{
+				Root:     *root,
+				Start:    *root,
+				Pathname: mountPointName,
+			}
+
+			// Save the mount point for later use.
+			mountPoint, err := vfsfs.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+
+			// Create extfs submount.
+			mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop)
+			defer mountTearDown()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteString(mountPointName)
+			for i := 1; i <= depth; i++ {
+				filePathBuilder.WriteString(fmt.Sprintf("%d", i))
+				filePathBuilder.WriteByte('/')
+			}
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               *root,
+					Start:              *root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check. touch(1) always creates files of size 0 (empty).
+				if stat.Size > 0 {
+					b.Fatalf("got wrong file size (%d)", stat.Size)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh
new file mode 100755
index 000000000..d0910da1f
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script creates an ext4 image with $1 depth of directories and a file in
+# the inner most directory. The created file is at path /1/2/.../depth/file.txt.
+# The ext4 image is written to $2. The image is temporarily mounted at
+# /tmp/mountpoint. This script must be run with sudo privileges.
+
+# Usage:
+# sudo bash make_deep_ext4.sh {depth} {output path}
+
+# Check positional arguments.
+if [ "$#" -ne 2 ]; then
+    echo "Usage: sudo bash make_deep_ext4.sh {depth} {output path}"
+    exit 1
+fi
+
+# Make sure depth is a non-negative number.
+if ! [[ "$1" =~ ^[0-9]+$ ]]; then
+        echo "Depth must be a non-negative number."
+        exit 1
+fi
+
+# Create a 1 MB filesystem image at the requested output path.
+rm -f "$2"
+fallocate -l 1M "$2"
+if [ $? -ne 0 ]; then
+    echo "fallocate failed"
+    exit 1  # Not "$?": after echo it would be echo's status (0), hiding the failure.
+fi
+
+# Convert that blank into an ext4 image.
+mkfs.ext4 -j "$2"
+if [ $? -ne 0 ]; then
+    echo "mkfs.ext4 failed"
+    exit 1
+fi
+
+# Mount the image.
+MOUNTPOINT=/tmp/mountpoint
+mkdir -p $MOUNTPOINT
+mount -o loop "$2" $MOUNTPOINT
+if [ $? -ne 0 ]; then
+    echo "mount failed"
+    exit 1
+fi
+
+# Create nested directories and the file.
+if [ "$1" -eq 0 ]; then
+   FILEPATH=$MOUNTPOINT/file.txt
+else
+   FILEPATH=$MOUNTPOINT/$(seq -s '/' 1 $1)/file.txt
+fi
+mkdir -p $(dirname $FILEPATH) || exit
+touch $FILEPATH
+
+# Clean up.
+umount $MOUNTPOINT
+rm -rf $MOUNTPOINT
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
new file mode 100644
index 000000000..cea89bcd9
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -0,0 +1,200 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"io"
+	"math"
+
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	// numDirectBlks is the number of direct blocks in ext block map inodes.
+	numDirectBlks = 12
+)
+
+// blockMapFile is a type of regular file which uses direct/indirect block
+// addressing to store file data. This was deprecated in ext4.
+type blockMapFile struct {
+	regFile regularFile
+
+	// directBlks are the direct block numbers. The physical blocks pointed to
+	// by these hold file data. Contains file blocks 0 to 11.
+	directBlks [numDirectBlks]uint32
+
+	// indirectBlk is the physical block which contains (blkSize/4) direct block
+	// numbers (as uint32 integers).
+	indirectBlk uint32
+
+	// doubleIndirectBlk is the physical block which contains (blkSize/4) indirect
+	// block numbers (as uint32 integers).
+	doubleIndirectBlk uint32
+
+	// tripleIndirectBlk is the physical block which contains (blkSize/4) doubly
+	// indirect block numbers (as uint32 integers).
+	tripleIndirectBlk uint32
+
+	// coverage at (i)th index indicates the amount of file data a node at
+	// height (i) covers. Height 0 is the direct block.
+	coverage [4]uint64
+}
+
+// Compiles only if blockMapFile implements io.ReaderAt.
+var _ io.ReaderAt = (*blockMapFile)(nil)
+
+// newBlockMapFile constructs a blockMapFile from regFile, decoding the direct
+// and (singly/doubly/triply) indirect block numbers from the inode's data.
+func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
+	bmf := &blockMapFile{regFile: regFile}
+	bmf.regFile.impl = bmf
+
+	for height := range bmf.coverage {
+		bmf.coverage[height] = getCoverage(regFile.inode.blkSize, uint(height))
+	}
+
+	data := regFile.inode.diskInode.Data()
+	binary.Unmarshal(data[:numDirectBlks*4], binary.LittleEndian, &bmf.directBlks)
+	binary.Unmarshal(data[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &bmf.indirectBlk)
+	binary.Unmarshal(data[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &bmf.doubleIndirectBlk)
+	binary.Unmarshal(data[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &bmf.tripleIndirectBlk)
+	return bmf, nil
+}
+
+// ReadAt implements io.ReaderAt.ReadAt. Reads are clamped to the inode's size;
+// io.EOF is returned when fewer than len(dst) bytes are available from off.
+func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
+	if len(dst) == 0 {
+		return 0, nil
+	}
+	if off < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	offset := uint64(off)
+	size := f.regFile.inode.diskInode.Size()
+	if offset >= size {
+		return 0, io.EOF
+	}
+
+	// dirBlksEnd is the file offset until which direct blocks cover file data.
+	// Direct blocks cover 0 <= file offset < dirBlksEnd.
+	dirBlksEnd := numDirectBlks * f.coverage[0]
+
+	// indirBlkEnd is the file offset until which the indirect block covers file
+	// data. The indirect block covers dirBlksEnd <= file offset < indirBlkEnd.
+	indirBlkEnd := dirBlksEnd + f.coverage[1]
+
+	// doubIndirBlkEnd is the file offset until which the double indirect block
+	// covers file data. The double indirect block covers the range
+	// indirBlkEnd <= file offset < doubIndirBlkEnd.
+	doubIndirBlkEnd := indirBlkEnd + f.coverage[2]
+
+	read := 0
+	toRead := len(dst)
+	if uint64(toRead)+offset > size {
+		toRead = int(size - offset)
+	}
+	for read < toRead {
+		var err error
+		var curR int
+		// Figure out which block to delegate the read to. Only dst[read:toRead]
+		// is handed down so that no bytes past the end of the file are read.
+		switch {
+		case offset < dirBlksEnd:
+			// Direct block.
+			curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:toRead])
+		case offset < indirBlkEnd:
+			// Indirect block.
+			curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:toRead])
+		case offset < doubIndirBlkEnd:
+			// Doubly indirect block.
+			curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:toRead])
+		default:
+			// Triply indirect block.
+			curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:toRead])
+		}
+
+		read += curR
+		offset += uint64(curR)
+		if err != nil {
+			return read, err
+		}
+	}
+
+	if read < len(dst) {
+		return read, io.EOF
+	}
+	return read, nil
+}
+
+// read is the recursive step of the ReadAt function. It relies on knowing the
+// current node's location on disk (curPhyBlk) and its height in the block map
+// tree. A height of 0 shows that the current node is actually holding file
+// data. relFileOff is the offset, relative to the start of the range covered
+// by the current node, from which reading must begin.
+func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, dst []byte) (int, error) {
+	curPhyBlkOff := int64(curPhyBlk) * int64(f.regFile.inode.blkSize)
+	if height == 0 {
+		toRead := int(f.regFile.inode.blkSize - relFileOff)
+		if len(dst) < toRead {
+			toRead = len(dst)
+		}
+
+		n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
+		if n < toRead {
+			return n, syserror.EIO
+		}
+		return n, nil
+	}
+
+	childCov := f.coverage[height-1]
+	startIdx := relFileOff / childCov
+	endIdx := f.regFile.inode.blkSize / 4 // This is exclusive.
+	wantEndIdx := (relFileOff + uint64(len(dst))) / childCov
+	wantEndIdx++ // Make this exclusive.
+	if wantEndIdx < endIdx {
+		endIdx = wantEndIdx
+	}
+
+	read := 0
+	curChildOff := relFileOff % childCov
+	for i := startIdx; i < endIdx; i++ {
+		var childPhyBlk uint32
+		err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
+		if err != nil {
+			return read, err
+		}
+
+		n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:])
+		read += n
+		if err != nil {
+			return read, err
+		}
+
+		curChildOff = 0
+	}
+
+	return read, nil
+}
+
+// getCoverage returns the number of bytes a node at the given height covers.
+// Height 0 is the file data block itself. Height 1 is the indirect block.
+// Each node above height 0 fans out to blkSize/4 children, which gives the
+// formula: blkSize * ((blkSize / 4)^height)
+func getCoverage(blkSize uint64, height uint) uint64 {
+	return blkSize * uint64(math.Pow(float64(blkSize/4), float64(height)))
+}
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
new file mode 100644
index 000000000..213aa3919
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -0,0 +1,157 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"bytes"
+	"math/rand"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+)
+
+// These consts are for mocking the block map tree.
+const (
+	mockBMBlkSize  = uint32(16)
+	mockBMDiskSize = 2500
+)
+
+// TestBlockMapReader stress tests block map reader functionality. It reads
+// from every possible offset in the block map file up to the end of the file.
+func TestBlockMapReader(t *testing.T) {
+	mockBMFile, want := blockMapSetUp(t)
+	n := len(want)
+
+	for from := 0; from < n; from++ {
+		got := make([]byte, n-from)
+
+		if read, err := mockBMFile.ReadAt(got, int64(from)); err != nil {
+			t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err)
+		}
+		// cmp.Diff(x, y) reports -x and +y, so want has to be passed first.
+		if diff := cmp.Diff(want[from:], got); diff != "" {
+			t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff)
+		}
+	}
+}
+
+// blkNumGen is a number generator which gives block numbers for building the
+// block map file on disk. It gives unique numbers in a random order which
+// facilitates in creating an extremely fragmented filesystem.
+type blkNumGen struct {
+	nums []uint32
+}
+
+// newBlkNumGen is the blkNumGen constructor.
+func newBlkNumGen() *blkNumGen {
+	blkNums := &blkNumGen{}
+	lim := mockBMDiskSize / mockBMBlkSize
+	blkNums.nums = make([]uint32, lim)
+	for i := uint32(0); i < lim; i++ {
+		blkNums.nums[i] = i
+	}
+
+	rand.Shuffle(int(lim), func(i, j int) {
+		blkNums.nums[i], blkNums.nums[j] = blkNums.nums[j], blkNums.nums[i]
+	})
+	return blkNums
+}
+
+// next returns the next random block number.
+func (n *blkNumGen) next() uint32 {
+	ret := n.nums[0]
+	n.nums = n.nums[1:]
+	return ret
+}
+
+// blockMapSetUp creates a mock disk and a block map file. It initializes the
+// block map file with 12 direct blocks, 1 indirect block, 1 double indirect
+// block and 1 triple indirect block (basically fill it to the brim). It
+// initializes the disk to reflect the inode. Also returns the file data that
+// the inode covers and that is written to disk.
+func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
+	mockDisk := make([]byte, mockBMDiskSize)
+	regFile := regularFile{
+		inode: inode{
+			diskInode: &disklayout.InodeNew{
+				InodeOld: disklayout.InodeOld{
+					SizeLo: getMockBMFileFize(),
+				},
+			},
+			dev:     bytes.NewReader(mockDisk),
+			blkSize: uint64(mockBMBlkSize),
+		},
+	}
+
+	var fileData []byte
+	blkNums := newBlkNumGen()
+	var data []byte
+
+	// Write the direct blocks.
+	for i := 0; i < numDirectBlks; i++ {
+		curBlkNum := blkNums.next()
+		data = binary.Marshal(data, binary.LittleEndian, curBlkNum)
+		fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...)
+	}
+
+	// Write to indirect block.
+	indirectBlk := blkNums.next()
+	data = binary.Marshal(data, binary.LittleEndian, indirectBlk)
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...)
+
+	// Write to double indirect block.
+	doublyIndirectBlk := blkNums.next()
+	data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk)
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...)
+
+	// Write to triple indirect block.
+	triplyIndirectBlk := blkNums.next()
+	data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
+
+	copy(regFile.inode.diskInode.Data(), data)
+
+	mockFile, err := newBlockMapFile(regFile)
+	if err != nil {
+		t.Fatalf("newBlockMapFile failed: %v", err)
+	}
+	return mockFile, fileData
+}
+
+// writeFileDataToBlock randomly fills the subtree at blkNum; returns its data.
+func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkNumGen) []byte {
+	if height == 0 {
+		start := blkNum * mockBMBlkSize
+		end := start + mockBMBlkSize
+		rand.Read(disk[start:end])
+		return disk[start:end]
+	}
+	// Interior node: store a fresh block number in each 4-byte slot and recurse.
+	var fileData []byte
+	for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 {
+		curBlkNum := blkNums.next()
+		copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum))
+		fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...)
+	}
+	return fileData
+}
+
+// getMockBMFileFize gets the size of the mock block map file which is used for
+// testing.
+func getMockBMFileFize() uint32 {
+	return uint32(numDirectBlks*getCoverage(uint64(mockBMBlkSize), 0) + getCoverage(uint64(mockBMBlkSize), 1) + getCoverage(uint64(mockBMBlkSize), 2) + getCoverage(uint64(mockBMBlkSize), 3))
+}
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 054fb42b6..054fb42b6 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
new file mode 100644
index 000000000..b51f3e18d
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -0,0 +1,308 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// directory represents a directory inode. It holds the childList in memory.
+type directory struct {
+	inode inode
+
+	// mu serializes the changes to childList.
+	// Lock Order (outermost locks must be taken first):
+	//   directory.mu
+	//     filesystem.mu
+	mu sync.Mutex
+
+	// childList is a list containing (1) child dirents and (2) fake dirents
+	// (with diskDirent == nil) that represent the iteration position of
+	// directoryFDs. childList is used to support directoryFD.IterDirents()
+	// efficiently. childList is protected by mu.
+	childList direntList
+
+	// childMap maps the child's filename to the dirent structure stored in
+	// childList. This adds some data replication but helps in faster path
+	// traversal. For consistency, key == childMap[key].diskDirent.FileName().
+	// Immutable.
+	childMap map[string]*dirent
+}
+
+// newDirectroy is the directory constructor.
+func newDirectroy(inode inode, newDirent bool) (*directory, error) {
+	file := &directory{inode: inode, childMap: make(map[string]*dirent)}
+	file.inode.impl = file
+
+	// Initialize childList by reading dirents from the underlying file.
+	if inode.diskInode.Flags().Index {
+		// TODO(b/134676337): Support hash tree directories. No entries are read
+		// in for them yet, so users cannot navigate this directory.
+		log.Warningf("hash tree directory being used which is unsupported")
+		return file, nil
+	}
+
+	// The dirents are organized in a linear array in the file data.
+	// Extract the file data and decode the dirents.
+	regFile, err := newRegularFile(inode)
+	if err != nil {
+		return nil, err
+	}
+
+	// buf is scratch space for reading dirents from disk and unmarshalling them.
+	buf := make([]byte, disklayout.DirentSize)
+	size := inode.diskInode.Size()
+	for off, inc := uint64(0), uint64(0); off < size; off += inc {
+		toRead := size - off
+		if toRead > disklayout.DirentSize {
+			toRead = disklayout.DirentSize
+		}
+		if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead {
+			return nil, err
+		}
+
+		var curDirent dirent
+		if newDirent {
+			curDirent.diskDirent = &disklayout.DirentNew{}
+		} else {
+			curDirent.diskDirent = &disklayout.DirentOld{}
+		}
+		binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+
+		if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
+			// A zero inode number or an empty name marks an unused dirent.
+			file.childList.PushBack(&curDirent)
+			file.childMap[curDirent.diskDirent.FileName()] = &curDirent
+		}
+
+		// The next dirent is placed exactly after this dirent record on disk.
+		inc = uint64(curDirent.diskDirent.RecordSize())
+		if inc == 0 {
+			// A zero record size would loop on this offset forever.
+			return nil, syserror.EIO
+		}
+	}
+
+	return file, nil
+}
+
+func (i *inode) isDir() bool {
+	_, ok := i.impl.(*directory)
+	return ok
+}
+
+// dirent is the directory.childList node.
+type dirent struct {
+	diskDirent disklayout.Dirent
+
+	// direntEntry links dirents into their parent directory.childList.
+	direntEntry
+}
+
+// directoryFD represents a directory file description. It implements
+// vfs.FileDescriptionImpl.
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	// Protected by directory.mu.
+	iter *dirent
+	off  int64
+}
+
+// Compiles only if directoryFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+	if fd.iter == nil {
+		return
+	}
+
+	dir := fd.inode().impl.(*directory)
+	dir.mu.Lock()
+	dir.childList.Remove(fd.iter)
+	dir.mu.Unlock()
+	fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	extfs := fd.filesystem()
+	dir := fd.inode().impl.(*directory)
+
+	dir.mu.Lock()
+	defer dir.mu.Unlock()
+
+	// Ensure that fd.iter exists and is not linked into dir.childList.
+	var child *dirent
+	if fd.iter == nil {
+		// Start iteration at the beginning of dir.
+		child = dir.childList.Front()
+		fd.iter = &dirent{}
+	} else {
+		// Continue iteration from where we left off.
+		child = fd.iter.Next()
+		dir.childList.Remove(fd.iter)
+	}
+	for ; child != nil; child = child.Next() {
+		// Skip other directoryFD iterators.
+		if child.diskDirent != nil {
+			childType, ok := child.diskDirent.FileType()
+			if !ok {
+				// We will need to read the inode off disk. Do not increment
+				// ref count here because this inode is not being added to the
+				// dentry tree.
+				extfs.mu.Lock()
+				childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode())
+				extfs.mu.Unlock()
+				if err != nil {
+					// Usage of the file description after the error is
+					// undefined. This implementation would continue reading
+					// from the next dirent.
+					fd.off++
+					dir.childList.InsertAfter(child, fd.iter)
+					return err
+				}
+				childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
+			}
+
+			if !cb.Handle(vfs.Dirent{
+				Name: child.diskDirent.FileName(),
+				Type: fs.ToDirentType(childType),
+				Ino:  uint64(child.diskDirent.Inode()),
+				Off:  fd.off,
+			}) {
+				dir.childList.InsertBefore(child, fd.iter)
+				return nil
+			}
+			fd.off++
+		}
+	}
+	dir.childList.PushBack(fd.iter)
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. Only SEEK_SET and SEEK_CUR are
+// supported. Offsets count dirents; seeking past the last dirent is allowed
+// and leaves the iterator at the end of the directory.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if whence != linux.SEEK_SET && whence != linux.SEEK_CUR {
+		return 0, syserror.EINVAL
+	}
+
+	dir := fd.inode().impl.(*directory)
+
+	dir.mu.Lock()
+	defer dir.mu.Unlock()
+
+	// Find resulting offset.
+	if whence == linux.SEEK_CUR {
+		offset += fd.off
+	}
+
+	if offset < 0 {
+		// lseek(2) specifies that EINVAL should be returned if the resulting offset
+		// is negative.
+		return 0, syserror.EINVAL
+	}
+
+	n := int64(len(dir.childMap))
+	realWantOff := offset
+	if realWantOff > n {
+		realWantOff = n
+	}
+	realCurOff := fd.off
+	if realCurOff > n {
+		realCurOff = n
+	}
+
+	// Ensure that fd.iter exists and is linked into dir.childList so we can
+	// intelligently seek from the optimal position.
+	if fd.iter == nil {
+		fd.iter = &dirent{}
+		dir.childList.PushFront(fd.iter)
+	}
+
+	// fd.iter always sits right before the real dirent at offset fd.off. Start
+	// from whichever of the current position, the front or the back of the list
+	// is closest to the target. skip counts the real dirents that must still be
+	// passed, in the walking direction, before the target dirent is reached.
+	diff := realWantOff - realCurOff // Shows direction and magnitude of travel.
+	abDiff := diff
+	if diff < 0 {
+		abDiff = -diff
+	}
+
+	// Guess that iterating forwards from the current position is optimal.
+	child, backward, skip := fd.iter.Next(), false, diff
+	switch {
+	case abDiff > realWantOff:
+		// Starting from the beginning is best.
+		child, skip = dir.childList.Front(), realWantOff
+	case abDiff > n-realWantOff:
+		// Starting from the end is best. The last real dirent is at offset
+		// (n - 1). skip is -1 when seeking to the end itself.
+		child, backward, skip = dir.childList.Back(), true, n - 1 - realWantOff
+	case diff < 0:
+		// Walk backwards from the current position. The first real dirent behind
+		// fd.iter is at offset realCurOff-1, not realCurOff.
+		child, backward, skip = fd.iter.Prev(), true, abDiff - 1
+	}
+
+	for child != nil && skip >= 0 {
+		// Skip other directoryFD iterators.
+		if child.diskDirent != nil {
+			if skip == 0 {
+				// child is the dirent at realWantOff; park fd.iter before it.
+				dir.childList.Remove(fd.iter)
+				dir.childList.InsertBefore(child, fd.iter)
+				fd.off = offset
+				return offset, nil
+			}
+			skip--
+		}
+
+		// The direction is fixed up front so that iterators met on the way
+		// cannot turn the walk around.
+		if backward {
+			child = child.Prev()
+		} else {
+			child = child.Next()
+		}
+	}
+
+	// Reaching here indicates that the offset is beyond the end of the childList.
+	dir.childList.Remove(fd.iter)
+	dir.childList.PushBack(fd.iter)
+	fd.off = offset
+	return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	// mmap(2) specifies that EACCES should be returned for non-regular file fds.
+	return syserror.EACCES
+}
diff --git a/pkg/sentry/fs/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index dde15110d..907d35b7e 100644
--- a/pkg/sentry/fs/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -22,7 +22,7 @@ go_library(
         "superblock_old.go",
         "test_utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout",
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
index ad6f4fef8..ad6f4fef8 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
index 3e16c76db..3e16c76db 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
index 9a809197a..9a809197a 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
index 0ef4294c0..0ef4294c0 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
index 685bf57b8..417b6cf65 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
@@ -21,6 +21,9 @@ import (
 const (
 	// MaxFileName is the maximum length of an ext fs file's name.
 	MaxFileName = 255
+
+	// DirentSize is the size of ext dirent structures.
+	DirentSize = 263
 )
 
 var (
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
index 29ae4a5c2..29ae4a5c2 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
index 6fff12a6e..6fff12a6e 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
index cc6dff2c9..934919f8a 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
@@ -21,8 +21,6 @@ import (
 // TestDirentSize tests that the dirent structs are of the correct
 // size.
 func TestDirentSize(t *testing.T) {
-	want := uintptr(263)
-
-	assertSize(t, DirentOld{}, want)
-	assertSize(t, DirentNew{}, want)
+	assertSize(t, DirentOld{}, uintptr(DirentSize))
+	assertSize(t, DirentNew{}, uintptr(DirentSize))
 }
diff --git a/pkg/sentry/fs/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
index bdf4e2132..bdf4e2132 100644
--- a/pkg/sentry/fs/ext/disklayout/disklayout.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
diff --git a/pkg/sentry/fs/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 567523d32..567523d32 100644
--- a/pkg/sentry/fs/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
diff --git a/pkg/sentry/fs/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index b0fad9b71..b0fad9b71 100644
--- a/pkg/sentry/fs/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go
index 88ae913f5..88ae913f5 100644
--- a/pkg/sentry/fs/ext/disklayout/inode.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
index 8f9f574ce..8f9f574ce 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
index db25b11b6..db25b11b6 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
index dd03ee50e..dd03ee50e 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
index 7a337a5e0..8bb327006 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
@@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures {
 // This is not exhaustive, unused features are not listed.
 const (
 	// SbDirentFileType indicates that directory entries record the file type.
-	// We should use struct ext4_dir_entry_2 for dirents then.
+	// We should use struct DirentNew for dirents then.
 	SbDirentFileType = 0x2
 
 	// SbRecovery indicates that the filesystem needs recovery.
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
index 53e515fd3..53e515fd3 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
index 7c1053fb4..7c1053fb4 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
index 9221e0251..9221e0251 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
index 463b5ba21..463b5ba21 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
index 9c63f04c0..9c63f04c0 100644
--- a/pkg/sentry/fs/ext/disklayout/test_utils.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
new file mode 100644
index 000000000..f10accafc
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ext implements readonly ext(2/3/4) filesystems.
+package ext
+
+import (
+	"errors"
+	"fmt"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Compiles only if FilesystemType implements vfs.FilesystemType.
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// getDeviceFd returns an io.ReaderAt to the underlying device.
+// Currently there are two ways of mounting an ext(2/3/4) fs:
+//   1. Specify a mount with our internal special MountType in the OCI spec.
+//   2. Expose the device to the container and mount it from application layer.
+func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
+	if opts.InternalData == nil {
+		// User mount call.
+		// TODO(b/134676337): Open the device specified by `source` and return that.
+		panic("unimplemented")
+	}
+
+	// NewFilesystem call originated from within the sentry.
+	devFd, ok := opts.InternalData.(int)
+	if !ok {
+		return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
+	}
+
+	if devFd < 0 {
+		return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd)
+	}
+
+	// The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership
+	// of the file descriptor and hence will not close it when it is garbage
+	// collected.
+	return fd.NewReadWriter(devFd), nil
+}
+
+// isCompatible checks if the superblock has feature sets which are compatible.
+// We only need to check the superblock incompatible feature set since we are
+// mounting readonly. We will also need to check readonly compatible feature
+// set when mounting for read/write.
+func isCompatible(sb disklayout.SuperBlock) bool {
+	// Please note that what is being checked is limited based on the fact that we
+	// are mounting readonly and that we are not journaling. When mounting
+	// read/write or with a journal, this must be reevaluated.
+	incompatFeatures := sb.IncompatibleFeatures()
+	if incompatFeatures.MetaBG {
+		log.Warningf("ext fs: meta block groups are not supported")
+		return false
+	}
+	if incompatFeatures.MMP {
+		log.Warningf("ext fs: multiple mount protection is not supported")
+		return false
+	}
+	if incompatFeatures.Encrypted {
+		log.Warningf("ext fs: encrypted inodes not supported")
+		return false
+	}
+	if incompatFeatures.InlineData {
+		log.Warningf("ext fs: inline files not supported")
+		return false
+	}
+	return true
+}
+
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
+func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
+	// EACCES should be returned according to mount(2). Filesystem independent
+	// flags (like readonly) are currently not available in pkg/sentry/vfs.
+
+	dev, err := getDeviceFd(source, opts)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
+	fs.vfsfs.Init(&fs)
+	fs.sb, err = readSuperBlock(dev)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
+		// mount(2) specifies that EINVAL should be returned if the superblock is
+		// invalid.
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Refuse to mount if the filesystem is incompatible.
+	if !isCompatible(fs.sb) {
+		return nil, nil, syserror.EINVAL
+	}
+
+	fs.bgs, err = readBlockGroups(dev, fs.sb)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
+	if err != nil {
+		return nil, nil, err
+	}
+	rootInode.incRef()
+
+	return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
+}
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
new file mode 100644
index 000000000..49b57a2d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -0,0 +1,917 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"sort"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+
+	"gvisor.dev/gvisor/runsc/test/testutil"
+)
+
+const (
+	assetsDir = "pkg/sentry/fsimpl/ext/assets"
+)
+
+var (
+	ext2ImagePath = path.Join(assetsDir, "tiny.ext2")
+	ext3ImagePath = path.Join(assetsDir, "tiny.ext3")
+	ext4ImagePath = path.Join(assetsDir, "tiny.ext4")
+)
+
+// setUp opens imagePath as an ext Filesystem and returns all necessary
+// elements required to run tests. If error is nil, it also returns a tear
+// down function which must be called after the test is run for clean up.
+func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
+	localImagePath, err := testutil.FindFile(imagePath)
+	if err != nil {
+		return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
+	}
+
+	f, err := os.Open(localImagePath)
+	if err != nil {
+		return nil, nil, nil, nil, err
+	}
+
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create VFS.
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	if err != nil {
+		f.Close()
+		return nil, nil, nil, nil, err
+	}
+
+	root := mntns.Root()
+
+	tearDown := func() {
+		root.DecRef()
+
+		if err := f.Close(); err != nil {
+			t.Fatalf("tearDown failed: %v", err)
+		}
+	}
+	return ctx, vfsObj, &root, tearDown, nil
+}
+
+// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and
+// vfs.FilesystemImpl.StatFSAt which are not implemented in
+// vfs.VirtualFilesystem yet.
+
+// TestSeek tests vfs.FileDescriptionImpl.Seek functionality.
+func TestSeek(t *testing.T) {
+	type seekTest struct {
+		name  string
+		image string
+		path  string
+	}
+
+	tests := []seekTest{
+		{
+			name:  "ext4 root dir seek",
+			image: ext4ImagePath,
+			path:  "/",
+		},
+		{
+			name:  "ext3 root dir seek",
+			image: ext3ImagePath,
+			path:  "/",
+		},
+		{
+			name:  "ext2 root dir seek",
+			image: ext2ImagePath,
+			path:  "/",
+		},
+		{
+			name:  "ext4 reg file seek",
+			image: ext4ImagePath,
+			path:  "/file.txt",
+		},
+		{
+			name:  "ext3 reg file seek",
+			image: ext3ImagePath,
+			path:  "/file.txt",
+		},
+		{
+			name:  "ext2 reg file seek",
+			image: ext2ImagePath,
+			path:  "/file.txt",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			fd, err := vfsfs.OpenAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.OpenOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.OpenAt failed: %v", err)
+			}
+
+			if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+				t.Errorf("expected seek position 0, got %d and error %v", n, err)
+			}
+
+			stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+			if err != nil {
+				t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
+			}
+
+			// We should be able to seek beyond the end of file.
+			size := int64(stat.Size)
+			if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+				t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
+			}
+
+			// EINVAL should be returned if the resulting offset is negative.
+			if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+				t.Errorf("expected error EINVAL but got %v", err)
+			}
+
+			if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+				t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
+			}
+
+			// Make sure negative offsets work with SEEK_CUR.
+			if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+				t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+			}
+
+			// EINVAL should be returned if the resulting offset is negative.
+			if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+				t.Errorf("expected error EINVAL but got %v", err)
+			}
+
+			// Make sure SEEK_END works with regular files.
+			switch fd.Impl().(type) {
+			case *regularFileFD:
+				// Seek back to 0.
+				if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+					t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
+				}
+
+				// Seek forward beyond EOF.
+				if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+					t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+				}
+
+				// EINVAL should be returned if the resulting offset is negative.
+				if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+					t.Errorf("expected error EINVAL but got %v", err)
+				}
+			}
+		})
+	}
+}
+
+// TestStatAt tests filesystem.StatAt functionality.
+func TestStatAt(t *testing.T) {
+	type statAtTest struct {
+		name  string
+		image string
+		path  string
+		want  linux.Statx
+	}
+
+	tests := []statAtTest{
+		{
+			name:  "ext4 statx small file",
+			image: ext4ImagePath,
+			path:  "/file.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0644 | linux.ModeRegular,
+				Size:    13,
+			},
+		},
+		{
+			name:  "ext3 statx small file",
+			image: ext3ImagePath,
+			path:  "/file.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0644 | linux.ModeRegular,
+				Size:    13,
+			},
+		},
+		{
+			name:  "ext2 statx small file",
+			image: ext2ImagePath,
+			path:  "/file.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0644 | linux.ModeRegular,
+				Size:    13,
+			},
+		},
+		{
+			name:  "ext4 statx big file",
+			image: ext4ImagePath,
+			path:  "/bigfile.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0644 | linux.ModeRegular,
+				Size:    13042,
+			},
+		},
+		{
+			name:  "ext3 statx big file",
+			image: ext3ImagePath,
+			path:  "/bigfile.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0644 | linux.ModeRegular,
+				Size:    13042,
+			},
+		},
+		{
+			name:  "ext2 statx big file",
+			image: ext2ImagePath,
+			path:  "/bigfile.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0644 | linux.ModeRegular,
+				Size:    13042,
+			},
+		},
+		{
+			name:  "ext4 statx symlink file",
+			image: ext4ImagePath,
+			path:  "/symlink.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0777 | linux.ModeSymlink,
+				Size:    8,
+			},
+		},
+		{
+			name:  "ext3 statx symlink file",
+			image: ext3ImagePath,
+			path:  "/symlink.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0777 | linux.ModeSymlink,
+				Size:    8,
+			},
+		},
+		{
+			name:  "ext2 statx symlink file",
+			image: ext2ImagePath,
+			path:  "/symlink.txt",
+			want: linux.Statx{
+				Blksize: 0x400,
+				Nlink:   1,
+				UID:     0,
+				GID:     0,
+				Mode:    0777 | linux.ModeSymlink,
+				Size:    8,
+			},
+		},
+	}
+
+	// Ignore the fields that are not supported by filesystem.StatAt yet and
+	// those which are likely to change as the image does.
+	ignoredFields := map[string]bool{
+		"Attributes":     true,
+		"AttributesMask": true,
+		"Atime":          true,
+		"Blocks":         true,
+		"Btime":          true,
+		"Ctime":          true,
+		"DevMajor":       true,
+		"DevMinor":       true,
+		"Ino":            true,
+		"Mask":           true,
+		"Mtime":          true,
+		"RdevMajor":      true,
+		"RdevMinor":      true,
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			got, err := vfsfs.StatAt(ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.StatOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err)
+			}
+
+			cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+				_, ok := ignoredFields[p.String()]
+				return ok
+			}, cmp.Ignore())
+			if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" {
+				t.Errorf("stat mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+// TestRead tests the read functionality for vfs file descriptions.
+func TestRead(t *testing.T) {
+	type readTest struct {
+		name    string
+		image   string
+		absPath string
+	}
+
+	tests := []readTest{
+		{
+			name:    "ext4 read small file",
+			image:   ext4ImagePath,
+			absPath: "/file.txt",
+		},
+		{
+			name:    "ext3 read small file",
+			image:   ext3ImagePath,
+			absPath: "/file.txt",
+		},
+		{
+			name:    "ext2 read small file",
+			image:   ext2ImagePath,
+			absPath: "/file.txt",
+		},
+		{
+			name:    "ext4 read big file",
+			image:   ext4ImagePath,
+			absPath: "/bigfile.txt",
+		},
+		{
+			name:    "ext3 read big file",
+			image:   ext3ImagePath,
+			absPath: "/bigfile.txt",
+		},
+		{
+			name:    "ext2 read big file",
+			image:   ext2ImagePath,
+			absPath: "/bigfile.txt",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			fd, err := vfsfs.OpenAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+				&vfs.OpenOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.OpenAt failed: %v", err)
+			}
+
+			// Get a local file descriptor and compare its functionality with a vfs file
+			// description for the same file.
+			localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath))
+			if err != nil {
+				t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err)
+			}
+
+			f, err := os.Open(localFile)
+			if err != nil {
+				t.Fatalf("os.Open failed for %s: %v", localFile, err)
+			}
+			defer f.Close()
+
+			// Read the entire file by reading one byte repeatedly. Doing this stress
+			// tests the underlying file reader implementation.
+			got := make([]byte, 1)
+			want := make([]byte, 1)
+			for {
+				n, err := f.Read(want)
+				fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+
+				if diff := cmp.Diff(got, want); diff != "" {
+					t.Errorf("file data mismatch (-want +got):\n%s", diff)
+				}
+
+				// Make sure there is no more file data left after getting EOF.
+				if n == 0 || err == io.EOF {
+					if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+						t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
+					}
+
+					break
+				}
+
+				if err != nil {
+					t.Fatalf("read failed: %v", err)
+				}
+			}
+		})
+	}
+}
+
+// iterDirentsCb is a simple callback which just keeps adding the dirents to an
+// internal list. Implements vfs.IterDirentsCallback.
+type iterDirentsCb struct {
+	dirents []vfs.Dirent
+}
+
+// Compiles only if iterDirentsCb implements vfs.IterDirentsCallback.
+var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil)
+
+// newIterDirentCb is the iterDirentsCb constructor.
+func newIterDirentCb() *iterDirentsCb {
+	return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)}
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+	cb.dirents = append(cb.dirents, dirent)
+	return true
+}
+
+// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
+func TestIterDirents(t *testing.T) {
+	type iterDirentTest struct {
+		name  string
+		image string
+		path  string
+		want  []vfs.Dirent
+	}
+
+	wantDirents := []vfs.Dirent{
+		{
+			Name: ".",
+			Type: linux.DT_DIR,
+		},
+		{
+			Name: "..",
+			Type: linux.DT_DIR,
+		},
+		{
+			Name: "lost+found",
+			Type: linux.DT_DIR,
+		},
+		{
+			Name: "file.txt",
+			Type: linux.DT_REG,
+		},
+		{
+			Name: "bigfile.txt",
+			Type: linux.DT_REG,
+		},
+		{
+			Name: "symlink.txt",
+			Type: linux.DT_LNK,
+		},
+	}
+	tests := []iterDirentTest{
+		{
+			name:  "ext4 root dir iteration",
+			image: ext4ImagePath,
+			path:  "/",
+			want:  wantDirents,
+		},
+		{
+			name:  "ext3 root dir iteration",
+			image: ext3ImagePath,
+			path:  "/",
+			want:  wantDirents,
+		},
+		{
+			name:  "ext2 root dir iteration",
+			image: ext2ImagePath,
+			path:  "/",
+			want:  wantDirents,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			fd, err := vfsfs.OpenAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.OpenOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.OpenAt failed: %v", err)
+			}
+
+			cb := newIterDirentCb()
+			if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+				t.Fatalf("dir fd.IterDirents() failed: %v", err)
+			}
+
+			sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name })
+			sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name })
+
+			// Ignore the inode number and offset of dirents because those are likely to
+			// change as the underlying image changes.
+			cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+				return p.String() == "Ino" || p.String() == "Off"
+			}, cmp.Ignore())
+			if diff := cmp.Diff(test.want, cb.dirents, cmpIgnoreFields); diff != "" {
+				t.Errorf("dirents mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+// TestRootDir tests that the root directory inode is correctly initialized and
+// returned from setUp.
+func TestRootDir(t *testing.T) {
+	type inodeProps struct {
+		Mode      linux.FileMode
+		UID       auth.KUID
+		GID       auth.KGID
+		Size      uint64
+		InodeSize uint16
+		Links     uint16
+		Flags     disklayout.InodeFlags
+	}
+
+	type rootDirTest struct {
+		name      string
+		image     string
+		wantInode inodeProps
+	}
+
+	tests := []rootDirTest{
+		{
+			name:  "ext4 root dir",
+			image: ext4ImagePath,
+			wantInode: inodeProps{
+				Mode:      linux.ModeDirectory | 0755,
+				Size:      0x400,
+				InodeSize: 0x80,
+				Links:     3,
+				Flags:     disklayout.InodeFlags{Extents: true},
+			},
+		},
+		{
+			name:  "ext3 root dir",
+			image: ext3ImagePath,
+			wantInode: inodeProps{
+				Mode:      linux.ModeDirectory | 0755,
+				Size:      0x400,
+				InodeSize: 0x80,
+				Links:     3,
+			},
+		},
+		{
+			name:  "ext2 root dir",
+			image: ext2ImagePath,
+			wantInode: inodeProps{
+				Mode:      linux.ModeDirectory | 0755,
+				Size:      0x400,
+				InodeSize: 0x80,
+				Links:     3,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			_, _, vd, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			d, ok := vd.Dentry().Impl().(*dentry)
+			if !ok {
+				t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl())
+			}
+
+			// Offload inode contents into local structs for comparison.
+			gotInode := inodeProps{
+				Mode:      d.inode.diskInode.Mode(),
+				UID:       d.inode.diskInode.UID(),
+				GID:       d.inode.diskInode.GID(),
+				Size:      d.inode.diskInode.Size(),
+				InodeSize: d.inode.diskInode.InodeSize(),
+				Links:     d.inode.diskInode.LinksCount(),
+				Flags:     d.inode.diskInode.Flags(),
+			}
+
+			if diff := cmp.Diff(gotInode, test.wantInode); diff != "" {
+				t.Errorf("inode mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+// TestFilesystemInit tests that the filesystem superblock and block group
+// descriptors are correctly read in and initialized.
+func TestFilesystemInit(t *testing.T) {
+	// sb only contains the immutable properties of the superblock.
+	type sb struct {
+		InodesCount      uint32
+		BlocksCount      uint64
+		MaxMountCount    uint16
+		FirstDataBlock   uint32
+		BlockSize        uint64
+		BlocksPerGroup   uint32
+		ClusterSize      uint64
+		ClustersPerGroup uint32
+		InodeSize        uint16
+		InodesPerGroup   uint32
+		BgDescSize       uint16
+		Magic            uint16
+		Revision         disklayout.SbRevision
+		CompatFeatures   disklayout.CompatFeatures
+		IncompatFeatures disklayout.IncompatFeatures
+		RoCompatFeatures disklayout.RoCompatFeatures
+	}
+
+	// bg only contains the immutable properties of the block group descriptor.
+	type bg struct {
+		InodeTable      uint64
+		BlockBitmap     uint64
+		InodeBitmap     uint64
+		ExclusionBitmap uint64
+		Flags           disklayout.BGFlags
+	}
+
+	type fsInitTest struct {
+		name    string
+		image   string
+		wantSb  sb
+		wantBgs []bg
+	}
+
+	tests := []fsInitTest{
+		{
+			name:  "ext4 filesystem init",
+			image: ext4ImagePath,
+			wantSb: sb{
+				InodesCount:      0x10,
+				BlocksCount:      0x40,
+				MaxMountCount:    0xffff,
+				FirstDataBlock:   0x1,
+				BlockSize:        0x400,
+				BlocksPerGroup:   0x2000,
+				ClusterSize:      0x400,
+				ClustersPerGroup: 0x2000,
+				InodeSize:        0x80,
+				InodesPerGroup:   0x10,
+				BgDescSize:       0x40,
+				Magic:            linux.EXT_SUPER_MAGIC,
+				Revision:         disklayout.DynamicRev,
+				CompatFeatures: disklayout.CompatFeatures{
+					ExtAttr:     true,
+					ResizeInode: true,
+					DirIndex:    true,
+				},
+				IncompatFeatures: disklayout.IncompatFeatures{
+					DirentFileType: true,
+					Extents:        true,
+					Is64Bit:        true,
+					FlexBg:         true,
+				},
+				RoCompatFeatures: disklayout.RoCompatFeatures{
+					Sparse:       true,
+					LargeFile:    true,
+					HugeFile:     true,
+					DirNlink:     true,
+					ExtraIsize:   true,
+					MetadataCsum: true,
+				},
+			},
+			wantBgs: []bg{
+				{
+					InodeTable:  0x23,
+					BlockBitmap: 0x3,
+					InodeBitmap: 0x13,
+					Flags: disklayout.BGFlags{
+						InodeZeroed: true,
+					},
+				},
+			},
+		},
+		{
+			name:  "ext3 filesystem init",
+			image: ext3ImagePath,
+			wantSb: sb{
+				InodesCount:      0x10,
+				BlocksCount:      0x40,
+				MaxMountCount:    0xffff,
+				FirstDataBlock:   0x1,
+				BlockSize:        0x400,
+				BlocksPerGroup:   0x2000,
+				ClusterSize:      0x400,
+				ClustersPerGroup: 0x2000,
+				InodeSize:        0x80,
+				InodesPerGroup:   0x10,
+				BgDescSize:       0x20,
+				Magic:            linux.EXT_SUPER_MAGIC,
+				Revision:         disklayout.DynamicRev,
+				CompatFeatures: disklayout.CompatFeatures{
+					ExtAttr:     true,
+					ResizeInode: true,
+					DirIndex:    true,
+				},
+				IncompatFeatures: disklayout.IncompatFeatures{
+					DirentFileType: true,
+				},
+				RoCompatFeatures: disklayout.RoCompatFeatures{
+					Sparse:    true,
+					LargeFile: true,
+				},
+			},
+			wantBgs: []bg{
+				{
+					InodeTable:  0x5,
+					BlockBitmap: 0x3,
+					InodeBitmap: 0x4,
+					Flags: disklayout.BGFlags{
+						InodeZeroed: true,
+					},
+				},
+			},
+		},
+		{
+			name:  "ext2 filesystem init",
+			image: ext2ImagePath,
+			wantSb: sb{
+				InodesCount:      0x10,
+				BlocksCount:      0x40,
+				MaxMountCount:    0xffff,
+				FirstDataBlock:   0x1,
+				BlockSize:        0x400,
+				BlocksPerGroup:   0x2000,
+				ClusterSize:      0x400,
+				ClustersPerGroup: 0x2000,
+				InodeSize:        0x80,
+				InodesPerGroup:   0x10,
+				BgDescSize:       0x20,
+				Magic:            linux.EXT_SUPER_MAGIC,
+				Revision:         disklayout.DynamicRev,
+				CompatFeatures: disklayout.CompatFeatures{
+					ExtAttr:     true,
+					ResizeInode: true,
+					DirIndex:    true,
+				},
+				IncompatFeatures: disklayout.IncompatFeatures{
+					DirentFileType: true,
+				},
+				RoCompatFeatures: disklayout.RoCompatFeatures{
+					Sparse:    true,
+					LargeFile: true,
+				},
+			},
+			wantBgs: []bg{
+				{
+					InodeTable:  0x5,
+					BlockBitmap: 0x3,
+					InodeBitmap: 0x4,
+					Flags: disklayout.BGFlags{
+						InodeZeroed: true,
+					},
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			_, _, vd, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			fs, ok := vd.Mount().Filesystem().Impl().(*filesystem)
+			if !ok {
+				t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl())
+			}
+
+			// Offload superblock and block group descriptors contents into
+			// local structs for comparison.
+			totalFreeInodes := uint32(0)
+			totalFreeBlocks := uint64(0)
+			gotSb := sb{
+				InodesCount:      fs.sb.InodesCount(),
+				BlocksCount:      fs.sb.BlocksCount(),
+				MaxMountCount:    fs.sb.MaxMountCount(),
+				FirstDataBlock:   fs.sb.FirstDataBlock(),
+				BlockSize:        fs.sb.BlockSize(),
+				BlocksPerGroup:   fs.sb.BlocksPerGroup(),
+				ClusterSize:      fs.sb.ClusterSize(),
+				ClustersPerGroup: fs.sb.ClustersPerGroup(),
+				InodeSize:        fs.sb.InodeSize(),
+				InodesPerGroup:   fs.sb.InodesPerGroup(),
+				BgDescSize:       fs.sb.BgDescSize(),
+				Magic:            fs.sb.Magic(),
+				Revision:         fs.sb.Revision(),
+				CompatFeatures:   fs.sb.CompatibleFeatures(),
+				IncompatFeatures: fs.sb.IncompatibleFeatures(),
+				RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(),
+			}
+			gotNumBgs := len(fs.bgs)
+			gotBgs := make([]bg, gotNumBgs)
+			for i := 0; i < gotNumBgs; i++ {
+				gotBgs[i].InodeTable = fs.bgs[i].InodeTable()
+				gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap()
+				gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap()
+				gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap()
+				gotBgs[i].Flags = fs.bgs[i].Flags()
+
+				totalFreeInodes += fs.bgs[i].FreeInodesCount()
+				totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount())
+			}
+
+			if diff := cmp.Diff(gotSb, test.wantSb); diff != "" {
+				t.Errorf("superblock mismatch (-want +got):\n%s", diff)
+			}
+
+			if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" {
+				t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff)
+			}
+
+			if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" {
+				t.Errorf("total free inodes mismatch (-want +got):\n%s", diff)
+			}
+
+			if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" {
+				t.Errorf("total free blocks mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
new file mode 100644
index 000000000..38b68a2d3
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -0,0 +1,237 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"io"
+	"sort"
+
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// extentFile is a type of regular file which uses extents to store file data.
+type extentFile struct {
+	regFile regularFile
+
+	// root is the root extent node. This lives in the 60 byte diskInode.Data().
+	// Immutable.
+	root disklayout.ExtentNode
+}
+
+// Compiles only if extentFile implements io.ReaderAt.
+var _ io.ReaderAt = (*extentFile)(nil)
+
+// newExtentFile is the extent file constructor. It reads the entire extent
+// tree into memory.
+// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
+func newExtentFile(regFile regularFile) (*extentFile, error) {
+	file := &extentFile{regFile: regFile}
+	file.regFile.impl = file
+	err := file.buildExtTree()
+	if err != nil {
+		return nil, err
+	}
+	return file, nil
+}
+
+// buildExtTree builds the extent tree by reading it from disk and
+// running a simple DFS. It first reads the root node from the inode struct in
+// memory. Then it recursively builds the rest of the tree by reading it off
+// disk.
+//
+// Precondition: inode flag InExtents must be set.
+func (f *extentFile) buildExtTree() error {
+	rootNodeData := f.regFile.inode.diskInode.Data()
+
+	binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+
+	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
+	if f.root.Header.NumEntries > 4 {
+		// read(2) specifies that EINVAL should be returned if the file is unsuitable
+		// for reading.
+		return syserror.EINVAL
+	}
+
+	f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
+	for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+		var curEntry disklayout.ExtentEntry
+		if f.root.Header.Height == 0 {
+			// Leaf node.
+			curEntry = &disklayout.Extent{}
+		} else {
+			// Internal node.
+			curEntry = &disklayout.ExtentIdx{}
+		}
+		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+		f.root.Entries[i].Entry = curEntry
+	}
+
+	// If this node is internal, perform DFS.
+	if f.root.Header.Height > 0 {
+		for i := uint16(0); i < f.root.Header.NumEntries; i++ {
+			var err error
+			if f.root.Entries[i].Node, err = f.buildExtTreeFromDisk(f.root.Entries[i].Entry); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
+// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
+// by the ExtentEntry.
+func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) {
+	var header disklayout.ExtentHeader
+	off := entry.PhysicalBlock() * f.regFile.inode.blkSize
+	err := readFromDisk(f.regFile.inode.dev, int64(off), &header)
+	if err != nil {
+		return nil, err
+	}
+
+	entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
+	for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+		var curEntry disklayout.ExtentEntry
+		if header.Height == 0 {
+			// Leaf node.
+			curEntry = &disklayout.Extent{}
+		} else {
+			// Internal node.
+			curEntry = &disklayout.ExtentIdx{}
+		}
+
+		err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry)
+		if err != nil {
+			return nil, err
+		}
+		entries[i].Entry = curEntry
+	}
+
+	// If this node is internal, perform DFS.
+	if header.Height > 0 {
+		for i := uint16(0); i < header.NumEntries; i++ {
+			var err error
+			entries[i].Node, err = f.buildExtTreeFromDisk(entries[i].Entry)
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	return &disklayout.ExtentNode{Header: header, Entries: entries}, nil
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) {
+	if len(dst) == 0 {
+		return 0, nil
+	}
+
+	if off < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	if uint64(off) >= f.regFile.inode.diskInode.Size() {
+		return 0, io.EOF
+	}
+
+	n, err := f.read(&f.root, uint64(off), dst)
+	if n < len(dst) && err == nil {
+		err = io.EOF
+	}
+	return n, err
+}
+
+// read is the recursive step of extentFile.ReadAt which traverses the extent
+// tree from the node passed and reads file data.
+func (f *extentFile) read(node *disklayout.ExtentNode, off uint64, dst []byte) (int, error) {
+	// Perform a binary search for the node covering bytes starting at off.
+	// A highly fragmented filesystem can have up to 340 entries and so linear
+	// search should be avoided. Finds the first entry which does not cover the
+	// file block we want and subtracts 1 to get the desired index.
+	fileBlk := uint32(off / f.regFile.inode.blkSize)
+	n := len(node.Entries)
+	found := sort.Search(n, func(i int) bool {
+		return node.Entries[i].Entry.FileBlock() > fileBlk
+	}) - 1
+
+	// We should be in this recursive step only if the data we want exists under
+	// the current node.
+	if found < 0 {
+		panic("searching for a file block in an extent entry which does not cover it")
+	}
+
+	read := 0
+	toRead := len(dst)
+	var curR int
+	var err error
+	for i := found; i < n && read < toRead; i++ {
+		if node.Header.Height == 0 {
+			curR, err = f.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), off, dst[read:])
+		} else {
+			curR, err = f.read(node.Entries[i].Node, off, dst[read:])
+		}
+
+		read += curR
+		off += uint64(curR)
+		if err != nil {
+			return read, err
+		}
+	}
+
+	return read, nil
+}
+
+// readFromExtent reads file data from the extent. It takes advantage of the
+// sequential nature of extents and reads file data from multiple blocks in one
+// call.
+//
+// A non-nil error indicates that this is a partial read and there is probably
+// more to read from this extent. The caller should propagate the error upward
+// and not move to the next extent in the tree.
+//
+// A subsequent call to extentReader.Read should continue reading from where we
+// left off as expected.
+func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byte) (int, error) {
+	curFileBlk := uint32(off / f.regFile.inode.blkSize)
+	exFirstFileBlk := ex.FileBlock()
+	exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive.
+
+	// We should be in this recursive step only if the data we want exists under
+	// the current extent.
+	if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk {
+		panic("searching for a file block in an extent which does not cover it")
+	}
+
+	curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock()
+	readStart := curPhyBlk*f.regFile.inode.blkSize + (off % f.regFile.inode.blkSize)
+
+	endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length)
+	extentEnd := endPhyBlk * f.regFile.inode.blkSize // This is exclusive.
+
+	toRead := int(extentEnd - readStart)
+	if len(dst) < toRead {
+		toRead = len(dst)
+	}
+
+	n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart))
+	if n < toRead {
+		return n, syserror.EIO
+	}
+	return n, nil
+}
diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index b3f342c8e..42d0a484b 100644
--- a/pkg/sentry/fs/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -16,17 +16,23 @@ package ext
 
 import (
 	"bytes"
+	"math/rand"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 )
 
-// TestExtentTree tests the extent tree building logic.
+const (
+	// mockExtentBlkSize is the mock block size used for testing.
+	// No block has more than 1 header + 4 entries.
+	mockExtentBlkSize = uint64(64)
+)
+
+// The tree described below looks like:
 //
-// Test tree:
 //            0.{Head}[Idx][Idx]
 //                     /     \
 //                    /       \
@@ -44,12 +50,8 @@ import (
 //
 // Please note that ext4 might not construct extent trees looking like this.
 // This is purely for testing the tree traversal logic.
-func TestExtentTree(t *testing.T) {
-	blkSize := uint64(64) // No block has more than 1 header + 4 entries.
-	mockDisk := make([]byte, blkSize*10)
-	mockInode := &inode{diskInode: &disklayout.InodeNew{}}
-
-	node3 := &disklayout.ExtentNode{
+var (
+	node3 = &disklayout.ExtentNode{
 		Header: disklayout.ExtentHeader{
 			Magic:      disklayout.ExtentMagic,
 			NumEntries: 1,
@@ -68,7 +70,7 @@ func TestExtentTree(t *testing.T) {
 		},
 	}
 
-	node2 := &disklayout.ExtentNode{
+	node2 = &disklayout.ExtentNode{
 		Header: disklayout.ExtentHeader{
 			Magic:      disklayout.ExtentMagic,
 			NumEntries: 1,
@@ -86,7 +88,7 @@ func TestExtentTree(t *testing.T) {
 		},
 	}
 
-	node1 := &disklayout.ExtentNode{
+	node1 = &disklayout.ExtentNode{
 		Header: disklayout.ExtentHeader{
 			Magic:      disklayout.ExtentMagic,
 			NumEntries: 2,
@@ -113,7 +115,7 @@ func TestExtentTree(t *testing.T) {
 		},
 	}
 
-	node0 := &disklayout.ExtentNode{
+	node0 = &disklayout.ExtentNode{
 		Header: disklayout.ExtentHeader{
 			Magic:      disklayout.ExtentMagic,
 			NumEntries: 2,
@@ -137,22 +139,69 @@ func TestExtentTree(t *testing.T) {
 			},
 		},
 	}
+)
 
-	writeTree(mockInode, mockDisk, node0, blkSize)
+// TestExtentReader stress tests extentFile.ReadAt. It reads up to the end of
+// the file from every possible starting offset in the extent tree.
+func TestExtentReader(t *testing.T) {
+	mockExtentFile, want := extentTreeSetUp(t, node0)
+	n := len(want)
 
-	r := bytes.NewReader(mockDisk)
-	if err := mockInode.buildExtTree(r, blkSize); err != nil {
-		t.Fatalf("inode.buildExtTree failed: %v", err)
+	for from := 0; from < n; from++ {
+		got := make([]byte, n-from)
+
+		if read, err := mockExtentFile.ReadAt(got, int64(from)); err != nil {
+			t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err)
+		}
+
+		if diff := cmp.Diff(got, want[from:]); diff != "" {
+			t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff)
+		}
 	}
+}
+
+// TestBuildExtentTree tests the extent tree building logic.
+func TestBuildExtentTree(t *testing.T) {
+	mockExtentFile, _ := extentTreeSetUp(t, node0)
 
 	opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{})
-	if diff := cmp.Diff(mockInode.root, node0, opt); diff != "" {
+	if diff := cmp.Diff(&mockExtentFile.root, node0, opt); diff != "" {
 		t.Errorf("extent tree mismatch (-want +got):\n%s", diff)
 	}
 }
 
-// writeTree writes the tree represented by `root` to the inode and disk passed.
-func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint64) {
+// extentTreeSetUp writes the passed extent tree to a mock disk as an extent
+// tree. It also constructs a mock extent file with the same tree built in it.
+// It also writes random file data to the mock disk and returns it.
+func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []byte) {
+	t.Helper()
+
+	mockDisk := make([]byte, mockExtentBlkSize*10)
+	mockExtentFile := &extentFile{
+		regFile: regularFile{
+			inode: inode{
+				diskInode: &disklayout.InodeNew{
+					InodeOld: disklayout.InodeOld{
+						SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
+					},
+				},
+				blkSize: mockExtentBlkSize,
+				dev:     bytes.NewReader(mockDisk),
+			},
+		},
+	}
+
+	fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, root, mockExtentBlkSize)
+
+	if err := mockExtentFile.buildExtTree(); err != nil {
+		t.Fatalf("inode.buildExtTree failed: %v", err)
+	}
+	return mockExtentFile, fileData
+}
+
+// writeTree writes the tree represented by `root` to the inode and disk. It
+// also writes random file data on disk.
+func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
 	rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
 	for _, ep := range root.Entries {
 		rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
@@ -160,26 +209,57 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint
 
 	copy(in.diskInode.Data(), rootData)
 
-	if root.Header.Height > 0 {
-		for _, ep := range root.Entries {
-			writeTreeToDisk(disk, ep, blkSize)
+	var fileData []byte
+	for _, ep := range root.Entries {
+		if root.Header.Height == 0 {
+			fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...)
+		} else {
+			fileData = append(fileData, writeTreeToDisk(disk, ep)...)
 		}
 	}
+	return fileData
 }
 
 // writeTreeToDisk is the recursive step for writeTree which writes the tree
-// on the disk only.
-func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair, blkSize uint64) {
+// on the disk only. Also writes random file data on disk.
+func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
 	nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
 	for _, ep := range curNode.Node.Entries {
 		nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
 	}
 
-	copy(disk[curNode.Entry.PhysicalBlock()*blkSize:], nodeData)
+	copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
+
+	var fileData []byte
+	for _, ep := range curNode.Node.Entries {
+		if curNode.Node.Header.Height == 0 {
+			fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...)
+		} else {
+			fileData = append(fileData, writeTreeToDisk(disk, ep)...)
+		}
+	}
+	return fileData
+}
+
+// writeFileDataToExtent writes random bytes to the blocks on disk that the
+// passed extent points to.
+func writeFileDataToExtent(disk []byte, ex *disklayout.Extent) []byte {
+	phyExStartBlk := ex.PhysicalBlock()
+	phyExStartOff := phyExStartBlk * mockExtentBlkSize
+	phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize
+	rand.Read(disk[phyExStartOff:phyExEndOff])
+	return disk[phyExStartOff:phyExEndOff]
+}
 
-	if curNode.Node.Header.Height > 0 {
-		for _, ep := range curNode.Node.Entries {
-			writeTreeToDisk(disk, ep, blkSize)
+// getNumPhyBlks returns the number of physical blocks covered under the node.
+func getNumPhyBlks(node *disklayout.ExtentNode) uint32 {
+	var res uint32
+	for _, ep := range node.Entries {
+		if node.Header.Height == 0 {
+			res += uint32(ep.Entry.(*disklayout.Extent).Length)
+		} else {
+			res += getNumPhyBlks(ep.Node)
 		}
 	}
+	return res
 }
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
new file mode 100644
index 000000000..a0065343b
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -0,0 +1,86 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// fileDescription is embedded by ext implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	// flags is the same as vfs.OpenOptions.Flags which are passed to
+	// vfs.FilesystemImpl.OpenAt.
+	// TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
+	// fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
+	// Only close(2), fstat(2), fstatfs(2) should work.
+	flags uint32
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *fileDescription) OnClose() error { return nil }
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	var stat linux.Statfs
+	fd.filesystem().statTo(&stat)
+	return stat, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *fileDescription) Sync(ctx context.Context) error {
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
new file mode 100644
index 000000000..2d15e8aaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -0,0 +1,443 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"errors"
+	"io"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var (
+	// errResolveDirent indicates that the vfs.ResolvingPath.Component() does
+	// not exist on the dentry tree but does exist on disk. So it has to be read in
+	// using the in-memory dirent and added to the dentry tree. Usually indicates
+	// the need to lock filesystem.mu for writing.
+	errResolveDirent = errors.New("resolve path component using dirent")
+)
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// mu serializes changes to the Dentry tree.
+	mu sync.RWMutex
+
+	// dev represents the underlying fs device. It does not require protection
+	// because io.ReaderAt permits concurrent read calls to it. It translates to
+	// the pread syscall which passes on the read request directly to the device
+	// driver. Device drivers are intelligent in serving multiple concurrent read
+	// requests in the optimal order (taking locality into consideration).
+	dev io.ReaderAt
+
+	// inodeCache maps absolute inode numbers to the corresponding Inode struct.
+	// Inodes should be removed from this once their reference count hits 0.
+	//
+	// Protected by mu because most additions (see IterDirents) and all removals
+	// from this correspond to a change in the dentry tree.
+	inodeCache map[uint32]*inode
+
+	// sb represents the filesystem superblock. Immutable after initialization.
+	sb disklayout.SuperBlock
+
+	// bgs represents all the block group descriptors for the filesystem.
+	// Immutable after initialization.
+	bgs []disklayout.BlockGroup
+}
+
+// Compiles only if filesystem implements vfs.FilesystemImpl.
+var _ vfs.FilesystemImpl = (*filesystem)(nil)
+
+// stepLocked resolves rp.Component() in parent directory vfsd. The write
+// parameter passed tells if the caller has acquired filesystem.mu for writing
+// or not. If set to true, an existing inode on disk can be added to the dentry
+// tree if not present already; otherwise errResolveDirent is returned.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions:
+//     - filesystem.mu must be locked (for writing if write param is true).
+//     - !rp.Done().
+//     - inode == vfsd.Impl().(*dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+	if !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, nil, err
+	}
+
+	for {
+		nextVFSD, err := rp.ResolveComponent(vfsd)
+		if err != nil {
+			return nil, nil, err
+		}
+		if nextVFSD == nil {
+			// Since the Dentry tree is not the sole source of truth for extfs, if it's
+			// not in the Dentry tree, it might need to be pulled from disk.
+			childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+			if !ok {
+				// The underlying inode does not exist on disk.
+				return nil, nil, syserror.ENOENT
+			}
+			if !write {
+				// filesystem.mu must be held for writing to add to the dentry tree.
+				return nil, nil, errResolveDirent
+			}
+
+			// Create and add the component's dirent to the dentry tree.
+			fs := rp.Mount().Filesystem().Impl().(*filesystem)
+			childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode())
+			if err != nil {
+				return nil, nil, err
+			}
+			// incRef because this is being added to the dentry tree.
+			childInode.incRef()
+			child := newDentry(childInode)
+			vfsd.InsertChild(&child.vfsd, rp.Component())
+
+			// Continue as usual now that nextVFSD is not nil.
+			nextVFSD = &child.vfsd
+		}
+		nextInode := nextVFSD.Impl().(*dentry).inode
+		if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
+			// The symlink target lives on the child inode being stepped into.
+			if err := rp.HandleSymlink(nextInode.impl.(*symlink).target); err != nil {
+				return nil, nil, err
+			}
+			continue
+		}
+		rp.Advance()
+		return nextVFSD, nextInode, nil
+	}
+}
+
+// walkLocked resolves rp to an existing file. The write parameter
+// passed tells if the caller has acquired filesystem.mu for writing or not.
+// If set to true, additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions:
+//     - filesystem.mu must be locked (for writing if write param is true).
+func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*dentry).inode
+	for !rp.Done() {
+		var err error
+		vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	if rp.MustBeDir() && !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, inode, nil
+}
+
+// walkParentLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp. The write parameter passed tells if the
+// caller has acquired filesystem.mu for writing or not. If set to true,
+// additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
+//
+// Preconditions:
+//     - filesystem.mu must be locked (for writing if write param is true).
+//     - !rp.Done().
+func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*dentry).inode
+	for !rp.Final() {
+		var err error
+		vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	if !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, inode, nil
+}
+
+// walk resolves rp to an existing file. If parent is set to true, it resolves
+// the rp till the parent of the last component which should be an existing
+// directory. If parent is false then resolves rp entirely. Attempts to resolve
+// the path as far as it can with a read lock and upgrades the lock if needed.
+func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+	var (
+		vfsd  *vfs.Dentry
+		inode *inode
+		err   error
+	)
+
+	// Try walking with the hopes that all dentries have already been pulled out
+	// of disk. This reduces congestion (allows concurrent walks).
+	fs.mu.RLock()
+	if parent {
+		vfsd, inode, err = walkParentLocked(rp, false)
+	} else {
+		vfsd, inode, err = walkLocked(rp, false)
+	}
+	fs.mu.RUnlock()
+
+	if err == errResolveDirent {
+		// Upgrade lock and continue walking. Lock upgrading in the middle of the
+		// walk is fine as this is a read only filesystem.
+		fs.mu.Lock()
+		if parent {
+			vfsd, inode, err = walkParentLocked(rp, true)
+		} else {
+			vfsd, inode, err = walkLocked(rp, true)
+		}
+		fs.mu.Unlock()
+	}
+
+	return vfsd, inode, err
+}
+
+// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in.
+// It creates a new one with the given inode number if one does not exist.
+// The caller must increment the ref count if adding this to the dentry tree.
+//
+// Precondition: must be holding fs.mu for writing.
+func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) {
+	if in, ok := fs.inodeCache[inodeNum]; ok {
+		return in, nil
+	}
+
+	in, err := newInode(fs, inodeNum)
+	if err != nil {
+		return nil, err
+	}
+
+	fs.inodeCache[inodeNum] = in
+	return in, nil
+}
+
+// statTo writes the statfs fields to the output parameter.
+func (fs *filesystem) statTo(stat *linux.Statfs) {
+	stat.Type = uint64(fs.sb.Magic())
+	stat.BlockSize = int64(fs.sb.BlockSize())
+	stat.Blocks = fs.sb.BlocksCount()
+	stat.BlocksFree = fs.sb.FreeBlocksCount()
+	stat.BlocksAvailable = fs.sb.FreeBlocksCount()
+	stat.Files = uint64(fs.sb.InodesCount())
+	stat.FilesFree = uint64(fs.sb.FreeInodesCount())
+	stat.NameLength = disklayout.MaxFileName
+	stat.FragmentSize = int64(fs.sb.BlockSize())
+	// TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	vfsd, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.CheckSearchable {
+		if !inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+
+	inode.incRef()
+	return vfsd, nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	vfsd, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+
+	// EROFS is returned if write access is needed.
+	if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
+		return nil, syserror.EROFS
+	}
+	return inode.open(rp, vfsd, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return "", err
+	}
+	symlink, ok := inode.impl.(*symlink)
+	if !ok {
+		return "", syserror.EINVAL
+	}
+	return symlink.target, nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	var stat linux.Statx
+	inode.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	if _, _, err := fs.walk(rp, false); err != nil {
+		return linux.Statfs{}, err
+	}
+
+	var stat linux.Statfs
+	fs.statTo(&stat)
+	return stat, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// This is a readonly filesystem for now.
+	return nil
+}
+
+// The vfs.FilesystemImpl functions below return EROFS because their respective
+// man pages say that EROFS must be returned if the path resolves to a file on
+// this read-only filesystem.
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	if _, _, err := fs.walk(rp, true); err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	if _, _, err := fs.walk(rp, true); err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	_, _, err := fs.walk(rp, true)
+	if err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+	if rp.Done() {
+		return syserror.ENOENT
+	}
+
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
+	if !inode.isDir() {
+		return syserror.ENOTDIR
+	}
+
+	return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	_, _, err := fs.walk(rp, true)
+	if err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
+	if inode.isDir() {
+		return syserror.EISDIR
+	}
+
+	return syserror.EROFS
+}
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
new file mode 100644
index 000000000..e6c847a71
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -0,0 +1,219 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// inode represents an ext inode.
+//
+// inode uses the same inheritance pattern that pkg/sentry/vfs structures use.
+// This has been done to increase memory locality.
+//
+// Implementations:
+//    inode --
+//           |-- dir
+//           |-- symlink
+//           |-- regular--
+//                       |-- extent file
+//                       |-- block map file
+type inode struct {
+	// refs is a reference count. refs is accessed using atomic memory operations.
+	refs int64
+
+	// inodeNum is the inode number of this inode on disk. This is used to
+	// identify inodes within the ext filesystem.
+	inodeNum uint32
+
+	// dev represents the underlying device. Same as filesystem.dev.
+	dev io.ReaderAt
+
+	// blkSize is the fs data block size. Same as filesystem.sb.BlockSize().
+	blkSize uint64
+
+	// diskInode gives us access to the inode struct on disk. Immutable.
+	diskInode disklayout.Inode
+
+	// This is immutable. Implementations must have inode as their first field
+	// to ensure memory locality.
+	impl interface{}
+}
+
+// incRef increments the inode ref count.
+func (in *inode) incRef() {
+	atomic.AddInt64(&in.refs, 1)
+}
+
+// tryIncRef tries to increment the ref count. Returns true if successful.
+func (in *inode) tryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&in.refs)
+		if refs == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// decRef decrements the inode ref count and releases the inode resources if
+// the ref count hits 0.
+//
+// Precondition: Must have locked fs.mu.
+func (in *inode) decRef(fs *filesystem) {
+	if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
+		delete(fs.inodeCache, in.inodeNum)
+	} else if refs < 0 {
+		panic("ext.inode.decRef() called without holding a reference")
+	}
+}
+
+// newInode is the inode constructor. Reads the inode off disk. Identifies
+// inodes based on the absolute inode number on disk.
+func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
+	if inodeNum == 0 {
+		panic("inode number 0 on ext filesystems is not possible")
+	}
+
+	inodeRecordSize := fs.sb.InodeSize()
+	var diskInode disklayout.Inode
+	if inodeRecordSize == disklayout.OldInodeSize {
+		diskInode = &disklayout.InodeOld{}
+	} else {
+		diskInode = &disklayout.InodeNew{}
+	}
+
+	// Calculate where the inode is actually placed.
+	inodesPerGrp := fs.sb.InodesPerGroup()
+	blkSize := fs.sb.BlockSize()
+	inodeTableOff := fs.bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize
+	inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp))
+
+	if err := readFromDisk(fs.dev, int64(inodeOff), diskInode); err != nil {
+		return nil, err
+	}
+
+	// Build the inode based on its type.
+	inode := inode{
+		inodeNum:  inodeNum,
+		dev:       fs.dev,
+		blkSize:   blkSize,
+		diskInode: diskInode,
+	}
+
+	switch diskInode.Mode().FileType() {
+	case linux.ModeSymlink:
+		f, err := newSymlink(inode)
+		if err != nil {
+			return nil, err
+		}
+		return &f.inode, nil
+	case linux.ModeRegular:
+		f, err := newRegularFile(inode)
+		if err != nil {
+			return nil, err
+		}
+		return &f.inode, nil
+	case linux.ModeDirectory:
+		f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		if err != nil {
+			return nil, err
+		}
+		return &f.inode, nil
+	default:
+		// TODO(b/134676337): Return appropriate errors for sockets, pipes and devices.
+		return nil, syserror.EINVAL
+	}
+}
+
+// open creates and returns a file description for the dentry passed in.
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	switch in.impl.(type) {
+	case *regularFile:
+		var fd regularFileFD
+		fd.flags = flags
+		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		return &fd.vfsfd, nil
+	case *directory:
+		// Can't open directories writably. This check is not necessary for a read
+		// only filesystem but will be required when write is implemented.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		var fd directoryFD
+		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		fd.flags = flags
+		return &fd.vfsfd, nil
+	case *symlink:
+		if flags&linux.O_PATH == 0 {
+			// Can't open symlinks without O_PATH.
+			return nil, syserror.ELOOP
+		}
+		var fd symlinkFD
+		fd.flags = flags
+		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		return &fd.vfsfd, nil
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
+	}
+}
+
+func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+}
+
+// statTo writes the statx fields to the output parameter.
+func (in *inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+		linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME
+	stat.Blksize = uint32(in.blkSize)
+	stat.Mode = uint16(in.diskInode.Mode())
+	stat.Nlink = uint32(in.diskInode.LinksCount())
+	stat.UID = uint32(in.diskInode.UID())
+	stat.GID = uint32(in.diskInode.GID())
+	stat.Ino = uint64(in.inodeNum)
+	stat.Size = in.diskInode.Size()
+	stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
+	stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
+	stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+	// TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
+	// (including metadata blocks) required to represent this file.
+}
+
+// getBGNum returns the block group number that a given inode belongs to.
+func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
+	return (inodeNum - 1) / inodesPerGrp
+}
+
+// getBGOff returns the offset at which the given inode lives in the block
+// group's inode table, i.e. the index of the inode in the inode table.
+func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 {
+	return (inodeNum - 1) % inodesPerGrp
+}
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
new file mode 100644
index 000000000..ffc76ba5b
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -0,0 +1,159 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"io"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// regularFile represents a regular file's inode. This too follows the
+// inheritance pattern prevalent in the vfs layer described in
+// pkg/sentry/vfs/README.md.
+type regularFile struct {
+	inode inode
+
+	// This is immutable. The first field of fileReader implementations must be
+	// regularFile to ensure memory locality.
+	// io.ReaderAt is more strict than io.Reader in the sense that a partial read
+	// is always accompanied by an error. If a read spans past the end of file, a
+	// partial read (within file range) is done and io.EOF is returned.
+	impl io.ReaderAt
+}
+
+// newRegularFile is the regularFile constructor. It figures out what kind of
+// file this is and initializes the fileReader.
+func newRegularFile(inode inode) (*regularFile, error) {
+	regFile := regularFile{
+		inode: inode,
+	}
+
+	inodeFlags := inode.diskInode.Flags()
+
+	if inodeFlags.Extents {
+		file, err := newExtentFile(regFile)
+		if err != nil {
+			return nil, err
+		}
+
+		file.regFile.inode.impl = &file.regFile
+		return &file.regFile, nil
+	}
+
+	file, err := newBlockMapFile(regFile)
+	if err != nil {
+		return nil, err
+	}
+	file.regFile.inode.impl = &file.regFile
+	return &file.regFile, nil
+}
+
+func (in *inode) isRegular() bool {
+	_, ok := in.impl.(*regularFile)
+	return ok
+}
+
+// regularFileFD represents a regular file description. It implements
+// vfs.FileDescriptionImpl.
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	off int64
+
+	// offMu serializes operations that may mutate off.
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	safeReader := safemem.FromIOReaderAt{
+		ReaderAt: fd.inode().impl.(*regularFile).impl,
+		Offset:   offset,
+	}
+
+	// Copies data from disk directly into usermem without any intermediate
+	// allocations (if dst is converted into BlockSeq such that it does not need
+	// safe copying).
+	return dst.CopyOutFrom(ctx, safeReader)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read; offMu guards the offset throughout.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// write(2) specifies that EBADF must be returned if the fd is not open for
+	// writing.
+	return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write; offMu guards the offset throughout.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as specified.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		offset += int64(fd.inode().diskInode.Size())
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	// TODO(b/134676337): Implement mmap(2).
+	return syserror.ENODEV
+}
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
new file mode 100644
index 000000000..e06548a98
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -0,0 +1,111 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// symlink represents a symlink inode.
+type symlink struct {
+	inode  inode  // held by value; newSymlink sets its impl to the containing symlink
+	target string // immutable
+}
+
+// newSymlink builds a symlink from inode, loading the link target from
+// wherever the on-disk format stored it.
+func newSymlink(inode inode) (*symlink, error) {
+	// Targets shorter than 60 bytes are stored inline in inode.Data(); longer
+	// ones are stored using either extents or block maps.
+	targetLen := inode.diskInode.Size()
+	var target []byte
+	if targetLen < 60 {
+		target = inode.diskInode.Data()[:targetLen]
+	} else {
+		// Read the target through a regular file built on this inode.
+		reg, err := newRegularFile(inode)
+		if err != nil {
+			return nil, err
+		}
+
+		target = make([]byte, targetLen)
+		n, err := reg.impl.ReadAt(target, 0)
+		if uint64(n) < targetLen {
+			// A short read aborts construction with ReadAt's error.
+			return nil, err
+		}
+	}
+
+	link := &symlink{inode: inode, target: string(target)}
+	link.inode.impl = link
+	return link, nil
+}
+
+func (in *inode) isSymlink() bool {
+	_, ok := in.impl.(*symlink) // ok is true only when impl's dynamic type is *symlink
+	return ok
+}
+
+// symlinkFD represents a symlink file description and implements
+// vfs.FileDescriptionImpl. It may only be used if the open options contain
+// O_PATH. For this reason most of the functions return EBADF.
+type symlinkFD struct {
+	fileDescription
+}
+
+// Compile-time check that *symlinkFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
+func (fd *symlinkFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead. It always fails with EBADF.
+func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read. It always fails with EBADF.
+func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite. It always fails with EBADF.
+func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write. It always fails with EBADF.
+func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents by returning ENOTDIR.
+func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. It always fails with EBADF.
+func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	return syserror.EBADF
+}
diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index 3472c5fa8..d8b728f8c 100644
--- a/pkg/sentry/fs/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -15,38 +15,30 @@
 package ext
 
 import (
-	"encoding/binary"
 	"io"
 
-	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // readFromDisk performs a binary read from disk into the given struct from
 // the absolute offset provided.
-//
-// All disk reads should use this helper so we avoid reading from stale
-// previously used offsets. This function forces the offset parameter.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
-	if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
-		return syserror.EIO
-	}
-
-	if err := binary.Read(dev, binary.LittleEndian, v); err != nil {
+func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
+	n := binary.Size(v)
+	buf := make([]byte, n)
+	if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
 		return syserror.EIO
 	}
 
+	binary.Unmarshal(buf, binary.LittleEndian, v)
 	return nil
 }
 
 // readSuperBlock reads the SuperBlock from block group 0 in the underlying
 // device. There are three versions of the superblock. This function identifies
 // and returns the correct version.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
+func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) {
 	var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
 	if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
 		return nil, err
@@ -76,19 +68,12 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {
 	blocksPerGroup := uint64(sb.BlocksPerGroup())
 
 	// Round up the result. float64 can compromise precision so do it manually.
-	bgCount := blocksCount / blocksPerGroup
-	if blocksCount%blocksPerGroup != 0 {
-		bgCount++
-	}
-
-	return bgCount
+	return (blocksCount + blocksPerGroup - 1) / blocksPerGroup
 }
 
 // readBlockGroups reads the block group descriptor table from block group 0 in
 // the underlying device.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
+func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
 	bgCount := blockGroupsCount(sb)
 	bgdSize := uint64(sb.BgDescSize())
 	is64Bit := sb.IncompatibleFeatures().Is64Bit
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index d5d4f68df..d2450e810 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -11,8 +11,8 @@ go_template_instance(
     prefix = "dentry",
     template = "//pkg/ilist:generic_list",
     types = {
-        "Element": "*Dentry",
-        "Linker": "*Dentry",
+        "Element": "*dentry",
+        "Linker": "*dentry",
     },
 )
 
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
index b0c3ea39a..c52dc781c 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -23,23 +23,23 @@ import (
 )
 
 type directory struct {
-	inode Inode
+	inode inode
 
 	// childList is a list containing (1) child Dentries and (2) fake Dentries
 	// (with inode == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is protected by Filesystem.mu.
+	// efficiently. childList is protected by filesystem.mu.
 	childList dentryList
 }
 
-func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode uint16) *inode {
 	dir := &directory{}
 	dir.inode.init(dir, fs, creds, mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
 	return &dir.inode
 }
 
-func (i *Inode) isDir() bool {
+func (i *inode) isDir() bool {
 	_, ok := i.impl.(*directory)
 	return ok
 }
@@ -48,8 +48,8 @@ type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 
-	// Protected by Filesystem.mu.
-	iter *Dentry
+	// Protected by filesystem.mu.
+	iter *dentry
 	off  int64
 }
 
@@ -68,7 +68,7 @@ func (fd *directoryFD) Release() {
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
 	fs := fd.filesystem()
-	d := fd.vfsfd.VirtualDentry().Dentry()
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
 
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
@@ -77,7 +77,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		if !cb.Handle(vfs.Dirent{
 			Name: ".",
 			Type: linux.DT_DIR,
-			Ino:  d.Impl().(*Dentry).inode.ino,
+			Ino:  vfsd.Impl().(*dentry).inode.ino,
 			Off:  0,
 		}) {
 			return nil
@@ -85,7 +85,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		fd.off++
 	}
 	if fd.off == 1 {
-		parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
 		if !cb.Handle(vfs.Dirent{
 			Name: "..",
 			Type: parentInode.direntType(),
@@ -97,12 +97,12 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		fd.off++
 	}
 
-	dir := d.Impl().(*Dentry).inode.impl.(*directory)
-	var child *Dentry
+	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+	var child *dentry
 	if fd.iter == nil {
 		// Start iteration at the beginning of dir.
 		child = dir.childList.Front()
-		fd.iter = &Dentry{}
+		fd.iter = &dentry{}
 	} else {
 		// Continue iteration from where we left off.
 		child = fd.iter.Next()
@@ -130,32 +130,41 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	if whence != linux.SEEK_SET {
-		// TODO: Linux also allows SEEK_CUR.
+	fs := fd.filesystem()
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
 		return 0, syserror.EINVAL
 	}
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
 
+	// If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+	// seek even if doing so might reposition the iterator due to concurrent
+	// mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+	if fd.off == offset {
+		return offset, nil
+	}
+
 	fd.off = offset
 	// Compensate for "." and "..".
-	var remChildren int64
-	if offset < 2 {
-		remChildren = 0
-	} else {
+	remChildren := int64(0)
+	if offset >= 2 {
 		remChildren = offset - 2
 	}
 
-	fs := fd.filesystem()
 	dir := fd.inode().impl.(*directory)
 
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-
 	// Ensure that fd.iter exists and is not linked into dir.childList.
 	if fd.iter == nil {
-		fd.iter = &Dentry{}
+		fd.iter = &dentry{}
 	} else {
 		dir.childList.Remove(fd.iter)
 	}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 4d989eeaf..f79e2d9c8 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -28,9 +28,9 @@ import (
 //
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
 	if !inode.isDir() {
 		return nil, nil, syserror.ENOTDIR
 	}
@@ -47,7 +47,7 @@ afterSymlink:
 		// not in the Dentry tree, it doesn't exist.
 		return nil, nil, syserror.ENOENT
 	}
-	nextInode := nextVFSD.Impl().(*Dentry).inode
+	nextInode := nextVFSD.Impl().(*dentry).inode
 	if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// TODO: symlink traversals update access time
 		if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -64,10 +64,10 @@ afterSymlink:
 // walkExistingLocked is loosely analogous to Linux's
 // fs/namei.c:path_lookupat().
 //
-// Preconditions: Filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
-	inode := vfsd.Impl().(*Dentry).inode
+	inode := vfsd.Impl().(*dentry).inode
 	for !rp.Done() {
 		var err error
 		vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -88,10 +88,10 @@ func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: Filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
-	inode := vfsd.Impl().(*Dentry).inode
+	inode := vfsd.Impl().(*dentry).inode
 	for !rp.Final() {
 		var err error
 		vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -108,9 +108,9 @@ func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
 // checkCreateLocked checks that a file named rp.Component() may be created in
 // directory parentVFSD, then returns rp.Component().
 //
-// Preconditions: Filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+// Preconditions: filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
 	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
 		return "", err
 	}
@@ -144,7 +144,7 @@ func checkDeleteLocked(vfsd *vfs.Dentry) error {
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	vfsd, inode, err := walkExistingLocked(rp)
@@ -164,7 +164,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 }
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
 	if rp.Done() {
 		return syserror.EEXIST
 	}
@@ -185,7 +185,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	d := vd.Dentry().Impl().(*Dentry)
+	d := vd.Dentry().Impl().(*dentry)
 	if d.inode.isDir() {
 		return syserror.EPERM
 	}
@@ -197,7 +197,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	if rp.Done() {
 		return syserror.EEXIST
 	}
@@ -223,7 +223,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 }
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
 	if rp.Done() {
 		return syserror.EEXIST
 	}
@@ -246,7 +246,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 }
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	// Filter out flags that are not supported by memfs. O_DIRECTORY and
 	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
 	// appropriate bits in rp), but are returned by
@@ -265,11 +265,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 
 	mustCreate := opts.Flags&linux.O_EXCL != 0
 	vfsd := rp.Start()
-	inode := vfsd.Impl().(*Dentry).inode
+	inode := vfsd.Impl().(*dentry).inode
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 	if rp.Done() {
-		// FIXME: ???
 		if rp.MustBeDir() {
 			return nil, syserror.EISDIR
 		}
@@ -327,7 +326,7 @@ afterTrailingSymlink:
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	childInode := childVFSD.Impl().(*Dentry).inode
+	childInode := childVFSD.Impl().(*dentry).inode
 	if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// TODO: symlink traversals update access time
 		if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -340,7 +339,7 @@ afterTrailingSymlink:
 	return childInode.open(rp, childVFSD, opts.Flags, false)
 }
 
-func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(flags)
 	if !afterCreate {
 		if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
@@ -385,7 +384,7 @@ func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
 	fs.mu.RLock()
 	_, inode, err := walkExistingLocked(rp)
 	fs.mu.RUnlock()
@@ -400,9 +399,8 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
 	if rp.Done() {
-		// FIXME
 		return syserror.ENOENT
 	}
 	fs.mu.Lock()
@@ -424,7 +422,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vf
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 	vfsd, inode, err := walkExistingLocked(rp)
@@ -447,12 +445,14 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
 		return err
 	}
+	// Remove from parent directory's childList.
+	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
 	inode.decRef()
 	return nil
 }
 
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
 	_, _, err := walkExistingLocked(rp)
 	fs.mu.RUnlock()
@@ -462,12 +462,12 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	// TODO: implement Inode.setStat
+	// TODO: implement inode.setStat
 	return syserror.EPERM
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	fs.mu.RLock()
 	_, inode, err := walkExistingLocked(rp)
 	fs.mu.RUnlock()
@@ -480,7 +480,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
 	_, _, err := walkExistingLocked(rp)
 	fs.mu.RUnlock()
@@ -492,7 +492,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
 	if rp.Done() {
 		return syserror.EEXIST
 	}
@@ -517,7 +517,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 	vfsd, inode, err := walkExistingLocked(rp)
@@ -537,6 +537,8 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
 		return err
 	}
+	// Remove from parent directory's childList.
+	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
 	inode.decLinksLocked()
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index f381e1a88..45cd42b3e 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -21,10 +21,10 @@
 //
 // Lock order:
 //
-// Filesystem.mu
+// filesystem.mu
 //   regularFileFD.offMu
 //     regularFile.mu
-//   Inode.mu
+//   inode.mu
 package memfs
 
 import (
@@ -42,8 +42,8 @@ import (
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
-// Filesystem implements vfs.FilesystemImpl.
-type Filesystem struct {
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
 	vfsfs vfs.Filesystem
 
 	// mu serializes changes to the Dentry tree.
@@ -54,44 +54,44 @@ type Filesystem struct {
 
 // NewFilesystem implements vfs.FilesystemType.NewFilesystem.
 func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	var fs Filesystem
+	var fs filesystem
 	fs.vfsfs.Init(&fs)
 	root := fs.newDentry(fs.newDirectory(creds, 01777))
 	return &fs.vfsfs, &root.vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
-func (fs *Filesystem) Release() {
+func (fs *filesystem) Release() {
 }
 
 // Sync implements vfs.FilesystemImpl.Sync.
-func (fs *Filesystem) Sync(ctx context.Context) error {
+func (fs *filesystem) Sync(ctx context.Context) error {
 	// All filesystem state is in-memory.
 	return nil
 }
 
-// Dentry implements vfs.DentryImpl.
-type Dentry struct {
+// dentry implements vfs.DentryImpl.
+type dentry struct {
 	vfsd vfs.Dentry
 
-	// inode is the inode represented by this Dentry. Multiple Dentries may
-	// share a single non-directory Inode (with hard links). inode is
+	// inode is the inode represented by this dentry. Multiple dentries may
+	// share a single non-directory inode (with hard links). inode is
 	// immutable.
-	inode *Inode
+	inode *inode
 
-	// memfs doesn't count references on Dentries; because the Dentry tree is
+	// memfs doesn't count references on dentries; because the dentry tree is
 	// the sole source of truth, it is by definition always consistent with the
-	// state of the filesystem. However, it does count references on Inodes,
-	// because Inode resources are released when all references are dropped.
+	// state of the filesystem. However, it does count references on inodes,
+	// because inode resources are released when all references are dropped.
 	// (memfs doesn't really have resources to release, but we implement
 	// reference counting because tmpfs regular files will.)
 
-	// dentryEntry (ugh) links Dentries into their parent directory.childList.
+	// dentryEntry (ugh) links dentries into their parent directory.childList.
 	dentryEntry
 }
 
-func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
-	d := &Dentry{
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+	d := &dentry{
 		inode: inode,
 	}
 	d.vfsd.Init(d)
@@ -99,37 +99,37 @@ func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
-func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
 	d.inode.incRef()
 }
 
 // TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
 	return d.inode.tryIncRef()
 }
 
 // DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
 	d.inode.decRef()
 }
 
-// Inode represents a filesystem object.
-type Inode struct {
+// inode represents a filesystem object.
+type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory
 	// operations.
 	//
-	// A reference is held on all Inodes that are reachable in the filesystem
+	// A reference is held on all inodes that are reachable in the filesystem
 	// tree. For non-directories (which may have multiple hard links), this
 	// means that a reference is dropped when nlink reaches 0. For directories,
 	// nlink never reaches 0 due to the "." entry; instead,
-	// Filesystem.RmdirAt() drops the reference.
+	// filesystem.RmdirAt() drops the reference.
 	refs int64
 
 	// Inode metadata; protected by mu and accessed using atomic memory
 	// operations unless otherwise specified.
 	mu    sync.RWMutex
 	mode  uint32 // excluding file type bits, which are based on impl
-	nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+	nlink uint32 // protected by filesystem.mu instead of inode.mu
 	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
 	gid   uint32 // auth.KGID, but ...
 	ino   uint64 // immutable
@@ -137,7 +137,7 @@ type Inode struct {
 	impl interface{} // immutable
 }
 
-func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode uint16) {
 	i.refs = 1
 	i.mode = uint32(mode)
 	i.uid = uint32(creds.EffectiveKUID)
@@ -147,29 +147,29 @@ func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials,
 	i.impl = impl
 }
 
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) incLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) incLinksLocked() {
 	if atomic.AddUint32(&i.nlink, 1) <= 1 {
-		panic("memfs.Inode.incLinksLocked() called with no existing links")
+		panic("memfs.inode.incLinksLocked() called with no existing links")
 	}
 }
 
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) decLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) decLinksLocked() {
 	if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
 		i.decRef()
 	} else if nlink == ^uint32(0) { // negative overflow
-		panic("memfs.Inode.decLinksLocked() called with no existing links")
+		panic("memfs.inode.decLinksLocked() called with no existing links")
 	}
 }
 
-func (i *Inode) incRef() {
+func (i *inode) incRef() {
 	if atomic.AddInt64(&i.refs, 1) <= 1 {
-		panic("memfs.Inode.incRef() called without holding a reference")
+		panic("memfs.inode.incRef() called without holding a reference")
 	}
 }
 
-func (i *Inode) tryIncRef() bool {
+func (i *inode) tryIncRef() bool {
 	for {
 		refs := atomic.LoadInt64(&i.refs)
 		if refs == 0 {
@@ -181,7 +181,7 @@ func (i *Inode) tryIncRef() bool {
 	}
 }
 
-func (i *Inode) decRef() {
+func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
 		// This is unnecessary; it's mostly to simulate what tmpfs would do.
 		if regfile, ok := i.impl.(*regularFile); ok {
@@ -191,18 +191,18 @@ func (i *Inode) decRef() {
 			regfile.mu.Unlock()
 		}
 	} else if refs < 0 {
-		panic("memfs.Inode.decRef() called without holding a reference")
+		panic("memfs.inode.decRef() called without holding a reference")
 	}
 }
 
-func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
 	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
 }
 
 // Go won't inline this function, and returning linux.Statx (which is quite
 // big) means spending a lot of time in runtime.duffcopy(), so instead it's an
 // output parameter.
-func (i *Inode) statTo(stat *linux.Statx) {
+func (i *inode) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
 	stat.Blksize = 1 // usermem.PageSize in tmpfs
 	stat.Nlink = atomic.LoadUint32(&i.nlink)
@@ -241,7 +241,7 @@ func allocatedBlocksForSize(size uint64) uint64 {
 	return (size + 511) / 512
 }
 
-func (i *Inode) direntType() uint8 {
+func (i *inode) direntType() uint8 {
 	switch i.impl.(type) {
 	case *regularFile:
 		return linux.DT_REG
@@ -258,16 +258,17 @@ func (i *Inode) direntType() uint8 {
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
 	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
 
 	flags uint32 // status flags; immutable
 }
 
-func (fd *fileDescription) filesystem() *Filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem)
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
 }
 
-func (fd *fileDescription) inode() *Inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+func (fd *fileDescription) inode() *inode {
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
 }
 
 // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
@@ -294,6 +295,6 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	// TODO: implement Inode.setStat
+	// TODO: implement inode.setStat
 	return syserror.EPERM
 }
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
index 4a3603cc8..55f869798 100644
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -28,16 +28,16 @@ import (
 )
 
 type regularFile struct {
-	inode Inode
+	inode inode
 
 	mu   sync.RWMutex
 	data []byte
 	// dataLen is len(data), but accessed using atomic memory operations to
-	// avoid locking in Inode.stat().
+	// avoid locking in inode.stat().
 	dataLen int64
 }
 
-func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *inode {
 	file := &regularFile{}
 	file.inode.init(file, fs, creds, mode)
 	file.inode.nlink = 1 // from parent directory
@@ -46,7 +46,6 @@ func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inod
 
 type regularFileFD struct {
 	fileDescription
-	vfs.FileDescriptionDefaultImpl
 
 	// These are immutable.
 	readable bool
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
index e002d1727..b2ac2cbeb 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -19,11 +19,11 @@ import (
 )
 
 type symlink struct {
-	inode  Inode
+	inode  inode
 	target string // immutable
 }
 
-func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
 	link := &symlink{
 		target: target,
 	}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
new file mode 100644
index 000000000..3d8a4deaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -0,0 +1,49 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "proc",
+    srcs = [
+        "filesystems.go",
+        "loadavg.go",
+        "meminfo.go",
+        "mounts.go",
+        "net.go",
+        "proc.go",
+        "stat.go",
+        "sys.go",
+        "task.go",
+        "version.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/log",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+    ],
+)
+
+go_test(
+    name = "proc_test",
+    size = "small",
+    srcs = ["net_test.go"],
+    embed = [":proc"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/inet",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go
new file mode 100644
index 000000000..c36c4aff5
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/filesystems.go
@@ -0,0 +1,25 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems.
+//
+// +stateify savable
+type filesystemsData struct{}
+
+// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for
+// filesystemsData. We would need to retrieve filesystem names from
+// vfs.VirtualFilesystem. Also needs vfs replacement for
+// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
new file mode 100644
index 000000000..9135afef1
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/loadavg.go
@@ -0,0 +1,40 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// loadavgData backs /proc/loadavg.
+//
+// +stateify savable
+type loadavgData struct{}
+
+var _ vfs.DynamicBytesSource = (*loadavgData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/62345059): Include real data in fields.
+	// Column 1-3: CPU and IO utilization of the last 1, 5, and 15 minute periods.
+	// Column 4-5: currently running processes and the total number of processes.
+	// Column 6: the last process ID used.
+	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
new file mode 100644
index 000000000..9a827cd66
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/meminfo.go
@@ -0,0 +1,77 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
+//
+// +stateify savable
+type meminfoData struct {
+	// k is the owning Kernel.
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*meminfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	mf := d.k.MemoryFile()
+	mf.UpdateUsage()
+	snapshot, totalUsage := usage.MemoryAccounting.Copy()
+	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
+	anon := snapshot.Anonymous + snapshot.Tmpfs
+	file := snapshot.PageCache + snapshot.Mapped
+	// We don't actually have active/inactive LRUs, so just make up numbers.
+	activeFile := (file / 2) &^ (usermem.PageSize - 1)
+	inactiveFile := file - activeFile
+
+	fmt.Fprintf(buf, "MemTotal:       %8d kB\n", totalSize/1024)
+	memFree := (totalSize - totalUsage) / 1024
+	// We use MemFree as MemAvailable because we don't swap.
+	// TODO(rahat): When reclaim is implemented the value of MemAvailable
+	// should change.
+	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree)
+	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree)
+	fmt.Fprintf(buf, "Buffers:               0 kB\n") // memory usage by block devices
+	fmt.Fprintf(buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
+	// Emulate a system with no swap, which disables inactivation of anon pages.
+	fmt.Fprintf(buf, "SwapCache:             0 kB\n")
+	fmt.Fprintf(buf, "Active:         %8d kB\n", (anon+activeFile)/1024)
+	fmt.Fprintf(buf, "Inactive:       %8d kB\n", inactiveFile/1024)
+	fmt.Fprintf(buf, "Active(anon):   %8d kB\n", anon/1024)
+	fmt.Fprintf(buf, "Inactive(anon):        0 kB\n")
+	fmt.Fprintf(buf, "Active(file):   %8d kB\n", activeFile/1024)
+	fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
+	fmt.Fprintf(buf, "Unevictable:           0 kB\n") // TODO(b/31823263)
+	fmt.Fprintf(buf, "Mlocked:               0 kB\n") // TODO(b/31823263)
+	fmt.Fprintf(buf, "SwapTotal:             0 kB\n")
+	fmt.Fprintf(buf, "SwapFree:              0 kB\n")
+	fmt.Fprintf(buf, "Dirty:                 0 kB\n")
+	fmt.Fprintf(buf, "Writeback:             0 kB\n")
+	fmt.Fprintf(buf, "AnonPages:      %8d kB\n", anon/1024)
+	fmt.Fprintf(buf, "Mapped:         %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
+	fmt.Fprintf(buf, "Shmem:          %8d kB\n", snapshot.Tmpfs/1024)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go
new file mode 100644
index 000000000..e81b1e910
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/mounts.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import "gvisor.dev/gvisor/pkg/sentry/kernel"
+
+// TODO(b/138862512): Implement mountInfoFile and mountsFile.
+
+// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo.
+//
+// +stateify savable
+type mountInfoFile struct {
+	t *kernel.Task
+}
+
+// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts.
+//
+// +stateify savable
+type mountsFile struct {
+	t *kernel.Task
+}
diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go
new file mode 100644
index 000000000..fd46eebf8
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/net.go
@@ -0,0 +1,338 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
+//
+// +stateify savable
+type ifinet6 struct {
+	s inet.Stack
+}
+
+var _ vfs.DynamicBytesSource = (*ifinet6)(nil)
+
+func (n *ifinet6) contents() []string {
+	var lines []string
+	nics := n.s.Interfaces()
+	for id, naddrs := range n.s.InterfaceAddrs() {
+		nic, ok := nics[id]
+		if !ok {
+			// NIC was added after Interfaces was called. We'll just
+			// ignore it.
+			continue
+		}
+
+		for _, a := range naddrs {
+			// IPv6 only.
+			if a.Family != linux.AF_INET6 {
+				continue
+			}
+
+			// Fields:
+			// IPv6 address displayed in 32 hexadecimal chars without colons
+			// Netlink device number (interface index) in hexadecimal (use nic id)
+			// Prefix length in hexadecimal
+			// Scope value (use 0)
+			// Interface flags
+			// Device name
+			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
+		}
+	}
+	return lines
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	for _, l := range n.contents() {
+		buf.WriteString(l)
+	}
+	return nil
+}
+
+// netDev implements vfs.DynamicBytesSource for /proc/net/dev.
+//
+// +stateify savable
+type netDev struct {
+	s inet.Stack
+}
+
+var _ vfs.DynamicBytesSource = (*netDev)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	interfaces := n.s.Interfaces()
+	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
+	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
+
+	for _, i := range interfaces {
+		// Implements the same format as
+		// net/core/net-procfs.c:dev_seq_printf_stats.
+		var stats inet.StatDev
+		if err := n.s.Statistics(&stats, i.Name); err != nil {
+			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
+			continue
+		}
+		fmt.Fprintf(
+			buf,
+			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
+			i.Name,
+			// Received
+			stats[0], // bytes
+			stats[1], // packets
+			stats[2], // errors
+			stats[3], // dropped
+			stats[4], // fifo
+			stats[5], // frame
+			stats[6], // compressed
+			stats[7], // multicast
+			// Transmitted
+			stats[8],  // bytes
+			stats[9],  // packets
+			stats[10], // errors
+			stats[11], // dropped
+			stats[12], // fifo
+			stats[13], // colls
+			stats[14], // carrier
+			stats[15], // compressed
+		)
+	}
+
+	return nil
+}
+
+// netUnix implements vfs.DynamicBytesSource for /proc/net/unix.
+//
+// +stateify savable
+type netUnix struct {
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*netUnix)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
+	for _, se := range n.k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+			s.DecRef()
+			// Not a unix socket.
+			continue
+		}
+		sops := sfile.FileOperations.(*unix.SocketOperations)
+
+		addr, err := sops.Endpoint().GetLocalAddress()
+		if err != nil {
+			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+			addr.Addr = "<unknown>"
+		}
+
+		sockFlags := 0
+		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
+			if ce.Listening() {
+				// For unix domain sockets, linux reports a single flag
+				// value if the socket is listening, of __SO_ACCEPTCON.
+				sockFlags = linux.SO_ACCEPTCON
+			}
+		}
+
+		// In the socket entry below, the value for the 'Num' field requires
+		// some consideration. Linux prints the address to the struct
+		// unix_sock representing a socket in the kernel, but may redact the
+		// value for unprivileged users depending on the kptr_restrict
+		// sysctl.
+		//
+		// One use for this field is to allow a privileged user to
+		// introspect into the kernel memory to determine information about
+		// a socket not available through procfs, such as the socket's peer.
+		//
+		// In gvisor, returning a pointer to our internal structures would
+		// be pointless, as it wouldn't match the memory layout for struct
+		// unix_sock, making introspection difficult. We could populate a
+		// struct unix_sock with the appropriate data, but even that
+		// requires consideration for which kernel version to emulate, as
+		// the definition of this struct changes over time.
+		//
+		// For now, we always redact this pointer.
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
+			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
+			0,                             // Protocol, always 0 for UDS.
+			sockFlags,                     // Flags.
+			sops.Endpoint().Type(),        // Type.
+			sops.State(),                  // State.
+			sfile.InodeID(),               // Inode.
+		)
+
+		// Path
+		if len(addr.Addr) != 0 {
+			if addr.Addr[0] == 0 {
+				// Abstract path.
+				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
+			} else {
+				fmt.Fprintf(buf, " %s", string(addr.Addr))
+			}
+		}
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCP struct {
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*netTCP)(nil)
+
+func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	t := kernel.TaskFromContext(ctx)
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
+	for _, se := range n.k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
+			s.DecRef()
+			// Not tcp4 sockets.
+			continue
+		}
+
+		// Linux's documentation for the fields below can be found at
+		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+		// Note that the header doesn't contain labels for all the fields.
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%4d: ", se.ID)
+
+		portBuf := make([]byte, 2)
+
+		// Field: local_address.
+		var localAddr linux.SockAddrInet
+		if local, _, err := sops.GetSockName(t); err == nil {
+			localAddr = *local.(*linux.SockAddrInet)
+		}
+		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
+		fmt.Fprintf(buf, "%08X:%04X ",
+			binary.LittleEndian.Uint32(localAddr.Addr[:]),
+			portBuf)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if remote, _, err := sops.GetPeerName(t); err == nil {
+			remoteAddr = *remote.(*linux.SockAddrInet)
+		}
+		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
+		fmt.Fprintf(buf, "%08X:%04X ",
+			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
+			portBuf)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when; timer active state and number of jiffies
+		// until timer expires. Unimplemented.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt; number of unrecovered RTO timeouts.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+		}
+
+		// Field: timeout; number of unanswered 0-window probes.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: refcount. Don't count the ref we obtain while dereferencing
+		// the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.Generate.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: retransmit timeout. Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: predicted tick of soft clock (delayed ACK control data).
+		// Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: sending congestion window, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+		// Unimplemented, report as large threshold.
+		fmt.Fprintf(buf, "%d", -1)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/net_test.go
new file mode 100644
index 000000000..20a77a8ca
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/net_test.go
@@ -0,0 +1,78 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+)
+
+func newIPv6TestStack() *inet.TestStack {
+	s := inet.NewTestStack()
+	s.SupportsIPv6Flag = true
+	return s
+}
+
+func TestIfinet6NoAddresses(t *testing.T) {
+	n := &ifinet6{s: newIPv6TestStack()}
+	var buf bytes.Buffer
+	n.Generate(contexttest.Context(t), &buf)
+	if buf.Len() > 0 {
+		t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{})
+	}
+}
+
+func TestIfinet6(t *testing.T) {
+	s := newIPv6TestStack()
+	s.InterfacesMap[1] = inet.Interface{Name: "eth0"}
+	s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{
+		{
+			Family:    linux.AF_INET6,
+			PrefixLen: 128,
+			Addr:      []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"),
+		},
+	}
+	s.InterfacesMap[2] = inet.Interface{Name: "eth1"}
+	s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{
+		{
+			Family:    linux.AF_INET6,
+			PrefixLen: 128,
+			Addr:      []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
+		},
+	}
+	want := map[string]struct{}{
+		"000102030405060708090a0b0c0d0e0f 01 80 00 00     eth0\n": {},
+		"101112131415161718191a1b1c1d1e1f 02 80 00 00     eth1\n": {},
+	}
+
+	n := &ifinet6{s: s}
+	contents := n.contents()
+	if len(contents) != len(want) {
+		t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want))
+	}
+	got := map[string]struct{}{}
+	for _, l := range contents {
+		got[l] = struct{}{}
+	}
+
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("Got n.contents() = %v, want = %v", got, want)
+	}
+}
diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/proc/proc.go
new file mode 100644
index 000000000..31dec36de
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/proc.go
@@ -0,0 +1,16 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
new file mode 100644
index 000000000..720db3828
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/stat.go
@@ -0,0 +1,127 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// cpuStats contains the breakdown of CPU time for /proc/stat.
+type cpuStats struct {
+	// user is time spent in userspace tasks with non-positive niceness.
+	user uint64
+
+	// nice is time spent in userspace tasks with positive niceness.
+	nice uint64
+
+	// system is time spent in non-interrupt kernel context.
+	system uint64
+
+	// idle is time spent idle.
+	idle uint64
+
+	// ioWait is time spent waiting for IO.
+	ioWait uint64
+
+	// irq is time spent in interrupt context.
+	irq uint64
+
+	// softirq is time spent in software interrupt context.
+	softirq uint64
+
+	// steal is involuntary wait time.
+	steal uint64
+
+	// guest is time spent in guests with non-positive niceness.
+	guest uint64
+
+	// guestNice is time spent in guests with positive niceness.
+	guestNice uint64
+}
+
+// String implements fmt.Stringer.
+func (c cpuStats) String() string {
+	return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
+}
+
+// statData implements vfs.DynamicBytesSource for /proc/stat.
+//
+// +stateify savable
+type statData struct {
+	// k is the owning Kernel.
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*statData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/37226836): We currently export only zero CPU stats. We could
+	// at least provide some aggregate stats.
+	var cpu cpuStats
+	fmt.Fprintf(buf, "cpu  %s\n", cpu)
+
+	for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
+		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
+	}
+
+	// The total number of interrupts is dependent on the CPUs and PCI
+	// devices on the system. See arch_probe_nr_irqs.
+	//
+	// Since we don't report real interrupt stats, just choose an arbitrary
+	// value from a representative VM.
+	const numInterrupts = 256
+
+	// The Kernel doesn't handle real interrupts, so report all zeroes.
+	// TODO(b/37226836): We could count page faults as #PF.
+	fmt.Fprintf(buf, "intr 0") // total
+	for i := 0; i < numInterrupts; i++ {
+		fmt.Fprintf(buf, " 0")
+	}
+	fmt.Fprintf(buf, "\n")
+
+	// Total number of context switches.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "ctxt 0\n")
+
+	// CLOCK_REALTIME timestamp from boot, in seconds.
+	fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
+
+	// Total number of clones.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "processes 0\n")
+
+	// Number of runnable tasks.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "procs_running 0\n")
+
+	// Number of tasks waiting on IO.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "procs_blocked 0\n")
+
+	// Number of each softirq handled.
+	fmt.Fprintf(buf, "softirq 0") // total
+	for i := 0; i < linux.NumSoftIRQ; i++ {
+		fmt.Fprintf(buf, " 0")
+	}
+	fmt.Fprintf(buf, "\n")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go
new file mode 100644
index 000000000..b88256e12
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/sys.go
@@ -0,0 +1,51 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// mmapMinAddrData implements vfs.DynamicBytesSource for
+// /proc/sys/vm/mmap_min_addr.
+//
+// +stateify savable
+type mmapMinAddrData struct {
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
+	return nil
+}
+
+// +stateify savable
+type overcommitMemory struct{}
+
+var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "0\n")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
new file mode 100644
index 000000000..c46e05c3a
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -0,0 +1,261 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// mapsCommon is embedded by mapsData and smapsData.
+type mapsCommon struct {
+	t *kernel.Task // Task whose MemoryManager is looked up by mm.
+}
+
+// mm returns the MemoryManager of the wrapped task, which may be nil. It does
+// not take an additional reference on the MemoryManager. This is safe because
+// MemoryManager.destroy is required to leave the MemoryManager in a state
+// where it's still usable as a DynamicBytesSource.
+func (md *mapsCommon) mm() *mm.MemoryManager {
+	var taskMM *mm.MemoryManager
+	md.t.WithMuLocked(func(t *kernel.Task) {
+		// A nil result needs no special case: taskMM simply stays nil.
+		taskMM = t.MemoryManager()
+	})
+	return taskMM
+}
+
+// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+//
+// +stateify savable
+type mapsData struct {
+	mapsCommon
+}
+
+var _ vfs.DynamicBytesSource = (*mapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := md.mm(); mm != nil { // Nothing is written when the task has no MemoryManager.
+		mm.ReadMapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
+//
+// +stateify savable
+type smapsData struct {
+	mapsCommon
+}
+
+var _ vfs.DynamicBytesSource = (*smapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := sd.mm(); mm != nil { // Nothing is written when the task has no MemoryManager.
+		mm.ReadSmapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// +stateify savable
+type taskStatData struct {
+	t *kernel.Task
+
+	// If tgstats is true, accumulate fault stats (not implemented) and CPU
+	// time across all tasks in t's thread group.
+	tgstats bool
+
+	// pidns is the PID namespace associated with the proc filesystem that
+	// includes the file using this statData.
+	pidns *kernel.PIDNamespace
+}
+
+var _ vfs.DynamicBytesSource = (*taskStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d " /* pid */, s.pidns.IDOfTask(s.t))
+	fmt.Fprintf(buf, "(%s) " /* comm */, s.t.Name())
+	fmt.Fprintf(buf, "%c " /* state */, s.t.StateStatus()[0])
+	ppid := kernel.ThreadID(0)
+	if parent := s.t.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "%d " /* ppid */, ppid)
+	fmt.Fprintf(buf, "%d " /* pgrp */, s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
+	fmt.Fprintf(buf, "%d " /* session */, s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
+	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
+	fmt.Fprintf(buf, "0 " /* flags */)
+	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
+	var cputime usage.CPUStats
+	if s.tgstats {
+		cputime = s.t.ThreadGroup().CPUStats()
+	} else {
+		cputime = s.t.CPUStats()
+	}
+	fmt.Fprintf(buf, "%d %d " /* utime stime */, linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
+	fmt.Fprintf(buf, "%d %d " /* cutime cstime */, linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	fmt.Fprintf(buf, "%d %d " /* priority nice */, s.t.Priority(), s.t.Niceness())
+	fmt.Fprintf(buf, "%d " /* num_threads */, s.t.ThreadGroup().Count())
+
+	// itrealvalue. Since kernel 2.6.17, this field is no longer
+	// maintained, and is hard coded as 0.
+	fmt.Fprintf(buf, "0 ")
+
+	// Start time is relative to boot time, expressed in clock ticks.
+	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+
+	var vss, rss uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+	fmt.Fprintf(buf, "%d %d " /* vsize rss (rss in pages) */, vss, rss/usermem.PageSize)
+
+	// rsslim.
+	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
+	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
+	terminationSignal := linux.Signal(0)
+	if s.t == s.t.ThreadGroup().Leader() {
+		terminationSignal = s.t.ThreadGroup().TerminationSignal()
+	}
+	fmt.Fprintf(buf, "%d " /* exit_signal */, terminationSignal)
+	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
+	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
+	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
+	fmt.Fprintf(buf, "0\n" /* exit_code */)
+
+	return nil
+}
+
+// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
+//
+// +stateify savable
+type statmData struct {
+	t *kernel.Task
+}
+
+var _ vfs.DynamicBytesSource = (*statmData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var vss, rss uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+
+	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) // size and resident in pages; the other five fields are always 0.
+	return nil
+}
+
+// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
+//
+// +stateify savable
+type statusData struct {
+	t     *kernel.Task
+	pidns *kernel.PIDNamespace
+}
+
+var _ vfs.DynamicBytesSource = (*statusData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
+	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
+	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
+	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
+	ppid := kernel.ThreadID(0) // Stays 0 when the task has no parent.
+	if parent := s.t.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
+	tpid := kernel.ThreadID(0) // Stays 0 when Tracer() returns nil.
+	if tracer := s.t.Tracer(); tracer != nil {
+		tpid = s.pidns.IDOfTask(tracer)
+	}
+	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
+	var fds int
+	var vss, rss, data uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.Size()
+		}
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+			data = mm.VirtualDataSize()
+		}
+	})
+	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
+	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) // The memory sizes are shifted right by 10 to report kB.
+	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
+	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
+	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
+	creds := s.t.Credentials()
+	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
+	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
+	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
+	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
+	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	return nil
+}
+
+// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+type ioUsage interface {
+	// IOUsage returns the io usage data.
+	IOUsage() *usage.IO
+}
+
+// +stateify savable
+type ioData struct {
+	ioUsage
+}
+
+var _ vfs.DynamicBytesSource = (*ioData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	io := usage.IO{}
+	io.Accumulate(i.IOUsage())
+	// Keys follow the /proc/[pid]/io layout in proc(5); the first is "rchar".
+	fmt.Fprintf(buf, "rchar: %d\n", io.CharsRead)
+	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
+	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
+	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
+	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
+	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
+	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
new file mode 100644
index 000000000..e1643d4e0
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/version.go
@@ -0,0 +1,68 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// versionData implements vfs.DynamicBytesSource for /proc/version.
+//
+// +stateify savable
+type versionData struct {
+	// k is the owning Kernel.
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*versionData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	init := v.k.GlobalInit()
+	if init == nil {
+		// Attempted to read before the init Task is created. This can
+		// only occur during startup, which should never need to read
+		// this file, so it is treated as a programming error.
+		panic("Attempted to read version before initial Task is available")
+	}
+
+	// /proc/version takes the form:
+	//
+	// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
+	// (COMPILER_VERSION) VERSION"
+	//
+	// where:
+	// - SYSNAME, RELEASE, and VERSION are the same as returned by
+	// sys_utsname
+	// - COMPILE_USER is the user that built the kernel
+	// - COMPILE_HOST is the hostname of the machine on which the kernel
+	// was built
+	// - COMPILER_VERSION is the version reported by the building compiler
+	//
+	// Since we don't really want to expose build information to
+	// applications, those fields are omitted.
+	//
+	// FIXME(mpratt): Using Version from the init task SyscallTable
+	// disregards the different version a task may have (e.g., in a uts
+	// namespace).
+	ver := init.Leader().SyscallTable().Version
+	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
+	return nil
+}
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 5b75a4a06..80f227dbe 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -52,12 +52,16 @@ type Stack interface {
 
 	// Statistics reports stack statistics.
 	Statistics(stat interface{}, arg string) error
+
+	// RouteTable returns the network stack's route table.
+	RouteTable() []Route
+
+	// Resume restarts the network stack after restore.
+	Resume()
 }
 
 // Interface contains information about a network interface.
 type Interface struct {
-	// Keep these fields sorted in the order they appear in rtnetlink(7).
-
 	// DeviceType is the device type, a Linux ARPHRD_* constant.
 	DeviceType uint16
 
@@ -77,8 +81,6 @@ type Interface struct {
 
 // InterfaceAddr contains information about a network interface address.
 type InterfaceAddr struct {
-	// Keep these fields sorted in the order they appear in rtnetlink(7).
-
 	// Family is the address family, a Linux AF_* constant.
 	Family uint8
 
@@ -109,3 +111,45 @@ type TCPBufferSize struct {
 // StatDev describes one line of /proc/net/dev, i.e., stats for one network
 // interface.
 type StatDev [16]uint64
+
+// Route contains information about a network route.
+type Route struct {
+	// Family is the address family, a Linux AF_* constant.
+	Family uint8
+
+	// DstLen is the length of the destination address.
+	DstLen uint8
+
+	// SrcLen is the length of the source address.
+	SrcLen uint8
+
+	// TOS is the Type of Service filter.
+	TOS uint8
+
+	// Table is the routing table ID.
+	Table uint8
+
+	// Protocol is the route origin, a Linux RTPROT_* constant.
+	Protocol uint8
+
+	// Scope is the distance to destination, a Linux RT_SCOPE_* constant.
+	Scope uint8
+
+	// Type is the route type, a Linux RTN_* constant.
+	Type uint8
+
+	// Flags are route flags. See rtnetlink(7) under "rtm_flags".
+	Flags uint32
+
+	// DstAddr is the route destination address (RTA_DST).
+	DstAddr []byte
+
+	// SrcAddr is the route source address (RTA_SRC).
+	SrcAddr []byte
+
+	// OutputInterface is the output interface index (RTA_OIF).
+	OutputInterface int32
+
+	// GatewayAddr is the route gateway address (RTA_GATEWAY).
+	GatewayAddr []byte
+}
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index 75f9e7a77..b9eed7c3a 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -18,6 +18,7 @@ package inet
 type TestStack struct {
 	InterfacesMap     map[int32]Interface
 	InterfaceAddrsMap map[int32][]InterfaceAddr
+	RouteList         []Route
 	SupportsIPv6Flag  bool
 	TCPRecvBufSize    TCPBufferSize
 	TCPSendBufSize    TCPBufferSize
@@ -86,3 +87,12 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error {
 func (s *TestStack) Statistics(stat interface{}, arg string) error {
 	return nil
 }
+
+// RouteTable implements Stack.RouteTable by returning s.RouteList uncopied.
+func (s *TestStack) RouteTable() []Route {
+	return s.RouteList
+}
+
+// Resume implements Stack.Resume. It is a no-op for TestStack.
+func (s *TestStack) Resume() {
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e61d39c82..41bee9a22 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -144,6 +144,7 @@ go_library(
         "threads.go",
         "timekeeper.go",
         "timekeeper_state.go",
+        "tty.go",
         "uts_namespace.go",
         "vdso.go",
         "version.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 4c2d48e65..8c1f79ab5 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -112,11 +112,6 @@ type Kernel struct {
 	rootIPCNamespace            *IPCNamespace
 	rootAbstractSocketNamespace *AbstractSocketNamespace
 
-	// mounts holds the state of the virtual filesystem. mounts is initially
-	// nil, and must be set by calling Kernel.SetRootMountNamespace before
-	// Kernel.CreateProcess can succeed.
-	mounts *fs.MountNamespace
-
 	// futexes is the "root" futex.Manager, from which all others are forked.
 	// This is necessary to ensure that shared futexes are coherent across all
 	// tasks, including those created by CreateProcess.
@@ -197,6 +192,15 @@ type Kernel struct {
 	// caches. Not all caches use it, only the caches that use host resources use
 	// the limiter. It may be nil if disabled.
 	DirentCacheLimiter *fs.DirentCacheLimiter
+
+	// unimplementedSyscallEmitterOnce is used in the initialization of
+	// unimplementedSyscallEmitter.
+	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`
+
+	// unimplementedSyscallEmitter is used to emit unimplemented syscall
+	// events. This is initialized lazily on the first unimplemented
+	// syscall.
+	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -290,7 +294,6 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
 	k.futexes = futex.NewManager()
 	k.netlinkPorts = port.New()
-
 	return nil
 }
 
@@ -384,11 +387,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
 // flushMountSourceRefs flushes the MountSources for all mounted filesystems
 // and open FDs.
 func (k *Kernel) flushMountSourceRefs() error {
-	// Flush all mount sources for currently mounted filesystems in the
-	// root mount namespace.
-	k.mounts.FlushMountSourceRefs()
-
-	// Some tasks may have other mount namespaces; flush those as well.
+	// Flush all mount sources for currently mounted filesystems in each task.
 	flushed := make(map[*fs.MountNamespace]struct{})
 	k.tasks.mu.RLock()
 	k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
@@ -497,7 +496,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 }
 
 // LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
+func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
 	k.networkStack = net
@@ -541,6 +540,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
 
 	log.Infof("Overall load took [%s]", time.Since(loadStart))
 
+	k.Timekeeper().SetClocks(clocks)
+	if net != nil {
+		net.Resume()
+	}
+
 	// Ensure that all pending asynchronous work is complete:
 	//   - namedpipe opening
 	//   - inode file opening
@@ -550,7 +554,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
 
 	tcpip.AsyncLoading.Wait()
 
-	log.Infof("Overall load took [%s]", time.Since(loadStart))
+	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))
 
 	// Applications may size per-cpu structures based on k.applicationCores, so
 	// it can't change across save/restore. When we are virtualizing CPU
@@ -565,16 +569,6 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
 	return nil
 }
 
-// Destroy releases resources owned by k.
-//
-// Preconditions: There must be no task goroutines running in k.
-func (k *Kernel) Destroy() {
-	if k.mounts != nil {
-		k.mounts.DecRef()
-		k.mounts = nil
-	}
-}
-
 // UniqueID returns a unique identifier.
 func (k *Kernel) UniqueID() uint64 {
 	id := atomic.AddUint64(&k.uniqueID, 1)
@@ -586,11 +580,17 @@ func (k *Kernel) UniqueID() uint64 {
 
 // CreateProcessArgs holds arguments to kernel.CreateProcess.
 type CreateProcessArgs struct {
-	// Filename is the filename to load.
+	// Filename is the filename to load as the init binary.
 	//
-	// If this is provided as "", then the file will be guessed via Argv[0].
+	// If this is provided as "", File will be checked, then the file will be
+	// guessed via Argv[0].
 	Filename string
 
+	// File is a passed host FD pointing to a file to load as the init binary.
+	//
+	// This is checked if and only if Filename is "".
+	File *fs.File
+
 	// Argvv is a list of arguments.
 	Argv []string
 
@@ -632,19 +632,12 @@ type CreateProcessArgs struct {
 	AbstractSocketNamespace *AbstractSocketNamespace
 
 	// MountNamespace optionally contains the mount namespace for this
-	// process. If nil, the kernel's mount namespace is used.
+	// process. If nil, the init process's mount namespace is used.
 	//
 	// Anyone setting MountNamespace must donate a reference (i.e.
 	// increment it).
 	MountNamespace *fs.MountNamespace
 
-	// Root optionally contains the dirent that serves as the root for the
-	// process. If nil, the mount namespace's root is used as the process'
-	// root.
-	//
-	// Anyone setting Root must donate a reference (i.e. increment it).
-	Root *fs.Dirent
-
 	// ContainerID is the container that the process belongs to.
 	ContainerID string
 }
@@ -682,16 +675,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 	case auth.CtxCredentials:
 		return ctx.args.Credentials
 	case fs.CtxRoot:
-		if ctx.args.Root != nil {
-			// Take a reference on the root dirent that will be
-			// given to the caller.
-			ctx.args.Root.IncRef()
-			return ctx.args.Root
-		}
-		if ctx.k.mounts != nil {
-			// MountNamespace.Root() will take a reference on the
-			// root dirent for us.
-			return ctx.k.mounts.Root()
+		if ctx.args.MountNamespace != nil {
+			// MountNamespace.Root() will take a reference on the root
+			// dirent for us.
+			return ctx.args.MountNamespace.Root()
 		}
 		return nil
 	case fs.CtxDirentCacheLimiter:
@@ -735,30 +722,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	defer k.extMu.Unlock()
 	log.Infof("EXEC: %v", args.Argv)
 
-	if k.mounts == nil {
-		return nil, 0, fmt.Errorf("no kernel MountNamespace")
-	}
-
 	// Grab the mount namespace.
 	mounts := args.MountNamespace
 	if mounts == nil {
-		// If no MountNamespace was configured, then use the kernel's
-		// root mount namespace, with an extra reference that will be
-		// donated to the task.
-		mounts = k.mounts
+		mounts = k.GlobalInit().Leader().MountNamespace()
 		mounts.IncRef()
 	}
 
 	tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
 	ctx := args.NewContext(k)
 
-	// Grab the root directory.
-	root := args.Root
-	if root == nil {
-		// If no Root was configured, then get it from the
-		// MountNamespace.
-		root = mounts.Root()
-	}
+	// Get the root directory from the MountNamespace.
+	root := mounts.Root()
 	// The call to newFSContext below will take a reference on root, so we
 	// don't need to hold this one.
 	defer root.DecRef()
@@ -768,15 +743,23 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	wd := root // Default.
 	if args.WorkingDirectory != "" {
 		var err error
-		wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+		wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
 		if err != nil {
 			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
 		}
 		defer wd.DecRef()
 	}
 
-	if args.Filename == "" {
-		// Was anything provided?
+	// Check which file to start from.
+	switch {
+	case args.Filename != "":
+		// If a filename is given, take that.
+		// Set File to nil so we resolve the path in LoadTaskImage.
+		args.File = nil
+	case args.File != nil:
+		// If File is set, take the File provided directly.
+	default:
+		// Otherwise look at Argv and see if the first argument is a valid path.
 		if len(args.Argv) == 0 {
 			return nil, 0, fmt.Errorf("no filename or command provided")
 		}
@@ -788,7 +771,8 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 
 	// Create a fresh task context.
 	remainingTraversals = uint(args.MaxSymlinkTraversals)
-	tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
+
+	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
 	if se != nil {
 		return nil, 0, errors.New(se.String())
 	}
@@ -1032,20 +1016,6 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// RootMountNamespace returns the MountNamespace.
-func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
-	k.extMu.Lock()
-	defer k.extMu.Unlock()
-	return k.mounts
-}
-
-// SetRootMountNamespace sets the MountNamespace.
-func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
-	k.extMu.Lock()
-	defer k.extMu.Unlock()
-	k.mounts = mounts
-}
-
 // NetworkStack returns the network stack. NetworkStack may return nil if no
 // network stack is available.
 func (k *Kernel) NetworkStack() inet.Stack {
@@ -1168,16 +1138,6 @@ func (k *Kernel) SupervisorContext() context.Context {
 	}
 }
 
-// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
-// channel.
-func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
-	t := TaskFromContext(ctx)
-	eventchannel.Emit(&uspb.UnimplementedSyscall{
-		Tid:       int32(t.ThreadID()),
-		Registers: t.Arch().StateData().Proto(),
-	})
-}
-
 // SocketEntry represents a socket recorded in Kernel.sockets. It implements
 // refs.WeakRefUser for sockets stored in the socket table.
 //
@@ -1246,7 +1206,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		// The supervisor context is global root.
 		return auth.NewRootCredentials(ctx.k.rootUserNamespace)
 	case fs.CtxRoot:
-		return ctx.k.mounts.Root()
+		if ctx.k.globalInit != nil {
+			return ctx.k.globalInit.mounts.Root()
+		}
+		return nil
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case ktime.CtxRealtimeClock:
@@ -1272,3 +1235,23 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		return nil
 	}
 }
+
+// Rate limits for the number of unimplemented syscall events.
+const (
+	unimplementedSyscallsMaxRate = 100  // events per second
+	unimplementedSyscallBurst    = 1000 // events
+)
+
+// EmitUnimplementedEvent emits an UnimplementedSyscall event for the task in
+// ctx through a rate-limited emitter that is created on first use.
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
+	k.unimplementedSyscallEmitterOnce.Do(func() {
+		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
+	})
+
+	t := TaskFromContext(ctx) // NOTE(review): used below without a nil check; confirm ctx always carries a Task.
+	k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
+		Tid:       int32(t.ThreadID()),
+		Registers: t.Arch().StateData().Proto(),
+	})
+}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 81fcd8258..e5f297478 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -47,6 +47,11 @@ type Session struct {
 	// The id is immutable.
 	id SessionID
 
+	// foreground is the foreground process group.
+	//
+	// This is protected by TaskSet.mu.
+	foreground *ProcessGroup
+
 	// ProcessGroups is a list of process groups in this Session. This is
 	// protected by TaskSet.mu.
 	processGroups processGroupList
@@ -260,12 +265,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
 func (tg *ThreadGroup) CreateSession() error {
 	tg.pidns.owner.mu.Lock()
 	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
 	return tg.createSession()
 }
 
 // createSession creates a new session for a threadgroup.
 //
-// Precondition: callers must hold TaskSet.mu for writing.
+// Precondition: callers must hold TaskSet.mu and the signal mutex for writing.
 func (tg *ThreadGroup) createSession() error {
 	// Get the ID for this thread in the current namespace.
 	id := tg.pidns.tgids[tg]
@@ -346,6 +353,9 @@ func (tg *ThreadGroup) createSession() error {
 		ns.processGroups[ProcessGroupID(local)] = pg
 	}
 
+	// Disconnect from the controlling terminal.
+	tg.tty = nil
+
 	return nil
 }
 
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 2a2e6f662..dd69939f9 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -15,6 +15,7 @@
 package kernel
 
 import (
+	"runtime"
 	"time"
 
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -121,6 +122,17 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
 	// Deactive our address space, we don't need it.
 	interrupt := t.SleepStart()
 
+	// If the request is not completed, but the timer has already expired,
+	// then ensure that we run through a scheduler cycle. This is because
+	// we may see applications relying on timer slack to yield the thread.
+	// For example, they may attempt to sleep for some number of nanoseconds,
+	// and expect that this will actually yield the CPU and sleep for at
+	// least microseconds, e.g.:
+	// https://github.com/LMAX-Exchange/disruptor/commit/6ca210f2bcd23f703c479804d583718e16f43c07
+	if len(timerChan) > 0 {
+		runtime.Gosched()
+	}
+
 	select {
 	case <-C:
 		t.SleepFinish(true)
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 54b1676b0..8639d379f 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -140,15 +140,22 @@ func (t *Task) Stack() *arch.Stack {
 //  * wd: Working directory to lookup filename under
 //  * maxTraversals: maximum number of symlinks to follow
 //  * filename: path to binary to load
+//  * file: an open fs.File object of the binary to load. If set,
+//  file will be loaded and not filename.
 //  * argv: Binary argv
 //  * envv: Binary envv
 //  * fs: Binary FeatureSet
-func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+	// If File is not nil, we should load that instead of resolving filename.
+	if file != nil {
+		filename = file.MappedName(ctx)
+	}
+
 	// Prepare a new user address space to load into.
 	m := mm.NewMemoryManager(k, k)
 	defer m.DecUsers(ctx)
 
-	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso)
+	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index d60cd62c7..ae6fc4025 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -172,9 +172,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		if parentPG := tg.parentPG(); parentPG == nil {
 			tg.createSession()
 		} else {
-			// Inherit the process group.
+			// Inherit the process group and terminal.
 			parentPG.incRefWithParent(parentPG)
 			tg.processGroup = parentPG
+			tg.tty = t.parent.tg.tty
 		}
 	}
 	tg.tasks.PushBack(t)
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 2a97e3e8e..0eef24bfb 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,10 +19,13 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // A ThreadGroup is a logical grouping of tasks that has widespread
@@ -245,6 +248,12 @@ type ThreadGroup struct {
 	//
 	// mounts is immutable.
 	mounts *fs.MountNamespace
+
+	// tty is the thread group's controlling terminal. If nil, there is no
+	// controlling terminal.
+	//
+	// tty is protected by the signal mutex.
+	tty *TTY
 }
 
 // newThreadGroup returns a new, empty thread group in PID namespace ns. The
@@ -324,6 +333,176 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
 	}
 }
 
+// SetControllingTTY sets tty as the controlling terminal of tg. arg == 1 lets a CAP_SYS_ADMIN caller steal tty from another session.
+func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	// Stealing the terminal updates other thread groups as well, so we
+	// lock both the TaskSet and SignalHandlers.
+	tg.pidns.owner.mu.Lock()
+	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// "The calling process must be a session leader and not have a
+	// controlling terminal already." - tty_ioctl(4)
+	if tg.processGroup.session.leader != tg || tg.tty != nil {
+		return syserror.EINVAL
+	}
+
+	// "If this terminal is already the controlling terminal of a different
+	// session group, then the ioctl fails with EPERM, unless the caller
+	// has the CAP_SYS_ADMIN capability and arg equals 1, in which case the
+	// terminal is stolen, and all processes that had it as controlling
+	// terminal lose it." - tty_ioctl(4)
+	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
+		if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+			return syserror.EPERM
+		}
+		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
+		for othertg := range tg.pidns.owner.Root.tgids {
+			// This won't deadlock by locking tg.signalHandlers
+			// because at this point:
+			// - We only lock signalHandlers if it's in the same
+			//   session as the tty's controlling thread group.
+			// - We know that the calling thread group is not in
+			//   the same session as the tty's controlling thread
+			//   group.
+			if othertg.processGroup.session == tty.tg.processGroup.session {
+				othertg.signalHandlers.mu.Lock()
+				othertg.tty = nil
+				othertg.signalHandlers.mu.Unlock()
+			}
+		}
+	}
+
+	// Set the controlling terminal and foreground process group.
+	tg.tty = tty
+	tg.processGroup.session.foreground = tg.processGroup
+	// Set this as the controlling process of the terminal.
+	tty.tg = tg
+
+	return nil
+}
+
+// ReleaseControllingTTY gives up tty as the controlling tty of tg. It fails with ENOTTY if tty is not tg's controlling tty.
+func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	// We might need to clear the controlling terminal of multiple
+	// processes, so we read-lock the TaskSet and lock SignalHandlers.
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+
+	// Just below, we may re-lock signalHandlers in order to send signals.
+	// Thus we can't defer Unlock here.
+	tg.signalHandlers.mu.Lock()
+
+	if tg.tty == nil || tg.tty != tty {
+		tg.signalHandlers.mu.Unlock()
+		return syserror.ENOTTY
+	}
+
+	// "If the process was session leader, then send SIGHUP and SIGCONT to
+	// the foreground process group and all processes in the current
+	// session lose their controlling terminal." - tty_ioctl(4)
+	// Remove tty as the controlling tty for each process in the session,
+	// and send SIGHUP and SIGCONT to those in the foreground process group.
+
+	// If we're not the session leader that controls tty, we don't have to do much.
+	if tty.tg != tg {
+		tg.tty = nil
+		tg.signalHandlers.mu.Unlock()
+		return nil
+	}
+
+	tg.signalHandlers.mu.Unlock()
+
+	// We're the session leader. SIGHUP and SIGCONT the foreground process
+	// group and remove all controlling terminals in the session.
+	var lastErr error
+	for othertg := range tg.pidns.owner.Root.tgids {
+		if othertg.processGroup.session == tg.processGroup.session {
+			othertg.signalHandlers.mu.Lock()
+			othertg.tty = nil
+			if othertg.processGroup == tg.processGroup.session.foreground {
+				if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil {
+					lastErr = err
+				}
+				if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil {
+					lastErr = err
+				}
+			}
+			othertg.signalHandlers.mu.Unlock()
+		}
+	}
+
+	return lastErr
+}
+
+// ForegroundProcessGroup returns the ID of the foreground process group of
+// tg's session, or -1 and ENOTTY if tty is not tg's controlling terminal.
+func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	tg.pidns.owner.mu.Lock()
+	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// "When fd does not refer to the controlling terminal of the calling
+	// process, -1 is returned" - tcgetpgrp(3)
+	if tg.tty != tty {
+		return -1, syserror.ENOTTY
+	}
+
+	return int32(tg.processGroup.session.foreground.id), nil
+}
+
+// SetForegroundProcessGroup makes pgid the foreground process group of tg's
+// session. tty must be tg's controlling terminal. Returns 0, or -1 and an error.
+func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+	tg.pidns.owner.mu.Lock()
+	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// TODO(b/129283598): "If tcsetpgrp() is called by a member of a
+	// background process group in its session, and the calling process is
+	// not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all
+	// members of this background process group."
+
+	// tty must be the controlling terminal.
+	if tg.tty != tty {
+		return -1, syserror.ENOTTY
+	}
+
+	// pgid must not be negative.
+	if pgid < 0 {
+		return -1, syserror.EINVAL
+	}
+
+	// pg must not be empty: empty process groups are removed from their pid namespaces.
+	pg, ok := tg.pidns.processGroups[pgid]
+	if !ok {
+		return -1, syserror.ESRCH
+	}
+
+	// pg must be part of this process's session.
+	if tg.processGroup.session != pg.session {
+		return -1, syserror.EPERM
+	}
+
+	// Repoint the session at pg; the old foreground group must keep its own id.
+	tg.processGroup.session.foreground = pg
+	return 0, nil
+}
+
 // itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
 //
 // +stateify savable
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
new file mode 100644
index 000000000..34f84487a
--- /dev/null
+++ b/pkg/sentry/kernel/tty.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync"
+
+// TTY defines the relationship between a thread group and its controlling
+// terminal.
+//
+// +stateify savable
+type TTY struct {
+	mu sync.Mutex `state:"nosave"`
+
+	// tg is the thread group that controls this terminal, or nil if there is none. tg is protected by mu.
+	tg *ThreadGroup
+}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index bc5b841fb..ba9c9ce12 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -464,7 +464,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 	// base address big enough to fit all segments, so we first create a
 	// mapping for the total size just to find a region that is big enough.
 	//
-	// It is safe to unmap it immediately with racing with another mapping
+	// It is safe to unmap it immediately without racing with another mapping
 	// because we are the only one in control of the MemoryManager.
 	//
 	// Note that the vaddr of the first PT_LOAD segment is ignored when
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index baa12d9a0..f6f1ae762 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -67,8 +67,64 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
 	if err != nil {
 		return nil, nil, err
 	}
+
+	// Open file will take a reference to Dirent, so destroy this one.
 	defer d.DecRef()
 
+	return openFile(ctx, nil, d, name)
+}
+
+// openFile validates the file to be executed and returns references to it.
+// If file is non-nil, its Dirent is checked and the dirent argument is
+// ignored. Otherwise a new *fs.File is created from dirent's Inode after
+// dirent passes the checks. name is forwarded to checkFile.
+//
+// On success the caller owns a reference on both the returned *fs.Dirent and
+// the returned *fs.File. file and dirent must not both be nil; that case
+// fails with ENOENT.
+func openFile(ctx context.Context, file *fs.File, dirent *fs.Dirent, name string) (*fs.Dirent, *fs.File, error) {
+	// file and dirent must not both be nil.
+	if dirent == nil && file == nil {
+		ctx.Infof("dirent and file cannot both be nil.")
+		return nil, nil, syserror.ENOENT
+	}
+
+	if file != nil {
+		dirent = file.Dirent
+	}
+
+	// Perform permissions checks on the file.
+	if err := checkFile(ctx, dirent, name); err != nil {
+		return nil, nil, err
+	}
+
+	if file == nil {
+		var ferr error
+		if file, ferr = dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}); ferr != nil {
+			return nil, nil, ferr
+		}
+	} else {
+		// GetFile takes a reference to the created file, so make one in the case
+		// that the file reference already existed.
+		file.IncRef()
+	}
+
+	// We must be able to read at arbitrary offsets.
+	if !file.Flags().Pread {
+		// Log before DecRef, which may destroy a file created above.
+		ctx.Infof("%s cannot be read at an offset: %+v", file.MappedName(ctx), file.Flags())
+		file.DecRef()
+		return nil, nil, syserror.EACCES
+	}
+
+	// Grab reference for caller.
+	dirent.IncRef()
+	return dirent, file, nil
+}
+
+// checkFile performs file permissions checks for binaries called in openPath
+// and openFile
+func checkFile(ctx context.Context, d *fs.Dirent, name string) error {
 	perms := fs.PermMask{
 		// TODO(gvisor.dev/issue/160): Linux requires only execute
 		// permission, not read. However, our backing filesystems may
@@ -80,7 +136,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
 		Execute: true,
 	}
 	if err := d.Inode.CheckPermission(ctx, perms); err != nil {
-		return nil, nil, err
+		return err
 	}
 
 	// If they claim it's a directory, then make sure.
@@ -88,31 +144,17 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
 	// N.B. we reject directories below, but we must first reject
 	// non-directories passed as directories.
 	if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) {
-		return nil, nil, syserror.ENOTDIR
+		return syserror.ENOTDIR
 	}
 
 	// No exec-ing directories, pipes, etc!
 	if !fs.IsRegular(d.Inode.StableAttr) {
 		ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr)
-		return nil, nil, syserror.EACCES
+		return syserror.EACCES
 	}
 
-	// Create a new file.
-	file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-	if err != nil {
-		return nil, nil, err
-	}
+	return nil
 
-	// We must be able to read at arbitrary offsets.
-	if !file.Flags().Pread {
-		file.DecRef()
-		ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags())
-		return nil, nil, syserror.EACCES
-	}
-
-	// Grab a reference for the caller.
-	d.IncRef()
-	return d, file, nil
 }
 
 // allocStack allocates and maps a stack in to any available part of the address space.
@@ -131,16 +173,30 @@ const (
 	maxLoaderAttempts = 6
 )
 
-// loadPath resolves filename to a binary and loads it.
+// loadBinary loads a binary that is pointed to by "file". If nil, the path
+// "filename" is resolved and loaded.
 //
 // It returns:
 //  * loadedELF, description of the loaded binary
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
 //  * Possibly updated argv
-func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
-		d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename)
+		var (
+			d   *fs.Dirent
+			f   *fs.File
+			err error
+		)
+		if passedFile == nil {
+			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename)
+
+		} else {
+			d, f, err = openFile(ctx, passedFile, nil, "")
+			// Set to nil in case we loop on a Interpreter Script.
+			passedFile = nil
+		}
+
 		if err != nil {
 			ctx.Infof("Error opening %s: %v", filename, err)
 			return loadedELF{}, nil, nil, nil, err
@@ -165,7 +221,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
 
 		switch {
 		case bytes.Equal(hdr[:], []byte(elfMagic)):
-			loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, fs, f)
+			loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, features, f)
 			if err != nil {
 				ctx.Infof("Error loading ELF: %v", err)
 				return loadedELF{}, nil, nil, nil, err
@@ -190,7 +246,8 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
 	return loadedELF{}, nil, nil, nil, syserror.ELOOP
 }
 
-// Load loads filename into a MemoryManager.
+// Load loads "file" into a MemoryManager. If file is nil, the path "filename"
+// is resolved and loaded instead.
 //
 // If Load returns ErrSwitchFile it should be called again with the returned
 // path and argv.
@@ -198,9 +255,9 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
 // Preconditions:
 //  * The Task MemoryManager is empty.
 //  * Load is called on the Task goroutine.
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the binary itself.
-	loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv)
+	loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux())
 	}
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index a8819aa84..8c2246bb4 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -58,6 +58,34 @@ func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
 	return true
 }
 
+// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to implement
+// /proc/[pid]/maps. It writes one entry per vma, then the vsyscall entry, to buf.
+func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	var start usermem.Addr
+
+	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
+		// "panic: autosave error: type usermem.Addr is not registered".
+		mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
+	}
+
+	// We always emulate vsyscall, so advertise it here. Everything about a
+	// vsyscall region is static, so just hard code the maps entry since we
+	// don't have a real vma backing it. The vsyscall region is at the end of
+	// the virtual address space so nothing should be mapped after it (if
+	// something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+	// get the sorting on the maps file wrong at worst; but that's not possible
+	// on any current platform).
+	//
+	// NOTE(review): start is never advanced here, so this check is presumably always true — confirm.
+	if start != vsyscallEnd {
+		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
+		buf.WriteString(vsyscallMapsEntry)
+	}
+}
+
 // ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to
 // implement /proc/[pid]/maps.
 func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
@@ -151,6 +179,27 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
 	b.WriteString("\n")
 }
 
+// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	var start usermem.Addr
+
+	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
+		// "panic: autosave error: type usermem.Addr is not registered".
+		mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
+	}
+
+	// We always emulate vsyscall, so advertise it here. See
+	// ReadMapsSeqFileData for additional commentary.
+	if start != vsyscallEnd {
+		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
+		buf.WriteString(vsyscallSmapsEntry)
+	}
+}
+
 // ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to
 // implement /proc/[pid]/smaps.
 func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
@@ -190,7 +239,12 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
 // Preconditions: mm.mappingMu must be locked.
 func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
 	var b bytes.Buffer
-	mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
+	mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b)
+	return b.Bytes()
+}
+
+func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+	mm.appendVMAMapsEntryLocked(ctx, vseg, b)
 	vma := vseg.ValuePtr()
 
 	// We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of
@@ -211,40 +265,40 @@ func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterat
 	}
 	mm.activeMu.RUnlock()
 
-	fmt.Fprintf(&b, "Size:           %8d kB\n", vseg.Range().Length()/1024)
-	fmt.Fprintf(&b, "Rss:            %8d kB\n", rss/1024)
+	fmt.Fprintf(b, "Size:           %8d kB\n", vseg.Range().Length()/1024)
+	fmt.Fprintf(b, "Rss:            %8d kB\n", rss/1024)
 	// Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma
 	// is only mapped by that pma. This avoids having to query memmap.Mappables
 	// for reference count information on each page. As a corollary, all pages
 	// are accounted as "private" whether or not the vma is private; compare
 	// Linux's fs/proc/task_mmu.c:smaps_account().
-	fmt.Fprintf(&b, "Pss:            %8d kB\n", rss/1024)
-	fmt.Fprintf(&b, "Shared_Clean:   %8d kB\n", 0)
-	fmt.Fprintf(&b, "Shared_Dirty:   %8d kB\n", 0)
+	fmt.Fprintf(b, "Pss:            %8d kB\n", rss/1024)
+	fmt.Fprintf(b, "Shared_Clean:   %8d kB\n", 0)
+	fmt.Fprintf(b, "Shared_Dirty:   %8d kB\n", 0)
 	// Pretend that all pages are dirty if the vma is writable, and clean otherwise.
 	clean := rss
 	if vma.effectivePerms.Write {
 		clean = 0
 	}
-	fmt.Fprintf(&b, "Private_Clean:  %8d kB\n", clean/1024)
-	fmt.Fprintf(&b, "Private_Dirty:  %8d kB\n", (rss-clean)/1024)
+	fmt.Fprintf(b, "Private_Clean:  %8d kB\n", clean/1024)
+	fmt.Fprintf(b, "Private_Dirty:  %8d kB\n", (rss-clean)/1024)
 	// Pretend that all pages are "referenced" (recently touched).
-	fmt.Fprintf(&b, "Referenced:     %8d kB\n", rss/1024)
-	fmt.Fprintf(&b, "Anonymous:      %8d kB\n", anon/1024)
+	fmt.Fprintf(b, "Referenced:     %8d kB\n", rss/1024)
+	fmt.Fprintf(b, "Anonymous:      %8d kB\n", anon/1024)
 	// Hugepages (hugetlb and THP) are not implemented.
-	fmt.Fprintf(&b, "AnonHugePages:  %8d kB\n", 0)
-	fmt.Fprintf(&b, "Shared_Hugetlb: %8d kB\n", 0)
-	fmt.Fprintf(&b, "Private_Hugetlb: %7d kB\n", 0)
+	fmt.Fprintf(b, "AnonHugePages:  %8d kB\n", 0)
+	fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0)
+	fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0)
 	// Swap is not implemented.
-	fmt.Fprintf(&b, "Swap:           %8d kB\n", 0)
-	fmt.Fprintf(&b, "SwapPss:        %8d kB\n", 0)
-	fmt.Fprintf(&b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024)
-	fmt.Fprintf(&b, "MMUPageSize:    %8d kB\n", usermem.PageSize/1024)
+	fmt.Fprintf(b, "Swap:           %8d kB\n", 0)
+	fmt.Fprintf(b, "SwapPss:        %8d kB\n", 0)
+	fmt.Fprintf(b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024)
+	fmt.Fprintf(b, "MMUPageSize:    %8d kB\n", usermem.PageSize/1024)
 	locked := rss
 	if vma.mlockMode == memmap.MLockNone {
 		locked = 0
 	}
-	fmt.Fprintf(&b, "Locked:         %8d kB\n", locked/1024)
+	fmt.Fprintf(b, "Locked:         %8d kB\n", locked/1024)
 
 	b.WriteString("VmFlags: ")
 	if vma.realPerms.Read {
@@ -284,6 +338,4 @@ func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterat
 		b.WriteString("ac ")
 	}
 	b.WriteString("\n")
-
-	return b.Bytes()
 }
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 8bd3e885d..f7f7298c4 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -285,7 +285,10 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
 	switch opts.DelayedEviction {
 	case DelayedEvictionDefault:
 		opts.DelayedEviction = DelayedEvictionEnabled
-	case DelayedEvictionDisabled, DelayedEvictionEnabled, DelayedEvictionManual:
+	case DelayedEvictionDisabled, DelayedEvictionManual:
+		opts.UseHostMemcgPressure = false
+	case DelayedEvictionEnabled:
+		// ok
 	default:
 		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
 	}
@@ -777,6 +780,14 @@ func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
 	}
 }
 
+// ShouldCacheEvictable reports whether f delays eviction of evictable memory
+// in a meaningful way, in which case caching data in evictable memory may be
+// worthwhile. Callers should not assume the result is stable: it may differ
+// from one call to the next.
+func (f *MemoryFile) ShouldCacheEvictable() bool {
+	return f.opts.UseHostMemcgPressure || f.opts.DelayedEviction == DelayedEvictionManual
+}
+
 // UpdateUsage ensures that the memory usage statistics in
 // usage.MemoryAccounting are up to date.
 func (f *MemoryFile) UpdateUsage() error {
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 1b6c54e96..ebcc8c098 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -7,13 +7,17 @@ go_library(
     srcs = [
         "filters.go",
         "ptrace.go",
+        "ptrace_amd64.go",
+        "ptrace_arm64.go",
         "ptrace_unsafe.go",
         "stub_amd64.s",
+        "stub_arm64.s",
         "stub_unsafe.go",
         "subprocess.go",
         "subprocess_amd64.go",
+        "subprocess_arm64.go",
         "subprocess_linux.go",
-        "subprocess_linux_amd64_unsafe.go",
+        "subprocess_linux_unsafe.go",
         "subprocess_unsafe.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace",
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 6fd30ed25..7b120a15d 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -60,7 +60,7 @@ var (
 	// maximum user address. This is valid only after a call to stubInit.
 	//
 	// We attempt to link the stub here, and adjust downward as needed.
-	stubStart uintptr = 0x7fffffff0000
+	stubStart uintptr = stubInitAddress
 
 	// stubEnd is the first byte past the end of the stub, as with
 	// stubStart this is valid only after a call to stubInit.
diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go
new file mode 100644
index 000000000..db0212538
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// fpRegSet picks the register set type passed to GETREGSET/SETREGSET.
+func fpRegSet(useXsave bool) uintptr {
+	if !useXsave {
+		return linux.NT_PRFPREG
+	}
+	return linux.NT_X86_XSTATE
+}
+
+func stackPointer(r *syscall.PtraceRegs) uintptr {
+	return uintptr(r.Rsp)
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go
new file mode 100644
index 000000000..4db28c534
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// fpRegSet returns the GETREGSET/SETREGSET register set type to be used; the argument is ignored and NT_PRFPREG is always returned.
+func fpRegSet(_ bool) uintptr {
+	return linux.NT_PRFPREG
+}
+
+func stackPointer(r *syscall.PtraceRegs) uintptr {
+	return uintptr(r.Sp)
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 2706039a5..47957bb3b 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -18,37 +18,23 @@ import (
 	"syscall"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
-// GETREGSET/SETREGSET register set types.
-//
-// See include/uapi/linux/elf.h.
-const (
-	// _NT_PRFPREG is for x86 floating-point state without using xsave.
-	_NT_PRFPREG = 0x2
-
-	// _NT_X86_XSTATE is for x86 extended state using xsave.
-	_NT_X86_XSTATE = 0x202
-)
-
-// fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
-func fpRegSet(useXsave bool) uintptr {
-	if useXsave {
-		return _NT_X86_XSTATE
-	}
-	return _NT_PRFPREG
-}
-
-// getRegs sets the regular register set.
+// getRegs gets the general purpose register set.
 func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
+	iovec := syscall.Iovec{
+		Base: (*byte)(unsafe.Pointer(regs)),
+		Len:  uint64(unsafe.Sizeof(*regs)),
+	}
 	_, _, errno := syscall.RawSyscall6(
 		syscall.SYS_PTRACE,
-		syscall.PTRACE_GETREGS,
+		syscall.PTRACE_GETREGSET,
 		uintptr(t.tid),
-		0,
-		uintptr(unsafe.Pointer(regs)),
+		linux.NT_PRSTATUS,
+		uintptr(unsafe.Pointer(&iovec)),
 		0, 0)
 	if errno != 0 {
 		return errno
@@ -56,14 +42,18 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
 	return nil
 }
 
-// setRegs sets the regular register set.
+// setRegs sets the general purpose register set.
 func (t *thread) setRegs(regs *syscall.PtraceRegs) error {
+	iovec := syscall.Iovec{
+		Base: (*byte)(unsafe.Pointer(regs)),
+		Len:  uint64(unsafe.Sizeof(*regs)),
+	}
 	_, _, errno := syscall.RawSyscall6(
 		syscall.SYS_PTRACE,
-		syscall.PTRACE_SETREGS,
+		syscall.PTRACE_SETREGSET,
 		uintptr(t.tid),
-		0,
-		uintptr(unsafe.Pointer(regs)),
+		linux.NT_PRSTATUS,
+		uintptr(unsafe.Pointer(&iovec)),
 		0, 0)
 	if errno != 0 {
 		return errno
@@ -131,7 +121,7 @@ func (t *thread) getSignalInfo(si *arch.SignalInfo) error {
 //
 // Precondition: the OS thread must be locked and own t.
 func (t *thread) clone() (*thread, error) {
-	r, ok := usermem.Addr(t.initRegs.Rsp).RoundUp()
+	r, ok := usermem.Addr(stackPointer(&t.initRegs)).RoundUp()
 	if !ok {
 		return nil, syscall.EINVAL
 	}
diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s
new file mode 100644
index 000000000..2c5e4d5cb
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/stub_arm64.s
@@ -0,0 +1,106 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+#define SYS_GETPID		172
+#define SYS_EXIT		93
+#define SYS_KILL		129
+#define SYS_GETPPID		173
+#define SYS_PRCTL		167
+
+#define SIGKILL			9
+#define SIGSTOP			19
+
+#define PR_SET_PDEATHSIG	1
+
+// stub bootstraps the child and sends itself SIGSTOP to wait for attach.
+//
+// R7 contains the expected PPID.
+//
+// This should not be used outside the context of a new ptrace child (as the
+// function is otherwise a bunch of nonsense).
+TEXT ·stub(SB),NOSPLIT,$0
+begin:
+	// N.B. This loop only executes in the context of a single-threaded
+	// fork child.
+
+	MOVD $SYS_PRCTL, R8
+	MOVD $PR_SET_PDEATHSIG, R0
+	MOVD $SIGKILL, R1
+	SVC
+
+	CMN $4095, R0
+	BCS error
+
+	// If the parent already died before we called PR_SET_PDEATHSIG then
+	// we'll have an unexpected PPID.
+	MOVD $SYS_GETPPID, R8
+	SVC
+
+	CMP R0, R7
+	BNE parent_dead
+
+	MOVD $SYS_GETPID, R8
+	SVC
+
+	CMP $0x0, R0
+	BLT error
+
+	// SIGSTOP to wait for attach.
+	//
+	// The SVC instruction will be used for future syscall injection by
+	// thread.syscall.
+	MOVD $SYS_KILL, R8
+	MOVD $SIGSTOP, R1
+	SVC
+	// The tracer may "detach" and/or allow code execution here in three cases:
+	//
+	// 1. New (traced) stub threads are explicitly detached by the
+	// goroutine in newSubprocess. However, they are detached while in
+	// group-stop, so they do not execute code here.
+	//
+	// 2. If a tracer thread exits, it implicitly detaches from the stub,
+	// potentially allowing code execution here. However, the Go runtime
+	// never exits individual threads, so this case never occurs.
+	//
+	// 3. subprocess.createStub clones a new stub process that is untraced,
+	// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
+	// ourselves for attach by the tracer.
+	//
+	// R7 has been updated with the expected PPID.
+	B begin
+
+error:
+	// Exit with -errno.
+	NEG R0, R0
+	MOVD $SYS_EXIT, R8
+	SVC
+	HLT
+
+parent_dead:
+	MOVD $SYS_EXIT, R8
+	MOVD $1, R0
+	SVC
+	HLT
+
+// stubCall calls the stub function at the given address with the given PPID.
+//
+// This is a distinct function because stub, above, may be mapped at any
+// arbitrary location, and stub has a specific binary API (see above).
+TEXT ·stubCall(SB),NOSPLIT,$0-16
+	MOVD addr+0(FP), R0
+	MOVD pid+8(FP), R7
+	B (R0)
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 15e84735e..6bf7cd097 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -28,6 +28,16 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
+// Linux kernel errnos which "should never be seen by user programs", but will
+// be revealed to ptrace syscall exit tracing.
+//
+// These constants are only used in subprocess.go.
+const (
+	ERESTARTSYS    = syscall.Errno(512)
+	ERESTARTNOINTR = syscall.Errno(513)
+	ERESTARTNOHAND = syscall.Errno(514)
+)
+
 // globalPool exists to solve two distinct problems:
 //
 // 1) Subprocesses can't always be killed properly (see Release).
@@ -282,7 +292,7 @@ func (t *thread) grabInitRegs() {
 	if err := t.getRegs(&t.initRegs); err != nil {
 		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
 	}
-	t.initRegs.Rip -= initRegsRipAdjustment
+	t.adjustInitRegsRip()
 }
 
 // detach detaches from the thread.
@@ -344,6 +354,9 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
 				continue // Spurious stop.
 			}
 			if stopSig == syscall.SIGTRAP {
+				if status.TrapCause() == syscall.PTRACE_EVENT_EXIT {
+					t.dumpAndPanic("wait failed: the process exited")
+				}
 				// Re-encode the trap cause the way it's expected.
 				return stopSig | syscall.Signal(status.TrapCause()<<8)
 			}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index a70512913..4649a94a7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -28,20 +28,13 @@ const (
 	// maximumUserAddress is the largest possible user address.
 	maximumUserAddress = 0x7ffffffff000
 
+	// stubInitAddress is the initial attempt link address for the stub.
+	stubInitAddress = 0x7fffffff0000
+
 	// initRegsRipAdjustment is the size of the syscall instruction.
 	initRegsRipAdjustment = 2
 )
 
-// Linux kernel errnos which "should never be seen by user programs", but will
-// be revealed to ptrace syscall exit tracing.
-//
-// These constants are used in subprocess.go.
-const (
-	ERESTARTSYS    = syscall.Errno(512)
-	ERESTARTNOINTR = syscall.Errno(513)
-	ERESTARTNOHAND = syscall.Errno(514)
-)
-
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
@@ -139,3 +132,14 @@ func dumpRegs(regs *syscall.PtraceRegs) string {
 
 	return m.String()
 }
+
+// adjustInitRegsRip moves the saved initial RIP back by the size of the
+// syscall instruction, so that it points just before that instruction.
+func (t *thread) adjustInitRegsRip() {
+	t.initRegs.Rip -= initRegsRipAdjustment
+}
+
+// initChildProcessPPID passes the expected PPID to the stub child via R15.
+func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+	initregs.R15 = uint64(ppid)
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
new file mode 100644
index 000000000..bec884ba5
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -0,0 +1,128 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ptrace
+
+import (
+	"fmt"
+	"strings"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+const (
+	// maximumUserAddress is the largest possible user address.
+	maximumUserAddress = 0xfffffffff000
+
+	// stubInitAddress is the initial attempt link address for the stub.
+	// Only support 48bits VA currently.
+	stubInitAddress = 0xffffffff0000
+
+	// initRegsRipAdjustment is the size of the svc instruction.
+	initRegsRipAdjustment = 4
+)
+
+// resetSysemuRegs sets up emulation registers; on arm64 it does nothing.
+//
+// This should be called prior to calling sysemu.
+func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
+}
+
+// createSyscallRegs returns a copy of initRegs with the syscall number placed
+// in Regs[8] and up to six arguments placed in Regs[0] through Regs[5]. Any
+// arguments beyond the sixth are ignored.
+func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+	// Copy initial registers (Pc, Sp, etc.).
+	regs := *initRegs
+
+	// Set our syscall number.
+	// r8 holds the syscall number.
+	// r0-r5 hold the (at most six) parameters.
+	regs.Regs[8] = uint64(sysno)
+	if len(args) >= 1 {
+		regs.Regs[0] = args[0].Uint64()
+	}
+	if len(args) >= 2 {
+		regs.Regs[1] = args[1].Uint64()
+	}
+	if len(args) >= 3 {
+		regs.Regs[2] = args[2].Uint64()
+	}
+	if len(args) >= 4 {
+		regs.Regs[3] = args[3].Uint64()
+	}
+	if len(args) >= 5 {
+		regs.Regs[4] = args[4].Uint64()
+	}
+	if len(args) >= 6 {
+		regs.Regs[5] = args[5].Uint64()
+	}
+
+	return regs
+}
+
+// isSingleStepping determines if the registers indicate single-stepping.
+func isSingleStepping(regs *syscall.PtraceRegs) bool {
+	// Refer to the ARM SDM D2.12.3: software step state machine
+	// return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1).
+	//
+	// Since the host Linux kernel will set MDSCR_EL1.SS on our behalf
+	// when we call a single-step ptrace command, we only need to check
+	// the Pstate.SS bit here.
+	return (regs.Pstate & arch.ARMTrapFlag) != 0
+}
+
+// updateSyscallRegs updates registers after finishing sysemu (no-op on arm64).
+func updateSyscallRegs(regs *syscall.PtraceRegs) {
+	// No special work is necessary; regs is left untouched.
+	return
+}
+
+// syscallReturnValue decodes Regs[0]: a negative value becomes a syscall.Errno.
+func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+	rval := int64(regs.Regs[0])
+	if rval < 0 {
+		return 0, syscall.Errno(-rval)
+	}
+	return uintptr(rval), nil
+}
+
+func dumpRegs(regs *syscall.PtraceRegs) string {
+	var m strings.Builder
+
+	fmt.Fprintf(&m, "Registers:\n")
+
+	for i := 0; i < 31; i++ {
+		fmt.Fprintf(&m, "\tRegs[%d]\t = %016x\n", i, regs.Regs[i])
+	}
+	fmt.Fprintf(&m, "\tSp\t = %016x\n", regs.Sp)
+	fmt.Fprintf(&m, "\tPc\t = %016x\n", regs.Pc)
+	fmt.Fprintf(&m, "\tPstate\t = %016x\n", regs.Pstate)
+
+	return m.String()
+}
+
+// adjustInitRegsRip moves the saved initial PC back by the size of the svc
+// instruction, so that it points just before the system call instruction.
+func (t *thread) adjustInitRegsRip() {
+	t.initRegs.Pc -= initRegsRipAdjustment
+}
+
+// initChildProcessPPID passes the expected PPID to the stub child via X7.
+func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+	initregs.Regs[7] = uint64(ppid)
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 87ded0bbd..f09b0b3d0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -284,7 +284,7 @@ func (s *subprocess) createStub() (*thread, error) {
 
 	// Pass the expected PPID to the child via R15.
 	regs := t.initRegs
-	regs.R15 = uint64(t.tgid)
+	initChildProcessPPID(&regs, t.tgid)
 
 	// Call fork in a subprocess.
 	//
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index e977992f9..de6783fb0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 linux
+// +build linux
+// +build amd64 arm64
 
 package ptrace
 
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go
index 5c3d73eb7..f039a5c34 100644
--- a/pkg/sentry/safemem/io.go
+++ b/pkg/sentry/safemem/io.go
@@ -157,7 +157,8 @@ func (w ToIOWriter) Write(src []byte) (int, error) {
 }
 
 // FromIOReader implements Reader for an io.Reader by repeatedly invoking
-// io.Reader.Read until it returns an error or partial read.
+// io.Reader.Read until it returns an error or partial read. This is not
+// thread-safe.
 //
 // FromIOReader will return a successful partial read iff Reader.Read does so.
 type FromIOReader struct {
@@ -206,6 +207,58 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) {
 	return wbn, buf, rerr
 }
 
+// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly
+// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial
+// read indicates an error. This is not thread-safe.
+type FromIOReaderAt struct {
+	ReaderAt io.ReaderAt
+	Offset   int64
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	var buf []byte
+	var done uint64
+	for !dsts.IsEmpty() {
+		dst := dsts.Head()
+		var n int
+		var err error
+		n, buf, err = r.readToBlock(dst, buf)
+		r.Offset += int64(n) // Advance this call's copy so the next Block reads onward.
+		done += uint64(n)
+		if n != dst.Len() {
+			return done, err
+		}
+		dsts = dsts.Tail()
+		if err != nil {
+			if dsts.IsEmpty() && err == io.EOF {
+				return done, nil
+			}
+			return done, err
+		}
+	}
+	return done, nil
+}
+
+// readToBlock reads into dst at r.Offset; the caller advances the offset.
+func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) {
+	// io.ReaderAt isn't safecopy-aware, so we have to buffer Blocks that require
+	// safecopy.
+	if !dst.NeedSafecopy() {
+		n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset)
+		return n, buf, err
+	}
+	if len(buf) < dst.Len() {
+		buf = make([]byte, dst.Len())
+	}
+	rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset)
+	wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
+	if wberr != nil {
+		return wbn, buf, wberr
+	}
+	return wbn, buf, rerr
+}
+
 // FromIOWriter implements Writer for an io.Writer by repeatedly invoking
 // io.Writer.Write until it returns an error or partial write.
 //
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 2b03ea87c..3300f9a6b 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -9,6 +9,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD
index 1f014f399..e927821e1 100644
--- a/pkg/sentry/socket/epsocket/BUILD
+++ b/pkg/sentry/socket/epsocket/BUILD
@@ -31,6 +31,7 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
+        "//pkg/sentry/socket/netfilter",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/usermem",
         "//pkg/syserr",
@@ -38,6 +39,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index e57aed927..635042263 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -43,6 +43,7 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
@@ -290,18 +291,22 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
 	return tcpip.Address(addr)
 }
 
-// GetAddress reads an sockaddr struct from the given address and converts it
-// to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6
-// addresses.
-func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syserr.Error) {
+// AddressAndFamily reads an sockaddr struct from the given address and
+// converts it to the FullAddress format. It supports AF_UNIX, AF_INET and
+// AF_INET6 addresses.
+//
+// strict indicates whether addresses with the AF_UNSPEC family are accepted or not.
+//
+// AddressAndFamily returns an address and its family.
+func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
 	// Make sure we have at least 2 bytes for the address family.
 	if len(addr) < 2 {
-		return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
 	}
 
 	family := usermem.ByteOrder.Uint16(addr)
 	if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
-		return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported
+		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
 	}
 
 	// Get the rest of the fields based on the address family.
@@ -309,7 +314,7 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
 	case linux.AF_UNIX:
 		path := addr[2:]
 		if len(path) > linux.UnixPathMax {
-			return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
 		}
 		// Drop the terminating NUL (if one exists) and everything after
 		// it for filesystem (non-abstract) addresses.
@@ -320,12 +325,12 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
 		}
 		return tcpip.FullAddress{
 			Addr: tcpip.Address(path),
-		}, nil
+		}, family, nil
 
 	case linux.AF_INET:
 		var a linux.SockAddrInet
 		if len(addr) < sockAddrInetSize {
-			return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
 		}
 		binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a)
 
@@ -333,12 +338,12 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
 			Addr: bytesToIPAddress(a.Addr[:]),
 			Port: ntohs(a.Port),
 		}
-		return out, nil
+		return out, family, nil
 
 	case linux.AF_INET6:
 		var a linux.SockAddrInet6
 		if len(addr) < sockAddrInet6Size {
-			return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
 		}
 		binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a)
 
@@ -349,13 +354,13 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
 		if isLinkLocal(out.Addr) {
 			out.NIC = tcpip.NICID(a.Scope_id)
 		}
-		return out, nil
+		return out, family, nil
 
 	case linux.AF_UNSPEC:
-		return tcpip.FullAddress{}, nil
+		return tcpip.FullAddress{}, family, nil
 
 	default:
-		return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported
+		return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported
 	}
 }
 
@@ -428,6 +433,11 @@ func (i *ioSequencePayload) Size() int {
 	return int(i.src.NumBytes())
 }
 
+// DropFirst discards the first n bytes of the payload's underlying src.
+func (i *ioSequencePayload) DropFirst(n int) {
+	i.src = i.src.DropFirst(n)
+}
+
 // Write implements fs.FileOperations.Write.
 func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
 	f := &ioSequencePayload{ctx: ctx, src: src}
@@ -476,11 +486,18 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 // Connect implements the linux syscall connect(2) for sockets backed by
 // tpcip.Endpoint.
 func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
-	addr, err := GetAddress(s.family, sockaddr, false /* strict */)
+	addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
 	if err != nil {
 		return err
 	}
 
+	if family == linux.AF_UNSPEC {
+		err := s.Endpoint.Disconnect()
+		if err == tcpip.ErrNotSupported {
+			return syserr.ErrAddressFamilyNotSupported
+		}
+		return syserr.TranslateNetstackError(err)
+	}
 	// Always return right away in the non-blocking case.
 	if !blocking {
 		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
@@ -509,7 +526,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, err := GetAddress(s.family, sockaddr, true /* strict */)
+	addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
 	if err != nil {
 		return err
 	}
@@ -547,7 +564,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait
 
 // Accept implements the linux syscall accept(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	// Issue the accept request to get the new endpoint.
 	ep, wq, terr := s.Endpoint.Accept()
 	if terr != nil {
@@ -574,7 +591,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 		ns.SetFlags(flags.Settable())
 	}
 
-	var addr interface{}
+	var addr linux.SockAddr
 	var addrLen uint32
 	if peerRequested {
 		// Get address of the peer and write it to peer slice.
@@ -624,7 +641,7 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
 
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
 	// implemented specifically for epsocket.SocketOperations rather than
 	// commonEndpoint. commonEndpoint should be extended to support socket
@@ -655,6 +672,33 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (
 		return val, nil
 	}
 
+	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+		switch name {
+		case linux.IPT_SO_GET_INFO:
+			if outLen < linux.SizeOfIPTGetinfo {
+				return nil, syserr.ErrInvalidArgument
+			}
+
+			info, err := netfilter.GetInfo(t, s.Endpoint, outPtr)
+			if err != nil {
+				return nil, err
+			}
+			return info, nil
+
+		case linux.IPT_SO_GET_ENTRIES:
+			if outLen < linux.SizeOfIPTGetEntries {
+				return nil, syserr.ErrInvalidArgument
+			}
+
+			entries, err := netfilter.GetEntries(t, s.Endpoint, outPtr, outLen)
+			if err != nil {
+				return nil, err
+			}
+			return entries, nil
+
+		}
+	}
+
 	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
 }
 
@@ -1028,7 +1072,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac
 
 		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
 
-		return a.(linux.SockAddrInet).Addr, nil
+		return a.(*linux.SockAddrInet).Addr, nil
 
 	case linux.IP_MULTICAST_LOOP:
 		if outLen < sizeOfInt32 {
@@ -1658,7 +1702,7 @@ func isLinkLocal(addr tcpip.Address) bool {
 }
 
 // ConvertAddress converts the given address to a native format.
-func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
+func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) {
 	switch family {
 	case linux.AF_UNIX:
 		var out linux.SockAddrUnix
@@ -1674,15 +1718,15 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
 		// address length is the max. Abstract and empty paths always return
 		// the full exact length.
 		if l == 0 || out.Path[0] == 0 || l == len(out.Path) {
-			return out, uint32(2 + l)
+			return &out, uint32(2 + l)
 		}
-		return out, uint32(3 + l)
+		return &out, uint32(3 + l)
 	case linux.AF_INET:
 		var out linux.SockAddrInet
 		copy(out.Addr[:], addr.Addr)
 		out.Family = linux.AF_INET
 		out.Port = htons(addr.Port)
-		return out, uint32(binary.Size(out))
+		return &out, uint32(binary.Size(out))
 	case linux.AF_INET6:
 		var out linux.SockAddrInet6
 		if len(addr.Addr) == 4 {
@@ -1698,7 +1742,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
 		if isLinkLocal(addr.Addr) {
 			out.Scope_id = uint32(addr.NIC)
 		}
-		return out, uint32(binary.Size(out))
+		return &out, uint32(binary.Size(out))
 	default:
 		return nil, 0
 	}
@@ -1706,7 +1750,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
 
 // GetSockName implements the linux syscall getsockname(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.Endpoint.GetLocalAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -1718,7 +1762,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *sy
 
 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.Endpoint.GetRemoteAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -1791,7 +1835,7 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
-func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	isPacket := s.isPacketBased()
 
 	// Fast path for regular reads from stream (e.g., TCP) endpoints. Note
@@ -1839,7 +1883,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 	if err == nil {
 		s.updateTimestamp()
 	}
-	var addr interface{}
+	var addr linux.SockAddr
 	var addrLen uint32
 	if isPacket && senderRequested {
 		addr, addrLen = ConvertAddress(s.family, s.sender)
@@ -1914,7 +1958,7 @@ func (s *SocketOperations) updateTimestamp() {
 
 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
 	trunc := flags&linux.MSG_TRUNC != 0
 	peek := flags&linux.MSG_PEEK != 0
 	dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -1990,7 +2034,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 
 	var addr *tcpip.FullAddress
 	if len(to) > 0 {
-		addrBuf, err := GetAddress(s.family, to, true /* strict */)
+		addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
 		if err != nil {
 			return 0, err
 		}
@@ -1998,28 +2042,22 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		addr = &addrBuf
 	}
 
-	v := buffer.NewView(int(src.NumBytes()))
-
-	// Copy all the data into the buffer.
-	if _, err := src.CopyIn(t, v); err != nil {
-		return 0, syserr.FromError(err)
-	}
-
 	opts := tcpip.WriteOptions{
 		To:          addr,
 		More:        flags&linux.MSG_MORE != 0,
 		EndOfRecord: flags&linux.MSG_EOR != 0,
 	}
 
-	n, resCh, err := s.Endpoint.Write(tcpip.SlicePayload(v), opts)
+	v := &ioSequencePayload{t, src}
+	n, resCh, err := s.Endpoint.Write(v, opts)
 	if resCh != nil {
 		if err := t.Block(resCh); err != nil {
 			return 0, syserr.FromError(err)
 		}
-		n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts)
+		n, _, err = s.Endpoint.Write(v, opts)
 	}
 	dontWait := flags&linux.MSG_DONTWAIT != 0
-	if err == nil && (n >= uintptr(len(v)) || dontWait) {
+	if err == nil && (n >= int64(v.Size()) || dontWait) {
 		// Complete write.
 		return int(n), nil
 	}
@@ -2033,18 +2071,18 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 	s.EventRegister(&e, waiter.EventOut)
 	defer s.EventUnregister(&e)
 
-	v.TrimFront(int(n))
+	v.DropFirst(int(n))
 	total := n
 	for {
-		n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts)
-		v.TrimFront(int(n))
+		n, _, err = s.Endpoint.Write(v, opts)
+		v.DropFirst(int(n))
 		total += n
 
 		if err != nil && err != tcpip.ErrWouldBlock && total == 0 {
 			return 0, syserr.TranslateNetstackError(err)
 		}
 
-		if err == nil && len(v) == 0 || err != nil && err != tcpip.ErrWouldBlock {
+		if err == nil && v.Size() == 0 || err != nil && err != tcpip.ErrWouldBlock {
 			return int(total), nil
 		}
 
@@ -2252,19 +2290,19 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
 
 	case syscall.SIOCGIFMAP:
 		// Gets the hardware parameters of the device.
-		// TODO(b/71872867): Implement.
+		// TODO(gvisor.dev/issue/505): Implement.
 
 	case syscall.SIOCGIFTXQLEN:
 		// Gets the transmit queue length of the device.
-		// TODO(b/71872867): Implement.
+		// TODO(gvisor.dev/issue/505): Implement.
 
 	case syscall.SIOCGIFDSTADDR:
 		// Gets the destination address of a point-to-point device.
-		// TODO(b/71872867): Implement.
+		// TODO(gvisor.dev/issue/505): Implement.
 
 	case syscall.SIOCGIFBRDADDR:
 		// Gets the broadcast address of a device.
-		// TODO(b/71872867): Implement.
+		// TODO(gvisor.dev/issue/505): Implement.
 
 	case syscall.SIOCGIFNETMASK:
 		// Gets the network mask of a device.
diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go
index 8fe489c0e..7cf7ff735 100644
--- a/pkg/sentry/socket/epsocket/stack.go
+++ b/pkg/sentry/socket/epsocket/stack.go
@@ -18,7 +18,10 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -143,3 +146,57 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	return syserr.ErrEndpointOperation.ToError()
 }
+
+// RouteTable implements inet.Stack.RouteTable.
+func (s *Stack) RouteTable() []inet.Route {
+	var routeTable []inet.Route
+
+	for _, rt := range s.Stack.GetRouteTable() {
+		var family uint8
+		switch len(rt.Destination.ID()) { // Pick the family from the address length.
+		case header.IPv4AddressSize:
+			family = linux.AF_INET
+		case header.IPv6AddressSize:
+			family = linux.AF_INET6
+		default:
+			log.Warningf("Unknown network protocol in route %+v", rt)
+			continue // Routes of any other address length are left out.
+		}
+
+		routeTable = append(routeTable, inet.Route{
+			Family: family,
+			DstLen: uint8(rt.Destination.Prefix()), // The CIDR prefix for the destination.
+
+			// Always return unspecified protocol since we have no notion of
+			// protocol for routes.
+			Protocol: linux.RTPROT_UNSPEC,
+			// Set statically to LINK scope for now.
+			//
+			// TODO(gvisor.dev/issue/595): Set scope for routes.
+			Scope: linux.RT_SCOPE_LINK,
+			Type:  linux.RTN_UNICAST,
+
+			DstAddr:         []byte(rt.Destination.ID()),
+			OutputInterface: int32(rt.NIC),
+			GatewayAddr:     []byte(rt.Gateway),
+		})
+	}
+
+	return routeTable
+}
+
+// IPTables returns the stack's iptables; the returned error is always nil.
+func (s *Stack) IPTables() (iptables.IPTables, error) {
+	return s.Stack.IPTables(), nil
+}
+
+// FillDefaultIPTables sets the stack's iptables to the default tables, which
+// allow and do not modify all traffic.
+func (s *Stack) FillDefaultIPTables() {
+	netfilter.FillDefaultIPTables(s.Stack)
+}
+
+// Resume implements inet.Stack.Resume.
+func (s *Stack) Resume() {
+	s.Stack.Resume()
+}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 7f69406b7..92beb1bcf 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -189,15 +189,16 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 }
 
 // Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
-	var peerAddr []byte
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+	var peerAddr linux.SockAddr
+	var peerAddrBuf []byte
 	var peerAddrlen uint32
 	var peerAddrPtr *byte
 	var peerAddrlenPtr *uint32
 	if peerRequested {
-		peerAddr = make([]byte, sizeofSockaddr)
-		peerAddrlen = uint32(len(peerAddr))
-		peerAddrPtr = &peerAddr[0]
+		peerAddrBuf = make([]byte, sizeofSockaddr)
+		peerAddrlen = uint32(len(peerAddrBuf))
+		peerAddrPtr = &peerAddrBuf[0]
 		peerAddrlenPtr = &peerAddrlen
 	}
 
@@ -222,7 +223,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 	}
 
 	if peerRequested {
-		peerAddr = peerAddr[:peerAddrlen]
+		peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen])
 	}
 	if syscallErr != nil {
 		return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
@@ -272,7 +273,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	if outLen < 0 {
 		return nil, syserr.ErrInvalidArgument
 	}
@@ -353,7 +354,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 }
 
 // RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	// Whitelist flags.
 	//
 	// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
@@ -363,9 +364,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
 	}
 
-	var senderAddr []byte
+	var senderAddr linux.SockAddr
+	var senderAddrBuf []byte
 	if senderRequested {
-		senderAddr = make([]byte, sizeofSockaddr)
+		senderAddrBuf = make([]byte, sizeofSockaddr)
 	}
 
 	var msgFlags int
@@ -384,7 +386,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 
 		if dsts.NumBlocks() == 1 {
 			// Skip allocating []syscall.Iovec.
-			return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddr)
+			return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddrBuf)
 		}
 
 		iovs := iovecsFromBlockSeq(dsts)
@@ -392,15 +394,15 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
 		}
-		if len(senderAddr) != 0 {
-			msg.Name = &senderAddr[0]
-			msg.Namelen = uint32(len(senderAddr))
+		if len(senderAddrBuf) != 0 {
+			msg.Name = &senderAddrBuf[0]
+			msg.Namelen = uint32(len(senderAddrBuf))
 		}
 		n, err := recvmsg(s.fd, &msg, sysflags)
 		if err != nil {
 			return 0, err
 		}
-		senderAddr = senderAddr[:msg.Namelen]
+		senderAddrBuf = senderAddrBuf[:msg.Namelen]
 		msgFlags = int(msg.Flags)
 		return n, nil
 	})
@@ -431,7 +433,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	// We don't allow control messages.
 	msgFlags &^= linux.MSG_CTRUNC
 
-	return int(n), msgFlags, senderAddr, uint32(len(senderAddr)), socket.ControlMessages{}, syserr.FromError(err)
+	if senderRequested {
+		senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf)
+	}
+	return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), socket.ControlMessages{}, syserr.FromError(err)
 }
 
 // SendMsg implements socket.Socket.SendMsg.
diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go
index 6c69ba9c7..e69ec38c2 100644
--- a/pkg/sentry/socket/hostinet/socket_unsafe.go
+++ b/pkg/sentry/socket/hostinet/socket_unsafe.go
@@ -18,10 +18,12 @@ import (
 	"syscall"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -91,25 +93,25 @@ func getsockopt(fd int, level, name int, optlen int) ([]byte, error) {
 }
 
 // GetSockName implements socket.Socket.GetSockName.
-func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr := make([]byte, sizeofSockaddr)
 	addrlen := uint32(len(addr))
 	_, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen)))
 	if errno != 0 {
 		return nil, 0, syserr.FromError(errno)
 	}
-	return addr[:addrlen], addrlen, nil
+	return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil
 }
 
 // GetPeerName implements socket.Socket.GetPeerName.
-func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr := make([]byte, sizeofSockaddr)
 	addrlen := uint32(len(addr))
 	_, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen)))
 	if errno != 0 {
 		return nil, 0, syserr.FromError(errno)
 	}
-	return addr[:addrlen], addrlen, nil
+	return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil
 }
 
 func recvfrom(fd int, dst []byte, flags int, from *[]byte) (uint64, error) {
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index cc1f66fa1..3a4fdec47 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -46,6 +46,7 @@ type Stack struct {
 	// Stack is immutable.
 	interfaces     map[int32]inet.Interface
 	interfaceAddrs map[int32][]inet.InterfaceAddr
+	routes         []inet.Route
 	supportsIPv6   bool
 	tcpRecvBufSize inet.TCPBufferSize
 	tcpSendBufSize inet.TCPBufferSize
@@ -66,6 +67,10 @@ func (s *Stack) Configure() error {
 		return err
 	}
 
+	if err := addHostRoutes(s); err != nil {
+		return err
+	}
+
 	if _, err := os.Stat("/proc/net/if_inet6"); err == nil {
 		s.supportsIPv6 = true
 	}
@@ -161,6 +166,60 @@ func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.Netli
 	return nil
 }
 
+// ExtractHostRoutes converts the RTM_NEWROUTE messages in routeMsgs into
+// inet.Routes, skipping other message types and rejecting malformed ones.
+func ExtractHostRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error) {
+	var routes []inet.Route
+	for _, routeMsg := range routeMsgs {
+		if routeMsg.Header.Type != syscall.RTM_NEWROUTE {
+			continue
+		}
+		if len(routeMsg.Data) < syscall.SizeofRtMsg {
+			return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message that is too short (%d bytes, expected at least %d bytes)", len(routeMsg.Data), syscall.SizeofRtMsg)
+		}
+
+		var ifRoute syscall.RtMsg
+		binary.Unmarshal(routeMsg.Data[:syscall.SizeofRtMsg], usermem.ByteOrder, &ifRoute)
+		inetRoute := inet.Route{
+			Family:   ifRoute.Family,
+			DstLen:   ifRoute.Dst_len,
+			SrcLen:   ifRoute.Src_len,
+			TOS:      ifRoute.Tos,
+			Table:    ifRoute.Table,
+			Protocol: ifRoute.Protocol,
+			Scope:    ifRoute.Scope,
+			Type:     ifRoute.Type,
+			Flags:    ifRoute.Flags,
+		}
+
+		// Not clearly documented: syscall.ParseNetlinkRouteAttr checks the
+		// message's Header.Type and skips the struct rtmsg accordingly.
+		attrs, err := syscall.ParseNetlinkRouteAttr(&routeMsg)
+		if err != nil {
+			return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid rtattrs: %v", err)
+		}
+		for _, attr := range attrs {
+			switch attr.Attr.Type {
+			case syscall.RTA_DST:
+				inetRoute.DstAddr = attr.Value
+			case syscall.RTA_SRC:
+				inetRoute.SrcAddr = attr.Value
+			case syscall.RTA_GATEWAY:
+				inetRoute.GatewayAddr = attr.Value
+			case syscall.RTA_OIF:
+				if expected := int(binary.Size(inetRoute.OutputInterface)); len(attr.Value) != expected {
+					return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid attribute data length (%d bytes, expected %d bytes)", len(attr.Value), expected)
+				}
+				binary.Unmarshal(attr.Value, usermem.ByteOrder, &inetRoute.OutputInterface)
+			}
+		}
+
+		routes = append(routes, inetRoute)
+	}
+
+	return routes, nil
+}
+
 func addHostInterfaces(s *Stack) error {
 	links, err := doNetlinkRouteRequest(syscall.RTM_GETLINK)
 	if err != nil {
@@ -175,6 +234,20 @@ func addHostInterfaces(s *Stack) error {
 	return ExtractHostInterfaces(links, addrs, s.interfaces, s.interfaceAddrs)
 }
 
+// addHostRoutes fetches the host route table with an RTM_GETROUTE netlink
+// request and stores the parsed result in s.routes.
+func addHostRoutes(s *Stack) error {
+	msgs, err := doNetlinkRouteRequest(syscall.RTM_GETROUTE)
+	if err != nil {
+		return fmt.Errorf("RTM_GETROUTE failed: %v", err)
+	}
+
+	// ExtractHostRoutes' error is nil on success, so it can be returned
+	// directly; s.routes is assigned in either case.
+	s.routes, err = ExtractHostRoutes(msgs)
+	return err
+}
+
 func doNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) {
 	data, err := syscall.NetlinkRIB(req, syscall.AF_UNSPEC)
 	if err != nil {
@@ -202,12 +275,20 @@ func readTCPBufferSizeFile(filename string) (inet.TCPBufferSize, error) {
 
 // Interfaces implements inet.Stack.Interfaces.
 func (s *Stack) Interfaces() map[int32]inet.Interface {
-	return s.interfaces
+	interfaces := make(map[int32]inet.Interface, len(s.interfaces)) // Pre-sized copy; s.interfaces itself is never handed out.
+	for k, v := range s.interfaces {
+		interfaces[k] = v
+	}
+	return interfaces
 }
 
 // InterfaceAddrs implements inet.Stack.InterfaceAddrs.
 func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
-	return s.interfaceAddrs
+	addrs := make(map[int32][]inet.InterfaceAddr, len(s.interfaceAddrs)) // Pre-sized copy of the map.
+	for k, v := range s.interfaceAddrs {
+		addrs[k] = append([]inet.InterfaceAddr(nil), v...) // Each address slice is copied as well.
+	}
+	return addrs
 }
 
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
@@ -249,3 +330,11 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	return syserror.EOPNOTSUPP
 }
+
+// RouteTable implements inet.Stack.RouteTable by returning a copy of s.routes.
+func (s *Stack) RouteTable() []inet.Route {
+	return append([]inet.Route(nil), s.routes...)
+}
+
+// Resume implements inet.Stack.Resume. It does nothing for this stack.
+func (s *Stack) Resume() {}
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
new file mode 100644
index 000000000..354a0d6ee
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -0,0 +1,24 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+go_library(
+    name = "netfilter",
+    srcs = [
+        "netfilter.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netfilter",
+    # This target depends on netstack and should only be used by epsocket,
+    # which is allowed to depend on netstack.
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/usermem",
+        "//pkg/syserr",
+        "//pkg/tcpip",
+        "//pkg/tcpip/iptables",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
new file mode 100644
index 000000000..9f87c32f1
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -0,0 +1,286 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package netfilter helps the sentry interact with netstack's netfilter
+// capabilities.
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// errorTargetName is the name given to error targets. Error targets
+// shouldn't be reached - an error has occurred if we fall through to one.
+const errorTargetName = "ERROR"
+
+// metadata is opaque to netstack. It holds data that we need to translate
+// between Linux's and netstack's iptables representations.
+type metadata struct {
+	HookEntry  [linux.NF_INET_NUMHOOKS]uint32 // Byte offset of each hook's first entry.
+	Underflow  [linux.NF_INET_NUMHOOKS]uint32 // Byte offset of each hook's last entry.
+	NumEntries uint32                         // Count of entries, including the final error entry.
+	Size       uint32                         // Total size in bytes of all entries.
+}
+
+// GetInfo reads the IPTGetinfo request at outPtr, finds the table it names
+// on ep, and returns the request filled in with that table's valid hooks and
+// the Linux-format bookkeeping stored in the table's metadata.
+func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
+	// Copy in the request; it carries the name of the table of interest.
+	var info linux.IPTGetinfo
+	if _, copyErr := t.CopyIn(outPtr, &info); copyErr != nil {
+		return linux.IPTGetinfo{}, syserr.FromError(copyErr)
+	}
+
+	table, err := findTable(ep, info.TableName())
+	if err != nil {
+		return linux.IPTGetinfo{}, err
+	}
+
+	// Report which hooks this table applies to.
+	info.ValidHooks = table.ValidHooks()
+
+	// The metadata holds values (e.g. the number of entries) that describe
+	// the user's encoding of iptables rather than netstack's. It is bound to
+	// meta so that the local does not shadow the metadata type.
+	meta := table.Metadata().(metadata)
+
+	info.HookEntry = meta.HookEntry
+	info.Underflow = meta.Underflow
+	info.NumEntries = meta.NumEntries
+	info.Size = meta.Size
+
+	return info, nil
+}
+
+// GetEntries reads the IPTGetEntries request at outPtr and returns the named
+// table's rules encoded for the iptables tool. It fails with
+// ErrInvalidArgument if the encoded entries are larger than outLen.
+func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
+	var none linux.KernelIPTGetEntries // Zero value for every error path.
+
+	var userEntries linux.IPTGetEntries
+	if _, copyErr := t.CopyIn(outPtr, &userEntries); copyErr != nil {
+		return none, syserr.FromError(copyErr)
+	}
+
+	table, err := findTable(ep, userEntries.TableName())
+	if err != nil {
+		return none, err
+	}
+
+	// Translate netstack's rules into the binary form the tool understands.
+	entries, _, err := convertNetstackToBinary(userEntries.TableName(), table)
+	if err != nil {
+		return none, err
+	}
+	if uintptr(outLen) < binary.Size(entries) {
+		return none, syserr.ErrInvalidArgument
+	}
+	return entries, nil
+}
+
+// findTable looks up tableName among the iptables tables reported by ep.
+func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) {
+	ipt, err := ep.IPTables()
+	if err != nil {
+		return iptables.Table{}, syserr.FromError(err)
+	}
+	if table, found := ipt.Tables[tableName]; found {
+		return table, nil
+	}
+	return iptables.Table{}, syserr.ErrInvalidArgument
+}
+
+// FillDefaultIPTables installs netstack's default iptables on stack after
+// attaching to each table the metadata needed for Linux's encoding.
+func FillDefaultIPTables(stack *stack.Stack) {
+	defaults := iptables.DefaultTables()
+
+	// The metadata can only be computed by translating each table from its
+	// netstack format to Linux's giant-binary-blob format.
+	for tableName, tbl := range defaults.Tables {
+		_, meta, convErr := convertNetstackToBinary(tableName, tbl)
+		if convErr != nil {
+			panic(fmt.Errorf("Unable to set default IP tables: %v", convErr))
+		}
+		tbl.SetMetadata(meta)
+		defaults.Tables[tableName] = tbl // tbl is a copy of the map value; store it back.
+	}
+
+	stack.SetIPTables(defaults)
+}
+
+// convertNetstackToBinary converts table, as stored in netstack, to the entry
+// list expected by the iptables tool, and returns it together with the hook
+// entry/underflow offsets, entry count and total size that describe it. Linux
+// walks such a blob by parsing a bit, reading offsets, jumping to them, etc.
+func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+	// Return values, filled in as the entries are serialized.
+	var entries linux.KernelIPTGetEntries
+	var meta metadata
+
+	// The table name has to fit in entries.Name.
+	if linux.XT_TABLE_MAXNAMELEN < len(name) {
+		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+	}
+	copy(entries.Name[:], name)
+
+	// Deal with the built in chains first (INPUT, OUTPUT, etc.). Each of
+	// these chains ends with an unconditional policy entry.
+	for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ {
+		chain, ok := table.BuiltinChains[hook]
+		if !ok {
+			// This table doesn't support this hook.
+			continue
+		}
+
+		// Sanity check: a builtin chain must hold at least its policy rule.
+		if len(chain.Rules) < 1 {
+			return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+		}
+
+		for ruleIdx, rule := range chain.Rules {
+			// If this is the first rule of a builtin chain, record the
+			// offset at which its entry starts as the hook entry point.
+			if ruleIdx == 0 {
+				meta.HookEntry[hook] = entries.Size
+			}
+
+			// Each rule becomes one entry; both offsets start just past its header.
+			entry := linux.KernelIPTEntry{
+				IPTEntry: linux.IPTEntry{
+					NextOffset:   linux.SizeOfIPTEntry,
+					TargetOffset: linux.SizeOfIPTEntry,
+				},
+			}
+
+			for _, matcher := range rule.Matchers {
+				// Serialize the matcher and add it to the entry. Matchers
+				// are appended before the target, so both offsets grow.
+				serialized := marshalMatcher(matcher)
+				entry.Elems = append(entry.Elems, serialized...)
+				entry.NextOffset += uint16(len(serialized))
+				entry.TargetOffset += uint16(len(serialized))
+			}
+
+			// Serialize and append the target; only NextOffset moves past it.
+			serialized := marshalTarget(rule.Target)
+			entry.Elems = append(entry.Elems, serialized...)
+			entry.NextOffset += uint16(len(serialized))
+
+			// The underflow rule is the last rule in the chain,
+			// and is an unconditional rule (i.e. it matches any
+			// packet). This is enforced when saving iptables.
+			if ruleIdx == len(chain.Rules)-1 {
+				meta.Underflow[hook] = entries.Size
+			}
+
+			entries.Size += uint32(entry.NextOffset)
+			entries.Entrytable = append(entries.Entrytable, entry)
+			meta.NumEntries++
+		}
+
+	}
+
+	// TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of
+	// these starts with an error node holding the chain's name and ends
+	// with an unconditional return.
+
+	// Lastly, each table ends with an unconditional error target rule as
+	// its final entry.
+	errorEntry := linux.KernelIPTEntry{
+		IPTEntry: linux.IPTEntry{
+			NextOffset:   linux.SizeOfIPTEntry,
+			TargetOffset: linux.SizeOfIPTEntry,
+		},
+	}
+	var errorTarget linux.XTErrorTarget
+	errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget
+	copy(errorTarget.ErrorName[:], errorTargetName)
+	copy(errorTarget.Target.Name[:], errorTargetName)
+
+	// Serialize the error target and add it to the error entry.
+	errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget)
+	serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget)
+	errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...)
+	errorEntry.NextOffset += uint16(len(serializedErrorTarget))
+
+	entries.Size += uint32(errorEntry.NextOffset)
+	entries.Entrytable = append(entries.Entrytable, errorEntry)
+	meta.NumEntries++
+	meta.Size = entries.Size
+
+	return entries, meta, nil
+}
+
+// marshalMatcher is meant to serialize matcher for the iptables tool, but no
+// matcher type is handled yet: every call panics.
+func marshalMatcher(matcher iptables.Matcher) []byte {
+	// TODO(gvisor.dev/issue/170): We don't support any matchers yet, so
+	// any call to marshalMatcher will panic. A type switch with one case per
+	// matcher belongs here once there are matchers to serialize.
+	panic(fmt.Errorf("unknown matcher of type %T", matcher))
+}
+
+// marshalTarget serializes target for the iptables tool. Only
+// iptables.UnconditionalAcceptTarget is handled; anything else panics.
+func marshalTarget(target iptables.Target) []byte {
+	if _, isAccept := target.(iptables.UnconditionalAcceptTarget); isAccept {
+		return marshalUnconditionalAcceptTarget()
+	}
+	panic(fmt.Errorf("unknown target of type %T", target))
+}
+
+// marshalUnconditionalAcceptTarget serializes a standard target whose verdict
+// is Accept.
+func marshalUnconditionalAcceptTarget() []byte {
+	// Target.Name stays zeroed: the target's name will be the empty string.
+	var accept linux.XTStandardTarget
+	accept.Target.TargetSize = linux.SizeOfXTStandardTarget
+	accept.Verdict = translateStandardVerdict(iptables.Accept)
+
+	// The buffer starts empty with capacity for one standard target.
+	buf := make([]byte, 0, linux.SizeOfXTStandardTarget)
+	return binary.Marshal(buf, usermem.ByteOrder, accept)
+}
+
+// translateStandardVerdict maps a netstack verdict to the int32 value the
+// iptables tool uses for it. Jump and unrecognized verdicts panic.
+func translateStandardVerdict(verdict iptables.Verdict) int32 {
+	switch verdict {
+	case iptables.Jump:
+		// TODO(gvisor.dev/issue/170): Support Jump.
+		panic("Jump isn't supported yet")
+	case iptables.Return:
+		return linux.NF_RETURN
+	case iptables.Queue:
+		return -linux.NF_QUEUE - 1
+	case iptables.Drop:
+		return -linux.NF_DROP - 1
+	case iptables.Accept:
+		return -linux.NF_ACCEPT - 1
+	default:
+		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+	}
+}
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index fb1ff329c..cc70ac237 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -110,7 +110,7 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 		m.PutAttr(linux.IFLA_ADDRESS, mac)
 		m.PutAttr(linux.IFLA_BROADCAST, brd)
 
-		// TODO(b/68878065): There are many more attributes.
+		// TODO(gvisor.dev/issue/578): There are many more attributes.
 	}
 
 	return nil
@@ -151,13 +151,69 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 
 			m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
 
-			// TODO(b/68878065): There are many more attributes.
+			// TODO(gvisor.dev/issue/578): There are many more attributes.
 		}
 	}
 
 	return nil
 }
 
+// dumpRoutes handles RTM_GETROUTE + NLM_F_DUMP requests by adding one
+// RTM_NEWROUTE message to ms for every route in the stack's route table.
+func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// RTM_GETROUTE dump requests need not contain anything more than the
+	// netlink header and 1 byte protocol family common to all
+	// NETLINK_ROUTE requests.
+
+	// We always send back an NLMSG_DONE.
+	ms.Multi = true
+
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network routes.
+		return nil
+	}
+
+	for _, rt := range stack.RouteTable() {
+		m := ms.AddMessage(linux.NetlinkMessageHeader{
+			Type: linux.RTM_NEWROUTE,
+		})
+
+		m.Put(linux.RouteMessage{
+			Family: rt.Family,
+			DstLen: rt.DstLen,
+			SrcLen: rt.SrcLen,
+			TOS:    rt.TOS,
+
+			// Always return the main table since we don't have multiple
+			// routing tables.
+			Table:    linux.RT_TABLE_MAIN,
+			Protocol: rt.Protocol,
+			Scope:    rt.Scope,
+			Type:     rt.Type,
+
+			Flags: rt.Flags,
+		})
+
+		if rt.DstLen > 0 {
+			m.PutAttr(linux.RTA_DST, rt.DstAddr)
+		}
+		if rt.SrcLen > 0 {
+			m.PutAttr(linux.RTA_SRC, rt.SrcAddr)
+		}
+		if rt.OutputInterface != 0 {
+			m.PutAttr(linux.RTA_OIF, rt.OutputInterface)
+		}
+		if len(rt.GatewayAddr) > 0 {
+			m.PutAttr(linux.RTA_GATEWAY, rt.GatewayAddr)
+		}
+
+		// TODO(gvisor.dev/issue/578): There are many more attributes.
+	}
+
+	return nil
+}
+
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
 func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
 	// All messages start with a 1 byte protocol family.
@@ -186,6 +242,8 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		return p.dumpLinks(ctx, hdr, data, ms)
 	case linux.RTM_GETADDR:
 		return p.dumpAddrs(ctx, hdr, data, ms)
+	case linux.RTM_GETROUTE:
+		return p.dumpRoutes(ctx, hdr, data, ms)
 	default:
 		return syserr.ErrNotSupported
 	}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index f3d6c1e9b..d0aab293d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -271,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr
 }
 
 // Accept implements socket.Socket.Accept.
-func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	// Netlink sockets never support accept.
 	return 0, nil, 0, syserr.ErrNotSupported
 }
@@ -289,7 +289,7 @@ func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error {
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		switch name {
@@ -379,11 +379,11 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 }
 
 // GetSockName implements socket.Socket.GetSockName.
-func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	sa := linux.SockAddrNetlink{
+	sa := &linux.SockAddrNetlink{
 		Family: linux.AF_NETLINK,
 		PortID: uint32(s.portID),
 	}
@@ -391,8 +391,8 @@ func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error
 }
 
 // GetPeerName implements socket.Socket.GetPeerName.
-func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
-	sa := linux.SockAddrNetlink{
+func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+	sa := &linux.SockAddrNetlink{
 		Family: linux.AF_NETLINK,
 		// TODO(b/68878065): Support non-kernel peers. For now the peer
 		// must be the kernel.
@@ -402,8 +402,8 @@ func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error
 }
 
 // RecvMsg implements socket.Socket.RecvMsg.
-func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
-	from := linux.SockAddrNetlink{
+func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+	from := &linux.SockAddrNetlink{
 		Family: linux.AF_NETLINK,
 		PortID: 0,
 	}
@@ -511,6 +511,19 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 	return nil
 }
 
+// dumpErrorMesage adds an NLMSG_ERROR message to ms that carries the negated
+// Linux error number of err together with hdr, the header of the request
+// being answered. It always returns nil.
+func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+	reply := ms.AddMessage(linux.NetlinkMessageHeader{Type: linux.NLMSG_ERROR})
+
+	// The message is added to ms before err is translated, so the order of
+	// the two calls matches the layout of the response.
+	errno := int32(-err.ToLinux().Number())
+	reply.Put(linux.NetlinkErrorMessage{Error: errno, Header: hdr})
+	return nil
+}
+
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
@@ -545,14 +558,20 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 			continue
 		}
 
+		ms := NewMessageSet(s.portID, hdr.Seq)
+		var err *syserr.Error
 		// TODO(b/68877377): ACKs not supported yet.
 		if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
-			return syserr.ErrNotSupported
-		}
+			err = syserr.ErrNotSupported
+		} else {
 
-		ms := NewMessageSet(s.portID, hdr.Seq)
-		if err := s.protocol.ProcessMessage(ctx, hdr, data, ms); err != nil {
-			return err
+			err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
+		}
+		if err != nil {
+			ms = NewMessageSet(s.portID, hdr.Seq)
+			if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil {
+				return err
+			}
 		}
 
 		if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
index a536f2e44..a3585e10d 100644
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ b/pkg/sentry/socket/rpcinet/notifier/BUILD
@@ -6,10 +6,11 @@ go_library(
     name = "notifier",
     srcs = ["notifier.go"],
     importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier",
-    visibility = ["//pkg/sentry:internal"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
         "//pkg/sentry/socket/rpcinet/conn",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go
index aa157dd51..7efe4301f 100644
--- a/pkg/sentry/socket/rpcinet/notifier/notifier.go
+++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go
@@ -20,6 +20,7 @@ import (
 	"sync"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
 	pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -76,7 +77,7 @@ func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error {
 	}
 
 	e := pb.EpollEvent{
-		Events: mask.ToLinux() | -syscall.EPOLLET,
+		Events: mask.ToLinux() | unix.EPOLLET,
 		Fd:     fd,
 	}
 
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
index ccaaddbfc..ddb76d9d4 100644
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ b/pkg/sentry/socket/rpcinet/socket.go
@@ -285,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP
 }
 
 // Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	payload, se := rpcAccept(t, s.fd, peerRequested)
 
 	// Check if we need to block.
@@ -328,6 +328,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 		NonBlocking: flags&linux.SOCK_NONBLOCK != 0,
 	}
 	file := fs.NewFile(t, dirent, fileFlags, &socketOperations{
+		family:   s.family,
+		stype:    s.stype,
+		protocol: s.protocol,
 		wq:       &wq,
 		fd:       payload.Fd,
 		rpcConn:  s.rpcConn,
@@ -344,7 +347,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 	t.Kernel().RecordSocket(file)
 
 	if peerRequested {
-		return fd, payload.Address.Address, payload.Address.Length, nil
+		return fd, socket.UnmarshalSockAddr(s.family, payload.Address.Address), payload.Address.Length, nil
 	}
 
 	return fd, nil, 0, nil
@@ -395,7 +398,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	// SO_RCVTIMEO and SO_SNDTIMEO are special because blocking is performed
 	// within the sentry.
 	if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO {
@@ -469,7 +472,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 }
 
 // GetPeerName implements socket.Socket.GetPeerName.
-func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	stack := t.NetworkContext().(*Stack)
 	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetPeerName{&pb.GetPeerNameRequest{Fd: s.fd}}}, false /* ignoreResult */)
 	<-c
@@ -480,11 +483,11 @@ func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *sy
 	}
 
 	addr := res.(*pb.GetPeerNameResponse_Address).Address
-	return addr.Address, addr.Length, nil
+	return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil
 }
 
 // GetSockName implements socket.Socket.GetSockName.
-func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	stack := t.NetworkContext().(*Stack)
 	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockName{&pb.GetSockNameRequest{Fd: s.fd}}}, false /* ignoreResult */)
 	<-c
@@ -495,7 +498,7 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *sy
 	}
 
 	addr := res.(*pb.GetSockNameResponse_Address).Address
-	return addr.Address, addr.Length, nil
+	return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil
 }
 
 func rpcIoctl(t *kernel.Task, fd, cmd uint32, arg []byte) ([]byte, error) {
@@ -682,7 +685,7 @@ func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_Re
 }
 
 // RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{
 		Fd:         s.fd,
 		Length:     uint32(dst.NumBytes()),
@@ -703,7 +706,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			}
 		}
 		c := s.extractControlMessages(res)
-		return int(res.Length), 0, res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e)
+		return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e)
 	}
 	if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 {
 		return 0, 0, nil, 0, socket.ControlMessages{}, err
@@ -727,7 +730,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 				}
 			}
 			c := s.extractControlMessages(res)
-			return int(res.Length), 0, res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e)
+			return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e)
 		}
 		if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain {
 			return 0, 0, nil, 0, socket.ControlMessages{}, err
diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go
index 49bd3a220..5dcb6b455 100644
--- a/pkg/sentry/socket/rpcinet/stack.go
+++ b/pkg/sentry/socket/rpcinet/stack.go
@@ -30,6 +30,7 @@ import (
 type Stack struct {
 	interfaces     map[int32]inet.Interface
 	interfaceAddrs map[int32][]inet.InterfaceAddr
+	routes         []inet.Route // populated in NewStack from an RTM_GETROUTE request
 	rpcConn        *conn.RPCConnection
 	notifier       *notifier.Notifier
 }
@@ -69,6 +70,16 @@ func NewStack(fd int32) (*Stack, error) {
 		return nil, e
 	}
 
+	routes, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETROUTE)
+	if err != nil {
+		return nil, fmt.Errorf("RTM_GETROUTE failed: %v", err)
+	}
+
+	stack.routes, e = hostinet.ExtractHostRoutes(routes)
+	if e != nil {
+		return nil, e
+	}
+
 	return stack, nil
 }
 
@@ -89,12 +100,20 @@ func (s *Stack) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) {
 
 // Interfaces implements inet.Stack.Interfaces.
 func (s *Stack) Interfaces() map[int32]inet.Interface {
-	return s.interfaces
+	snapshot := make(map[int32]inet.Interface, len(s.interfaces))
+	for id, iface := range s.interfaces {
+		snapshot[id] = iface
+	}
+	return snapshot
 }
 
 // InterfaceAddrs implements inet.Stack.InterfaceAddrs.
 func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
-	return s.interfaceAddrs
+	snapshot := make(map[int32][]inet.InterfaceAddr, len(s.interfaceAddrs))
+	for id, list := range s.interfaceAddrs {
+		snapshot[id] = append([]inet.InterfaceAddr(nil), list...)
+	}
+	return snapshot
 }
 
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
@@ -138,3 +157,11 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	return syserr.ErrEndpointOperation.ToError()
 }
+
+// RouteTable implements inet.Stack.RouteTable. It returns a copy of s.routes.
+func (s *Stack) RouteTable() []inet.Route {
+	return append([]inet.Route(nil), s.routes...)
+}
+
+// Resume implements inet.Stack.Resume. It is a no-op for this stack.
+func (s *Stack) Resume() {}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 0efa58a58..8c250c325 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -20,8 +20,10 @@ package socket
 import (
 	"fmt"
 	"sync/atomic"
+	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -52,7 +54,7 @@ type Socket interface {
 	// Accept implements the accept4(2) linux syscall.
 	// Returns fd, real peer address length and error. Real peer address
 	// length is only set if len(peer) > 0.
-	Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error)
+	Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error)
 
 	// Bind implements the bind(2) linux syscall.
 	Bind(t *kernel.Task, sockaddr []byte) *syserr.Error
@@ -64,7 +66,7 @@ type Socket interface {
 	Shutdown(t *kernel.Task, how int) *syserr.Error
 
 	// GetSockOpt implements the getsockopt(2) linux syscall.
-	GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error)
+	GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error)
 
 	// SetSockOpt implements the setsockopt(2) linux syscall.
 	SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error
@@ -73,13 +75,13 @@ type Socket interface {
 	//
 	// addrLen is the address length to be returned to the application, not
 	// necessarily the actual length of the address.
-	GetSockName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error)
+	GetSockName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error)
 
 	// GetPeerName implements the getpeername(2) linux syscall.
 	//
 	// addrLen is the address length to be returned to the application, not
 	// necessarily the actual length of the address.
-	GetPeerName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error)
+	GetPeerName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error)
 
 	// RecvMsg implements the recvmsg(2) linux syscall.
 	//
@@ -92,7 +94,7 @@ type Socket interface {
 	// msgFlags. In that case, the caller should set MSG_CTRUNC appropriately.
 	//
 	// If err != nil, the recv was not successful.
-	RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error)
+	RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error)
 
 	// SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take
 	// ownership of the ControlMessage on error.
@@ -340,3 +342,31 @@ func emitUnimplementedEvent(t *kernel.Task, name int) {
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
 }
+
+// UnmarshalSockAddr unmarshals memory representing a struct sockaddr to one of
+// the ABI socket address types. It panics if family is not supported.
+//
+// Precondition: data must be long enough to represent a socket address of the
+// given family.
+func UnmarshalSockAddr(family int, data []byte) linux.SockAddr {
+	switch family {
+	case syscall.AF_INET:
+		var addr linux.SockAddrInet
+		binary.Unmarshal(data[:syscall.SizeofSockaddrInet4], usermem.ByteOrder, &addr)
+		return &addr
+	case syscall.AF_INET6:
+		var addr linux.SockAddrInet6
+		binary.Unmarshal(data[:syscall.SizeofSockaddrInet6], usermem.ByteOrder, &addr)
+		return &addr
+	case syscall.AF_UNIX:
+		var addr linux.SockAddrUnix
+		binary.Unmarshal(data[:syscall.SizeofSockaddrUnix], usermem.ByteOrder, &addr)
+		return &addr
+	case syscall.AF_NETLINK:
+		var addr linux.SockAddrNetlink
+		binary.Unmarshal(data[:syscall.SizeofSockaddrNetlink], usermem.ByteOrder, &addr)
+		return &addr
+	default:
+		panic(fmt.Sprintf("Unsupported socket family %v", family))
+	}
+}
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 760c7beab..2ec1a662d 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -62,7 +62,7 @@ type EndpointReader struct {
 	Creds bool
 
 	// NumRights is the number of SCM_RIGHTS FDs requested.
-	NumRights uintptr
+	NumRights int
 
 	// Peek indicates that the data should not be consumed from the
 	// endpoint.
@@ -70,7 +70,7 @@ type EndpointReader struct {
 
 	// MsgSize is the size of the message that was read from. For stream
 	// sockets, it is the amount read.
-	MsgSize uintptr
+	MsgSize int64
 
 	// From, if not nil, will be set with the address read from.
 	From *tcpip.FullAddress
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 73d2df15d..4bd15808a 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -436,7 +436,7 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syser
 
 // SendMsg writes data and a control message to the endpoint's peer.
 // This method does not block if the data cannot be written.
-func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
+func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
 	// Stream sockets do not support specifying the endpoint. Seqpacket
 	// sockets ignore the passed endpoint.
 	if e.stype == linux.SOCK_STREAM && to != nil {
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index c7f7c5b16..0322dec0b 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -99,7 +99,7 @@ func (e *connectionlessEndpoint) UnidirectionalConnect(ctx context.Context) (Con
 
 // SendMsg writes data and a control message to the specified endpoint.
 // This method does not block if the data cannot be written.
-func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
+func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
 	if to == nil {
 		return e.baseEndpoint.SendMsg(ctx, data, c, nil)
 	}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 7fb9cb1e0..2b0ad6395 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -121,13 +121,13 @@ type Endpoint interface {
 	// CMTruncated indicates that the numRights hint was used to receive fewer
 	// than the total available SCM_RIGHTS FDs. Additional truncation may be
 	// required by the caller.
-	RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, CMTruncated bool, err *syserr.Error)
+	RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, err *syserr.Error)
 
 	// SendMsg writes data and a control message to the endpoint's peer.
 	// This method does not block if the data cannot be written.
 	//
 	// SendMsg does not take ownership of any of its arguments on error.
-	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (uintptr, *syserr.Error)
+	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, *syserr.Error)
 
 	// Connect connects this endpoint directly to another.
 	//
@@ -291,7 +291,7 @@ type Receiver interface {
 	// See Endpoint.RecvMsg for documentation on shared arguments.
 	//
 	// notify indicates if RecvNotify should be called.
-	Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
+	Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
 
 	// RecvNotify notifies the Receiver of a successful Recv. This must not be
 	// called while holding any endpoint locks.
@@ -331,7 +331,7 @@ type queueReceiver struct {
 }
 
 // Recv implements Receiver.Recv.
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
 	var m *message
 	var notify bool
 	var err *syserr.Error
@@ -344,13 +344,13 @@ func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek
 		return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err
 	}
 	src := []byte(m.Data)
-	var copied uintptr
+	var copied int64
 	for i := 0; i < len(data) && len(src) > 0; i++ {
 		n := copy(data[i], src)
-		copied += uintptr(n)
+		copied += int64(n)
 		src = src[n:]
 	}
-	return copied, uintptr(len(m.Data)), m.Control, false, m.Address, notify, nil
+	return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil
 }
 
 // RecvNotify implements Receiver.RecvNotify.
@@ -401,11 +401,11 @@ type streamQueueReceiver struct {
 	addr    tcpip.FullAddress
 }
 
-func vecCopy(data [][]byte, buf []byte) (uintptr, [][]byte, []byte) {
-	var copied uintptr
+func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) {
+	var copied int64
 	for len(data) > 0 && len(buf) > 0 {
 		n := copy(data[0], buf)
-		copied += uintptr(n)
+		copied += int64(n)
 		buf = buf[n:]
 		data[0] = data[0][n:]
 		if len(data[0]) == 0 {
@@ -443,7 +443,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
 }
 
 // Recv implements Receiver.Recv.
-func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
 
@@ -464,7 +464,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint
 		q.addr = m.Address
 	}
 
-	var copied uintptr
+	var copied int64
 	if peek {
 		// Don't consume control message if we are peeking.
 		c := q.control.Clone()
@@ -531,7 +531,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint
 			break
 		}
 
-		var cpd uintptr
+		var cpd int64
 		cpd, data, q.buffer = vecCopy(data, q.buffer)
 		copied += cpd
 
@@ -569,7 +569,7 @@ type ConnectedEndpoint interface {
 	//
 	// syserr.ErrWouldBlock can be returned along with a partial write if
 	// the caller should block to send the rest of the data.
-	Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *syserr.Error)
+	Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
 
 	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
 	// must not be called while holding any endpoint locks.
@@ -637,7 +637,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
 }
 
 // Send implements ConnectedEndpoint.Send.
-func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
+func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
 	var l int64
 	for _, d := range data {
 		l += int64(len(d))
@@ -665,7 +665,7 @@ func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages,
 	}
 
 	l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate)
-	return uintptr(l), notify, err
+	return int64(l), notify, err
 }
 
 // SendNotify implements ConnectedEndpoint.SendNotify.
@@ -781,7 +781,7 @@ func (e *baseEndpoint) Connected() bool {
 }
 
 // RecvMsg reads data and a control message from the endpoint.
-func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, bool, *syserr.Error) {
+func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (int64, int64, ControlMessages, bool, *syserr.Error) {
 	e.Lock()
 
 	if e.receiver == nil {
@@ -807,7 +807,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n
 
 // SendMsg writes data and a control message to the endpoint's peer.
 // This method does not block if the data cannot be written.
-func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
+func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
 	e.Lock()
 	if !e.Connected() {
 		e.Unlock()
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index eb262ecaf..0d0cb68df 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -116,7 +116,7 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 
 // extractPath extracts and validates the address.
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
-	addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr, true /* strict */)
+	addr, _, err := epsocket.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
 	if err != nil {
 		return "", err
 	}
@@ -137,7 +137,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) {
 
 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.ep.GetRemoteAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -149,7 +149,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *sy
 
 // GetSockName implements the linux syscall getsockname(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.ep.GetLocalAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -166,7 +166,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	return epsocket.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
 }
 
@@ -199,7 +199,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
 
 // Accept implements the linux syscall accept(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	// Issue the accept request to get the new endpoint.
 	ep, err := s.ep.Accept()
 	if err != nil {
@@ -223,7 +223,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 		ns.SetFlags(flags.Settable())
 	}
 
-	var addr interface{}
+	var addr linux.SockAddr
 	var addrLen uint32
 	if peerRequested {
 		// Get address of the peer.
@@ -505,7 +505,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 
 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
 	trunc := flags&linux.MSG_TRUNC != 0
 	peek := flags&linux.MSG_PEEK != 0
 	dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -535,7 +535,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		Ctx:       t,
 		Endpoint:  s.ep,
 		Creds:     wantCreds,
-		NumRights: uintptr(numRights),
+		NumRights: numRights,
 		Peek:      peek,
 	}
 	if senderRequested {
@@ -543,7 +543,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	}
 	var total int64
 	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
-		var from interface{}
+		var from linux.SockAddr
 		var fromLen uint32
 		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
 			from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
@@ -578,7 +578,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 
 	for {
 		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
-			var from interface{}
+			var from linux.SockAddr
 			var fromLen uint32
 			if r.From != nil {
 				from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD
index f297ef3b7..88765f4d6 100644
--- a/pkg/sentry/state/BUILD
+++ b/pkg/sentry/state/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/time",
         "//pkg/sentry/watchdog",
         "//pkg/state/statefile",
         "//pkg/syserror",
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index 026549756..9eb626b76 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/state/statefile"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -104,7 +105,7 @@ type LoadOpts struct {
 }
 
 // Load loads the given kernel, setting the provided platform and stack.
-func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack) error {
+func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) error {
 	// Open the file.
 	r, m, err := statefile.NewReader(opts.Source, opts.Key)
 	if err != nil {
@@ -114,5 +115,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack) error {
 	previousMetadata = m
 
 	// Restore the Kernel object graph.
-	return k.LoadFrom(r, n)
+	return k.LoadFrom(r, n, clocks)
 }
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 386b40af7..f779186ad 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -332,7 +332,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
 
 	switch family {
 	case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
-		fa, err := epsocket.GetAddress(int(family), b, true /* strict */)
+		fa, _, err := epsocket.AddressAndFamily(int(family), b, true /* strict */)
 		if err != nil {
 			return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
 		}
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 264301bfa..1d9018c96 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -91,6 +91,10 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
 		// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
 		// also be sent to the application.
 		return nil
+	case syserror.ENOSPC:
+		// Similar to EPIPE. Return what we wrote this time, and let
+		// ENOSPC be returned on the next call.
+		return nil
 	case syserror.ECONNRESET:
 		// For TCP sendfile connections, we may have a reset. But we
 		// should just return n as the result.
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 51db2d8f7..ed996ba51 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -30,8 +30,7 @@ import (
 const _AUDIT_ARCH_X86_64 = 0xc000003e
 
 // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
-// numbers from Linux 4.4. The entries commented out are those syscalls we
-// don't currently support.
+// numbers from Linux 4.4.
 var AMD64 = &kernel.SyscallTable{
 	OS:   abi.Linux,
 	Arch: arch.AMD64,
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 4a2b9f061..65b4a227b 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -107,19 +107,20 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 // copyOutEvents copies epoll events from the kernel to user memory.
 func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
 	const itemLen = 12
-	if _, ok := addr.AddLength(uint64(len(e)) * itemLen); !ok {
+	buffLen := len(e) * itemLen
+	if _, ok := addr.AddLength(uint64(buffLen)); !ok {
 		return syserror.EFAULT
 	}
 
-	b := t.CopyScratchBuffer(itemLen)
+	b := t.CopyScratchBuffer(buffLen)
 	for i := range e {
-		usermem.ByteOrder.PutUint32(b[0:], e[i].Events)
-		usermem.ByteOrder.PutUint32(b[4:], uint32(e[i].Data[0]))
-		usermem.ByteOrder.PutUint32(b[8:], uint32(e[i].Data[1]))
-		if _, err := t.CopyOutBytes(addr, b); err != nil {
-			return err
-		}
-		addr += itemLen
+		usermem.ByteOrder.PutUint32(b[i*itemLen:], e[i].Events)
+		usermem.ByteOrder.PutUint32(b[i*itemLen+4:], uint32(e[i].Data[0]))
+		usermem.ByteOrder.PutUint32(b[i*itemLen+8:], uint32(e[i].Data[1]))
+	}
+
+	if _, err := t.CopyOutBytes(addr, b); err != nil {
+		return err
 	}
 
 	return nil
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 63e2c5a5d..912cbe4ff 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -120,7 +120,7 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
 				Ino: attr.InodeID,
 				Off: offset,
 			},
-			Typ: toType(attr.Type),
+			Typ: fs.ToDirentType(attr.Type),
 		},
 		Name: []byte(name),
 	}
@@ -142,28 +142,6 @@ func smallestDirent64(a arch.Context) uint {
 	return uint(binary.Size(d.Hdr)) + a.Width()
 }
 
-// toType converts an fs.InodeOperationsInfo to a linux dirent typ field.
-func toType(nodeType fs.InodeType) uint8 {
-	switch nodeType {
-	case fs.RegularFile, fs.SpecialFile:
-		return linux.DT_REG
-	case fs.Symlink:
-		return linux.DT_LNK
-	case fs.Directory, fs.SpecialDirectory:
-		return linux.DT_DIR
-	case fs.Pipe:
-		return linux.DT_FIFO
-	case fs.CharacterDevice:
-		return linux.DT_CHR
-	case fs.BlockDevice:
-		return linux.DT_BLK
-	case fs.Socket:
-		return linux.DT_SOCK
-	default:
-		return linux.DT_UNKNOWN
-	}
-}
-
 // padRec pads the name field until the rec length is a multiple of the width,
 // which must be a power of 2. It returns the padded rec length.
 func (d *dirent) padRec(width int) uint16 {
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
index 9080a10c3..8c13e2d82 100644
--- a/pkg/sentry/syscalls/linux/sys_mount.go
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -109,9 +109,17 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, syserror.EINVAL
 	}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		// Mount will take a reference on rootInode if successful.
 		return t.MountNamespace().Mount(t, d, rootInode)
-	})
+	}); err != nil {
+		// Something went wrong. Drop our ref on rootInode before
+		// returning the error.
+		rootInode.DecRef()
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
 }
 
 // Umount2 implements Linux syscall umount2(2).
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index b2474e60d..3ab54271c 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -191,7 +191,6 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 }
 
 // Preadv2 implements linux syscall preadv2(2).
-// TODO(b/120162627): Implement RWF_HIPRI functionality.
 func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	// While the syscall is
 	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
@@ -228,6 +227,8 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 
 	// Check flags field.
+	// Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+	// accepted as a valid flag argument for preadv2.
 	if flags&^linux.RWF_VALID != 0 {
 		return 0, nil, syserror.EOPNOTSUPP
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index fa568a660..3bac4d90d 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -460,7 +460,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	// Call syscall implementation then copy both value and value len out.
-	v, e := getSockOpt(t, s, int(level), int(name), int(optLen))
+	v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
 	if e != nil {
 		return 0, nil, e.ToError()
 	}
@@ -483,7 +483,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 // getSockOpt tries to handle common socket options, or dispatches to a specific
 // socket implementation.
-func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
 	if level == linux.SOL_SOCKET {
 		switch name {
 		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -505,7 +505,7 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac
 		}
 	}
 
-	return s.GetSockOpt(t, level, name, len)
+	return s.GetSockOpt(t, level, name, optValAddr, len)
 }
 
 // SetSockOpt implements the linux syscall setsockopt(2).
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index a7c98efcb..8a98fedcb 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -91,22 +91,29 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	// Get files.
+	inFile := t.GetFile(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+
+	if !inFile.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
 	outFile := t.GetFile(outFD)
 	if outFile == nil {
 		return 0, nil, syserror.EBADF
 	}
 	defer outFile.DecRef()
 
-	inFile := t.GetFile(inFD)
-	if inFile == nil {
+	if !outFile.Flags().Write {
 		return 0, nil, syserror.EBADF
 	}
-	defer inFile.DecRef()
 
-	// Verify that the outfile Append flag is not set. Note that fs.Splice
-	// itself validates that the output file is writable.
+	// Verify that the outfile Append flag is not set.
 	if outFile.Flags().Append {
-		return 0, nil, syserror.EBADF
+		return 0, nil, syserror.EINVAL
 	}
 
 	// Verify that we have a regular infile. This is a requirement; the
@@ -207,6 +214,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			return 0, nil, syserror.ESPIPE
 		}
 		if outOffset != 0 {
+			if !outFile.Flags().Pwrite {
+				return 0, nil, syserror.EINVAL
+			}
+
 			var offset int64
 			if _, err := t.CopyIn(outOffset, &offset); err != nil {
 				return 0, nil, err
@@ -220,6 +231,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			return 0, nil, syserror.ESPIPE
 		}
 		if inOffset != 0 {
+			if !inFile.Flags().Pread {
+				return 0, nil, syserror.EINVAL
+			}
+
 			var offset int64
 			if _, err := t.CopyIn(inOffset, &offset); err != nil {
 				return 0, nil, err
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 595eb9155..8ab7ffa25 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -96,7 +96,7 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	// Load the new TaskContext.
 	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet())
+	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, nil, argv, envv, t.Arch().FeatureSet())
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 5278c96a6..27cd2c336 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -191,7 +191,6 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 }
 
 // Pwritev2 implements linux syscall pwritev2(2).
-// TODO(b/120162627): Implement RWF_HIPRI functionality.
 // TODO(b/120161091): Implement O_SYNC and D_SYNC functionality.
 func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	// While the syscall is
@@ -227,6 +226,8 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		return 0, nil, syserror.ESPIPE
 	}
 
+	// Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+	// accepted as a valid flag argument for pwritev2.
 	if flags&^linux.RWF_VALID != 0 {
 		return uintptr(flags), nil, syserror.EOPNOTSUPP
 	}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 4de6c41cf..0f247bf77 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -18,6 +18,7 @@ go_library(
         "permissions.go",
         "resolving_path.go",
         "syscalls.go",
+        "testutil.go",
         "vfs.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
@@ -40,7 +41,16 @@ go_test(
     name = "vfs_test",
     size = "small",
     srcs = [
+        "file_description_impl_util_test.go",
         "mount_test.go",
     ],
     embed = [":vfs"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/usermem",
+        "//pkg/syserror",
+    ],
 )
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 486893e70..ba230da72 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -15,6 +15,10 @@
 package vfs
 
 import (
+	"bytes"
+	"io"
+	"sync"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +28,16 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// The following design pattern is strongly recommended for filesystem
+// implementations to adopt:
+//   - Have a local fileDescription struct (containing FileDescription) which
+//     embeds FileDescriptionDefaultImpl and overrides the default methods
+//     which are common to all fd implementations for that filesystem
+//     like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+//   - This should be embedded in all file description implementations as the
+//     first field by value.
+//   - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
+
 // FileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
 // methods with default behavior analogous to Linux's.
@@ -115,11 +129,8 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
-// implementations of non-directory I/O methods that return EISDIR, and
-// implementations of other methods consistent with FileDescriptionDefaultImpl.
-type DirectoryFileDescriptionDefaultImpl struct {
-	FileDescriptionDefaultImpl
-}
+// implementations of non-directory I/O methods that return EISDIR.
+type DirectoryFileDescriptionDefaultImpl struct{}
 
 // PRead implements FileDescriptionImpl.PRead.
 func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
@@ -140,3 +151,104 @@ func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src userm
 func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
 	return 0, syserror.EISDIR
 }
+
+// DynamicBytesFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl that represent read-only regular files whose contents
+// are backed by a bytes.Buffer that is regenerated when necessary, consistent
+// with Linux's fs/seq_file.c:single_open().
+//
+// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
+// use.
+type DynamicBytesFileDescriptionImpl struct {
+	data     DynamicBytesSource // immutable
+	mu       sync.Mutex         // protects the following fields
+	buf      bytes.Buffer
+	off      int64 // offset used by Read; advanced by Read and set by Seek
+	lastRead int64 // offset at which the last Read, PRead, or Seek ended
+}
+
+// DynamicBytesSource represents a data source for a
+// DynamicBytesFileDescriptionImpl.
+type DynamicBytesSource interface {
+	// Generate writes the file's contents to buf.
+	Generate(ctx context.Context, buf *bytes.Buffer) error
+}
+
+// SetDataSource must be called exactly once on fd before first use.
+func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
+	fd.data = data
+}
+
+// preadLocked copies file data at offset to dst. Preconditions: fd.mu is locked.
+func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) {
+	// Regenerate the buffer if it's empty, or before pread() at a new offset.
+	// Compare fs/seq_file.c:seq_read() => traverse().
+	switch {
+	case offset != fd.lastRead:
+		fd.buf.Reset()
+		fallthrough
+	case fd.buf.Len() == 0:
+		if err := fd.data.Generate(ctx, &fd.buf); err != nil {
+			fd.buf.Reset()
+			// fd.off is not updated in this case.
+			fd.lastRead = 0
+			return 0, err
+		}
+	}
+	bs := fd.buf.Bytes()
+	if offset >= int64(len(bs)) {
+		return 0, io.EOF
+	}
+	n, err := dst.CopyOut(ctx, bs[offset:])
+	fd.lastRead = offset + int64(n)
+	return int64(n), err
+}
+
+// PRead implements FileDescriptionImpl.PRead.
+func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.preadLocked(ctx, dst, offset, &opts)
+	fd.mu.Unlock()
+	return n, err
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.preadLocked(ctx, dst, fd.off, &opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
+		// fs/seq_file:seq_lseek() rejects SEEK_END etc.
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if offset != fd.lastRead {
+		// Regenerate the file's contents immediately. Compare
+		// fs/seq_file.c:seq_lseek() => traverse().
+		fd.buf.Reset()
+		if err := fd.data.Generate(ctx, &fd.buf); err != nil {
+			fd.buf.Reset()
+			fd.off = 0
+			fd.lastRead = 0
+			return 0, err
+		}
+		fd.lastRead = offset
+	}
+	fd.off = offset
+	return offset, nil
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
new file mode 100644
index 000000000..511b829fc
--- /dev/null
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -0,0 +1,141 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"sync/atomic"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// fileDescription is the common fd struct which a filesystem implementation
+// embeds in all of its file description implementations as required.
+type fileDescription struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+}
+
+// genCountFD is a read-only FileDescriptionImpl representing a regular file
+// that contains the number of times its DynamicBytesSource.Generate()
+// implementation has been called.
+type genCountFD struct {
+	fileDescription
+	DynamicBytesFileDescriptionImpl
+
+	count uint64 // accessed using atomic memory ops
+}
+
+func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
+	var fd genCountFD
+	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
+	return &fd.vfsfd
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (fd *genCountFD) Release() {
+}
+
+// StatusFlags implements FileDescriptionImpl.StatusFlags.
+func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) {
+	return 0, nil
+}
+
+// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
+func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error {
+	return syserror.EPERM
+}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	// Note that Statx.Mask == 0 in the return value.
+	return linux.Statx{}, nil
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// Generate implements DynamicBytesSource.Generate.
+func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1))
+	return nil
+}
+
+func TestGenCountFD(t *testing.T) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := New() // vfs.New()
+	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create testfs root mount: %v", err)
+	}
+	vd := mntns.Root()
+	defer vd.DecRef()
+
+	fd := newGenCountFD(vd.Mount(), vd.Dentry())
+	defer fd.DecRef()
+
+	// The first read causes Generate to be called to fill the FD's buffer.
+	buf := make([]byte, 2)
+	ioseq := usermem.BytesIOSequence(buf)
+	n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	if n != 1 || (err != nil && err != io.EOF) {
+		t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
+	}
+	if want := byte('1'); buf[0] != want {
+		t.Errorf("first Read: got byte %c, wanted %c", buf[0], want)
+	}
+
+	// A second read without seeking is still at EOF.
+	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	if n != 0 || err != io.EOF {
+		t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
+	}
+
+	// Seeking to the beginning of the file causes it to be regenerated.
+	n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+	if n != 0 || err != nil {
+		t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
+	}
+	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	if n != 1 || (err != nil && err != io.EOF) {
+		t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
+	}
+	if want := byte('2'); buf[0] != want {
+		t.Errorf("Read after Seek: got byte %c, wanted %c", buf[0], want)
+	}
+
+	// PRead at the beginning of the file also causes it to be regenerated.
+	n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+	if n != 1 || (err != nil && err != io.EOF) {
+		t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
+	}
+	if want := byte('3'); buf[0] != want {
+		t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
+	}
+}
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
new file mode 100644
index 000000000..70b192ece
--- /dev/null
+++ b/pkg/sentry/vfs/testutil.go
@@ -0,0 +1,139 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems
+// for which all FilesystemImpl methods taking a path return EPERM. It is used
+// to produce Mounts and Dentries for testing of FileDescriptionImpls that do
+// not depend on their originating Filesystem.
+type FDTestFilesystemType struct{}
+
+// FDTestFilesystem is a test-only FilesystemImpl produced by
+// FDTestFilesystemType.
+type FDTestFilesystem struct {
+	vfsfs Filesystem
+}
+
+// NewFilesystem implements FilesystemType.NewFilesystem.
+func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) {
+	var fs FDTestFilesystem
+	fs.vfsfs.Init(&fs)
+	return &fs.vfsfs, fs.NewDentry(), nil
+}
+
+// Release implements FilesystemImpl.Release.
+func (fs *FDTestFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync.
+func (fs *FDTestFilesystem) Sync(ctx context.Context) error {
+	return nil
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt.
+func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// LinkAt implements FilesystemImpl.LinkAt.
+func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+	return syserror.EPERM
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt.
+func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+	return syserror.EPERM
+}
+
+// MknodAt implements FilesystemImpl.MknodAt.
+func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+	return syserror.EPERM
+}
+
+// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+	return nil, syserror.EPERM
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
+func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+	return "", syserror.EPERM
+}
+
+// RenameAt implements FilesystemImpl.RenameAt.
+func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
+	return syserror.EPERM
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt.
+func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+	return syserror.EPERM
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+	return linux.Statx{}, syserror.EPERM
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+	return linux.Statfs{}, syserror.EPERM
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt.
+func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+	return syserror.EPERM
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt.
+func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+	return syserror.EPERM
+}
+
+type fdTestDentry struct {
+	vfsd Dentry
+}
+
+// NewDentry returns a new Dentry.
+func (fs *FDTestFilesystem) NewDentry() *Dentry {
+	var d fdTestDentry
+	d.vfsd.Init(&d)
+	return &d.vfsd
+}
+
+// IncRef implements DentryImpl.IncRef.
+func (d *fdTestDentry) IncRef(vfsfs *Filesystem) {
+}
+
+// TryIncRef implements DentryImpl.TryIncRef.
+func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool {
+	return true
+}
+
+// DecRef implements DentryImpl.DecRef.
+func (d *fdTestDentry) DecRef(vfsfs *Filesystem) {
+}