23 files changed, 708 insertions, 105 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 26f68fe3d..5451f1eba 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -21,6 +21,7 @@ go_library(
         "network.go",
         "strace.go",
         "user.go",
+        "vfs.go",
     ],
     visibility = [
         "//runsc:__subpackages__",
@@ -33,6 +34,7 @@ go_library(
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
@@ -40,6 +42,7 @@ go_library(
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/control",
+        "//pkg/sentry/devices/memdev",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/dev",
         "//pkg/sentry/fs/gofer",
@@ -49,6 +52,12 @@ go_library(
         "//pkg/sentry/fs/sys",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/gofer",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/proc",
+        "//pkg/sentry/fsimpl/sys",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel:uncaught_signal_go_proto",
@@ -71,6 +80,7 @@ go_library(
         "//pkg/sentry/time",
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
         "//pkg/syserror",
@@ -114,10 +124,12 @@ go_test(
         "//pkg/p9",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sync",
         "//pkg/unet",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 8995d678e..b7cfb35bf 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -65,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) {
 
 	if logFD > 0 {
 		f := os.NewFile(uintptr(logFD), "user log file")
-		target := &log.MultiEmitter{c.sink, &log.K8sJSONEmitter{log.Writer{Next: f}}}
+		target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}}
 		c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
 	}
 	return c, nil
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 7ea5bfade..715a19112 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -305,5 +305,10 @@ func (c *Config) ToFlags() []string {
 	if len(c.TestOnlyTestNameEnv) != 0 {
 		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
 	}
+
+	if c.VFS2 {
+		f = append(f, "--vfs2=true")
+	}
+
 	return f
 }
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 5314b0f2a..7e49f6f9f 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
 
@@ -31,6 +32,10 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
 
+	if kernel.VFS2Enabled {
+		return createFDTableVFS2(ctx, console, stdioFDs)
+	}
+
 	k := kernel.KernelFromContext(ctx)
 	fdTable := k.NewFDTable()
 	defer fdTable.DecRef()
@@ -78,3 +83,31 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 	fdTable.IncRef()
 	return fdTable, nil
 }
+
+func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
+	k := kernel.KernelFromContext(ctx)
+	fdTable := k.NewFDTable()
+	defer fdTable.DecRef()
+
+	hostMount, err := vfshost.NewMount(k.VFS())
+	if err != nil {
+		return nil, fmt.Errorf("creating host mount: %w", err)
+	}
+
+	for appFD, hostFD := range stdioFDs {
+		// TODO(gvisor.dev/issue/1482): Add TTY support.
+		appFile, err := vfshost.ImportFD(hostMount, hostFD, false)
+		if err != nil {
+			return nil, err
+		}
+
+		if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
+			appFile.DecRef()
+			return nil, err
+		}
+		appFile.DecRef()
+	}
+
+	fdTable.IncRef()
+	return fdTable, nil
+}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 06b9f888a..1828d116a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -44,7 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.AllowValue(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 0f62842ea..98cce60af 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -278,6 +278,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
 }
 
 func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if conf.VFS2 {
+		return setupContainerVFS2(ctx, conf, mntr, procArgs)
+	}
 	mns, err := mntr.setupFS(conf, procArgs)
 	if err != nil {
 		return err
@@ -573,6 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
 func (c *containerMounter) processHints(conf *Config) error {
+	if conf.VFS2 {
+		return nil
+	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
 		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
@@ -781,9 +787,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
-		// TODO(nlacasse): Support all the mount types and make this a fatal error.
-		// Most applications will "just work" without them, so this is a warning
-		// for now.
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
 	return fsName, opts, useOverlay, nil
@@ -824,7 +827,20 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 
 	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
 	if err != nil {
-		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		// Check to see if this is a common error due to a Linux bug.
+		// This error is generated here in order to cause it to be
+		// printed to the user using Docker via 'runsc create' etc. rather
+		// than simply printed to the logs for the 'runsc boot' command.
+		//
+		// We check the error message string rather than type because the
+		// actual error types (syscall.EIO, syscall.EPIPE) are lost by file system
+		// implementation (e.g. p9).
+		// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
+		if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
+			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
+		}
+		return err
 	}
 
 	// If there are submounts, we need to overlay the mount on top of a ramfs
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index e7ca98134..cf1f47bc7 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -26,7 +26,6 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
@@ -73,6 +72,8 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
+var syscallTable *kernel.SyscallTable
+
 // Loader keeps state needed to start the kernel and run the container..
 type Loader struct {
 	// k is the kernel.
@@ -156,13 +157,17 @@ type Args struct {
 	Spec *specs.Spec
 	// Conf is the system configuration.
 	Conf *Config
-	// ControllerFD is the FD to the URPC controller.
+	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
+	// of this FD and may close it at any time.
 	ControllerFD int
-	// Device is an optional argument that is passed to the platform.
+	// Device is an optional argument that is passed to the platform. The Loader
+	// takes ownership of this file and may close it at any time.
 	Device *os.File
-	// GoferFDs is an array of FDs used to connect with the Gofer.
+	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
+	// takes ownership of these FDs and may close them at any time.
 	GoferFDs []int
-	// StdioFDs is the stdio for the application.
+	// StdioFDs is the stdio for the application. The Loader takes ownership of
+	// these FDs and may close them at any time.
 	StdioFDs []int
 	// Console is set to true if using TTY.
 	Console bool
@@ -175,6 +180,9 @@ type Args struct {
 	UserLogFD int
 }
 
+// startingStdioFD is the first FD number stdio FDs are dup'ed to, so they match on initial start and on restore.
+const startingStdioFD = 64
+
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
@@ -188,13 +196,14 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("setting up memory usage: %v", err)
 	}
 
-	if args.Conf.VFS2 {
-		st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host)
-		if ok {
-			vfs2.Override(st.Table)
-		}
+	// Patch the syscall table.
+	kernel.VFS2Enabled = args.Conf.VFS2
+	if kernel.VFS2Enabled {
+		vfs2.Override(syscallTable.Table)
 	}
 
+	kernel.RegisterSyscallTable(syscallTable)
+
 	// Create kernel and platform.
 	p, err := createPlatform(args.Conf, args.Device)
 	if err != nil {
@@ -319,6 +328,24 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("creating pod mount hints: %v", err)
 	}
 
+	// Make host FDs stable between invocations. Host FDs must map to the exact
+	// same number when the sandbox is restored. Otherwise the wrong FD will be
+	// used.
+	var stdioFDs []int
+	newfd := startingStdioFD
+	for _, fd := range args.StdioFDs {
+		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+		}
+		stdioFDs = append(stdioFDs, newfd)
+		err = syscall.Close(fd)
+		if err != nil {
+			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+		}
+		newfd++
+	}
+
 	eid := execID{cid: args.ID}
 	l := &Loader{
 		k:            k,
@@ -327,7 +354,7 @@ func New(args Args) (*Loader, error) {
 		watchdog:     dog,
 		spec:         args.Spec,
 		goferFDs:     args.GoferFDs,
-		stdioFDs:     args.StdioFDs,
+		stdioFDs:     stdioFDs,
 		rootProcArgs: procArgs,
 		sandboxID:    args.ID,
 		processes:    map[execID]*execProcess{eid: {}},
@@ -367,11 +394,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.
 		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
 	}
 
+	wd := spec.Process.Cwd
+	if wd == "" {
+		wd = "/"
+	}
+
 	// Create the process arguments.
 	procArgs := kernel.CreateProcessArgs{
 		Argv:                    spec.Process.Args,
 		Envv:                    spec.Process.Env,
-		WorkingDirectory:        spec.Process.Cwd, // Defaults to '/' if empty.
+		WorkingDirectory:        wd,
 		Credentials:             creds,
 		Umask:                   0022,
 		Limits:                  ls,
@@ -516,7 +548,15 @@ func (l *Loader) run() error {
 		}
 
 		// Add the HOME enviroment variable if it is not already set.
-		envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		var envv []string
+		if kernel.VFS2Enabled {
+			envv, err = maybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+
+		} else {
+			envv, err = maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		}
 		if err != nil {
 			return err
 		}
@@ -569,6 +609,16 @@ func (l *Loader) run() error {
 		}
 	})
 
+	// l.stdioFDs were dup()ed in boot.New(), and have now been dup()ed again,
+	// either in createFDTable() during initial start or in
+	// descriptor.initAfterLoad() during restore, so they can be released.
+	for _, fd := range l.stdioFDs {
+		err := syscall.Close(fd)
+		if err != nil {
+			return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+		}
+	}
+
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
index b9669f2ac..78df86611 100644
--- a/runsc/boot/loader_amd64.go
+++ b/runsc/boot/loader_amd64.go
@@ -17,11 +17,10 @@
 package boot
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 )
 
 func init() {
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(linux.AMD64)
+	// Set the global syscall table.
+	syscallTable = linux.AMD64
 }
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
index cf64d28c8..250785010 100644
--- a/runsc/boot/loader_arm64.go
+++ b/runsc/boot/loader_arm64.go
@@ -17,11 +17,10 @@
 package boot
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 )
 
 func init() {
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(linux.ARM64)
+	// Set the global syscall table.
+	syscallTable = linux.ARM64
 }
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 44aa63196..e7c71734f 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -24,11 +24,13 @@ import (
 	"time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/fsgofer"
@@ -65,6 +67,11 @@ func testSpec() *specs.Spec {
 	}
 }
 
+func resetSyscallTable() {
+	kernel.VFS2Enabled = false
+	kernel.FlushSyscallTablesTestOnly()
+}
+
 // startGofer starts a new gofer routine serving 'root' path. It returns the
 // sandbox side of the connection, and a function that when called will stop the
 // gofer.
@@ -100,7 +107,7 @@ func startGofer(root string) (int, func(), error) {
 	return sandboxEnd, cleanup, nil
 }
 
-func createLoader() (*Loader, func(), error) {
+func createLoader(vfsEnabled bool) (*Loader, func(), error) {
 	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
 	if err != nil {
 		return nil, nil, err
@@ -108,12 +115,23 @@ func createLoader() (*Loader, func(), error) {
 	conf := testConfig()
 	spec := testSpec()
 
+	conf.VFS2 = vfsEnabled
+
 	sandEnd, cleanup, err := startGofer(spec.Root.Path)
 	if err != nil {
 		return nil, nil, err
 	}
 
-	stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())}
+	// Loader takes ownership of stdio.
+	var stdio []int
+	for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+		newFd, err := unix.Dup(int(f.Fd()))
+		if err != nil {
+			return nil, nil, err
+		}
+		stdio = append(stdio, newFd)
+	}
+
 	args := Args{
 		ID:           "foo",
 		Spec:         spec,
@@ -132,10 +150,22 @@ func createLoader() (*Loader, func(), error) {
 
 // TestRun runs a simple application in a sandbox and checks that it succeeds.
 func TestRun(t *testing.T) {
-	l, cleanup, err := createLoader()
+	defer resetSyscallTable()
+	doRun(t, false)
+}
+
+// TestRunVFS2 runs TestRun in VFSv2.
+func TestRunVFS2(t *testing.T) {
+	defer resetSyscallTable()
+	doRun(t, true)
+}
+
+func doRun(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled)
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
+
 	defer l.Destroy()
 	defer cleanup()
 
@@ -169,7 +199,18 @@ func TestRun(t *testing.T) {
 // TestStartSignal tests that the controller Start message will cause
 // WaitForStartSignal to return.
 func TestStartSignal(t *testing.T) {
-	l, cleanup, err := createLoader()
+	defer resetSyscallTable()
+	doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 does TestStartSignal with VFS2.
+func TestStartSignalVFS2(t *testing.T) {
+	defer resetSyscallTable()
+	doStartSignal(t, true)
+}
+
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled)
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index f0aa52135..332e4fce5 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -23,8 +23,10 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -84,6 +86,48 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
 		File: f,
 	}
 
+	return findHomeInPasswd(uint32(uid), r, defaultHome)
+}
+
+type fileReaderVFS2 struct {
+	ctx context.Context
+	fd  *vfs.FileDescription
+}
+
+func (r *fileReaderVFS2) Read(buf []byte) (int, error) {
+	n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	return int(n), err
+}
+
+func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) {
+	const defaultHome = "/"
+
+	root := mns.Root()
+	defer root.DecRef()
+
+	creds := auth.CredentialsFromContext(ctx)
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/etc/passwd"),
+	}
+
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}
+
+	fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts)
+	if err != nil {
+		return defaultHome, nil
+	}
+	defer fd.DecRef()
+
+	r := &fileReaderVFS2{
+		ctx: ctx,
+		fd:  fd,
+	}
+
 	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
 	if err != nil {
 		return "", err
@@ -111,6 +155,26 @@ func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.
 	if err != nil {
 		return nil, fmt.Errorf("error reading exec user: %v", err)
 	}
+
+	return append(envv, "HOME="+homeDir), nil
+}
+
+func maybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
 	return append(envv, "HOME="+homeDir), nil
 }
 
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..82083c57d
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,310 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path"
+	"strconv"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+	// Register every filesystem type that the root or a submount may be mounted as.
+	vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+
+	vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+	// NOTE(review): devpts below is backed by devtmpfsimpl and nonefs by sysimpl — confirm these stand-ins are intended.
+	vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	// Setup files in devtmpfs.
+	if err := memdev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering memdev: %w", err)
+	}
+	a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name)
+	if err != nil {
+		return fmt.Errorf("creating devtmpfs accessor: %w", err)
+	}
+	defer a.Release()
+
+	if err := a.UserspaceInit(ctx); err != nil {
+		return fmt.Errorf("initializing userspace: %w", err)
+	}
+	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating devtmpfs files: %w", err)
+	}
+	return nil
+}
+
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if err := mntr.k.VFS().Init(); err != nil {
+		return fmt.Errorf("failed to initialize VFS: %w", err)
+	}
+	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to setupFS: %w", err)
+	}
+	procArgs.MountNamespaceVFS2 = mns
+	return setExecutablePathVFS2(ctx, procArgs)
+}
+
+// setExecutablePathVFS2 sets procArgs.Filename to the path of the executable
+// named by procArgs.Argv[0]. Absolute names are used as is, names containing
+// a '/' are joined to the working directory (which must be absolute), and bare
+// names are searched for in the $PATH directories taken from procArgs.Envv. An
+// error is returned when Argv is empty or no usable executable is found.
+func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
+	if len(procArgs.Argv) == 0 {
+		return fmt.Errorf("no executable specified: Argv is empty")
+	}
+	exe := procArgs.Argv[0]
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(exe) {
+		procArgs.Filename = exe
+		return nil
+	}
+
+	// Paths with '/' in them are joined to the working directory, which must
+	// itself be absolute.
+	if strings.IndexByte(exe, '/') > 0 {
+		if !path.IsAbs(procArgs.WorkingDirectory) {
+			return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
+		}
+		procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
+		return nil
+	}
+
+	// Otherwise, we must look up the name in the paths, starting from the
+	// root directory.
+	root := procArgs.MountNamespaceVFS2.Root()
+	defer root.DecRef()
+
+	paths := fs.GetPath(procArgs.Envv)
+	creds := procArgs.Credentials
+
+	for _, p := range paths {
+		binPath := path.Join(p, exe)
+
+		pop := &vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               fspath.Parse(binPath),
+			FollowFinalSymlink: true,
+		}
+
+		opts := &vfs.OpenOptions{
+			FileExec: true,
+			Flags:    linux.O_RDONLY,
+		}
+
+		// Opening the candidate is only a probe: the resulting file description
+		// is released as soon as the open is known to have succeeded.
+		fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return err
+		}
+		fd.DecRef()
+
+		procArgs.Filename = binPath
+		return nil
+	}
+
+	return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":"))
+}
+
+// setupVFS2 registers the VFS2 filesystem types, mounts the container root over
+// 9P and then mounts the submounts. It returns the new mount namespace.
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+	log.Infof("Configuring container's file system with VFS2")
+
+	// Create a context from a root-credentialed copy of procArgs to mount the
+	// filesystem (the current user may not be privileged enough).
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := rootProcArgs.NewContext(c.k)
+
+	creds := procArgs.Credentials
+	if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil {
+		return nil, fmt.Errorf("register filesystems: %w", err)
+	}
+
+	fd := c.fds.remove()
+	opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mountnamespace: %w", err)
+	}
+	// Record the namespace on rootProcArgs, which rootCtx was built from.
+	rootProcArgs.MountNamespaceVFS2 = mns
+
+	// Mount submounts.
+	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil {
+		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+	}
+
+	return mns, nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+	// Mount each entry of c.mounts in order; the first failure aborts the setup.
+	for _, submount := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil {
+			return err
+		}
+	}
+
+	// TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
+
+	return c.checkDispenser()
+}
+
+// mountSubmountVFS2 mounts submount at its destination inside mns, skipping
+// filesystem types that getMountNameAndOptionsVFS2 reports as unknown.
+// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version.
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error {
+	root := mns.Root()
+	defer root.DecRef()
+	target := &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(submount.Destination)}
+
+	fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount)
+	if err != nil {
+		return fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if fsName == "" {
+		// Unknown type: a warning was already logged, so skip it instead of failing.
+		return nil
+	}
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(options, ","),
+		},
+		InternalMount: true,
+		// All writes go to upper, be paranoid and make lower readonly.
+		ReadOnly: useOverlay,
+	}
+	if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
+		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	}
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts)
+	return nil
+}
+
+// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
+// used for mounts. Unknown mount types are logged and yield an empty fsName.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+	)
+
+	switch m.Type {
+	case devpts, devtmpfs, proc, sysfs:
+		fsName = m.Type
+	case nonefs:
+		fsName = sysfs
+	case tmpfs:
+		fsName = m.Type
+
+		var err error
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		if err != nil {
+			return "", nil, false, err
+		}
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = "9p"
+		opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m))
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, nil
+}
+
+// p9MountOptionsVFS2 creates a slice of options for a p9 mount.
+// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in
+// fs.go when privateunixsocket lands.
+func p9MountOptionsVFS2(fd int, fa FileAccessType) []string {
+	opts := []string{
+		"trans=fd",
+		"rfdno=" + strconv.Itoa(fd),
+		"wfdno=" + strconv.Itoa(fd),
+	}
+	if fa == FileAccessShared {
+		opts = append(opts, "cache=remote_revalidating")
+	}
+	return opts
+}
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index 0c27f7313..9360d7442 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -85,7 +85,7 @@ func TestCapabilities(t *testing.T) {
 		Inheritable: caps,
 	}
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 
 	// Use --network=host to make sandbox use spec's capabilities.
 	conf.Network = boot.NetworkHost
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 02e5af3d3..28f0d54b9 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -272,9 +272,8 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	root := spec.Root.Path
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		// FIXME: runsc can't be re-executed without
-		// /proc, so we create a tmpfs mount, mount ./proc and ./root
-		// there, then move this mount to the root and after
+		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
+		// mount ./proc and ./root there, then move this mount to the root and after
 		// setCapsAndCallSelf, runsc will chroot into /root.
 		//
 		// We need a directory to construct a new root and we know that
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 651615d4c..af245b6d8 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -118,7 +118,7 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
 
 // Test that an pty FD is sent over the console socket if one is provided.
 func TestConsoleSocket(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 		spec := testutil.NewSpecWithArgs("true")
 		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
@@ -163,7 +163,7 @@ func TestConsoleSocket(t *testing.T) {
 // Test that job control signals work on a console created with "exec -ti".
 func TestJobControlSignalExec(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
@@ -286,7 +286,7 @@ func TestJobControlSignalExec(t *testing.T) {
 
 // Test that job control signals work on a console created with "run -ti".
 func TestJobControlSignalRootContainer(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	// Don't let bash execute from profile or rc files, otherwise our PID
 	// counts get messed up.
 	spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc")
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 442e80ac0..5db6d64aa 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -251,12 +251,12 @@ var noOverlay = []configOption{kvm, nonExclusiveFS}
 var all = append(noOverlay, overlay)
 
 // configs generates different configurations to run tests.
-func configs(opts ...configOption) []*boot.Config {
+func configs(t *testing.T, opts ...configOption) []*boot.Config {
 	// Always load the default config.
-	cs := []*boot.Config{testutil.TestConfig()}
+	cs := []*boot.Config{testutil.TestConfig(t)}
 
 	for _, o := range opts {
-		c := testutil.TestConfig()
+		c := testutil.TestConfig(t)
 		switch o {
 		case overlay:
 			c.Overlay = true
@@ -285,7 +285,7 @@ func TestLifecycle(t *testing.T) {
 	childReaper.Start()
 	defer childReaper.Stop()
 
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 		// The container will just sleep for a long time.  We will kill it before
 		// it finishes sleeping.
@@ -457,7 +457,7 @@ func TestExePath(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 		for _, test := range []struct {
 			path    string
@@ -521,9 +521,19 @@ func TestExePath(t *testing.T) {
 
 // Test the we can retrieve the application exit status from the container.
 func TestAppExitStatus(t *testing.T) {
+	doAppExitStatus(t, false)
+}
+
+// TestAppExitStatusVFS2 is TestAppExitStatus with VFSv2 enabled.
+func TestAppExitStatusVFS2(t *testing.T) {
+	doAppExitStatus(t, true)
+}
+
+func doAppExitStatus(t *testing.T, vfs2 bool) {
 	// First container will succeed.
 	succSpec := testutil.NewSpecWithArgs("true")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
 	rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -573,7 +583,7 @@ func TestAppExitStatus(t *testing.T) {
 
 // TestExec verifies that a container can exec a new program.
 func TestExec(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		const uid = 343
@@ -667,7 +677,7 @@ func TestExec(t *testing.T) {
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		app, err := testutil.FindFile("runsc/container/test_app/test_app")
@@ -743,7 +753,7 @@ func TestKillPid(t *testing.T) {
 // be the next consecutive number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -904,7 +914,7 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1042,7 +1052,7 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Run(fmt.Sprintf("conf: %+v", conf), func(t *testing.T) {
 			t.Logf("Running test with conf: %+v", conf)
 
@@ -1123,7 +1133,7 @@ func TestPauseResume(t *testing.T) {
 // occurs given the correct state.
 func TestPauseResumeStatus(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("sleep", "20")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1189,7 +1199,7 @@ func TestCapabilities(t *testing.T) {
 	uid := auth.KUID(os.Getuid() + 1)
 	gid := auth.KGID(os.Getgid() + 1)
 
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		spec := testutil.NewSpecWithArgs("sleep", "100")
@@ -1278,7 +1288,7 @@ func TestCapabilities(t *testing.T) {
 // TestRunNonRoot checks that sandbox can be configured when running as
 // non-privileged user.
 func TestRunNonRoot(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		spec := testutil.NewSpecWithArgs("/bin/true")
@@ -1322,7 +1332,7 @@ func TestRunNonRoot(t *testing.T) {
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		root, err := ioutil.TempDir(testutil.TmpDir(), "root")
@@ -1351,7 +1361,7 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
@@ -1389,7 +1399,7 @@ func TestReadonlyRoot(t *testing.T) {
 }
 
 func TestUIDMap(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 		testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
 		if err != nil {
@@ -1470,7 +1480,7 @@ func TestUIDMap(t *testing.T) {
 }
 
 func TestReadonlyMount(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
@@ -1527,7 +1537,7 @@ func TestAbbreviatedIDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	cids := []string{
@@ -1585,7 +1595,7 @@ func TestAbbreviatedIDs(t *testing.T) {
 
 func TestGoferExits(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1654,7 +1664,7 @@ func TestRootNotMount(t *testing.T) {
 	spec.Root.Readonly = true
 	spec.Mounts = nil
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	if err := run(spec, conf); err != nil {
 		t.Fatalf("error running sandbox: %v", err)
 	}
@@ -1668,7 +1678,7 @@ func TestUserLog(t *testing.T) {
 
 	// sched_rr_get_interval = 148 - not implemented in gvisor.
 	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1708,7 +1718,7 @@ func TestUserLog(t *testing.T) {
 }
 
 func TestWaitOnExitedSandbox(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		// Run a shell that sleeps for 1 second and then exits with a
@@ -1763,7 +1773,7 @@ func TestWaitOnExitedSandbox(t *testing.T) {
 
 func TestDestroyNotStarted(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1790,7 +1800,7 @@ func TestDestroyNotStarted(t *testing.T) {
 func TestDestroyStarting(t *testing.T) {
 	for i := 0; i < 10; i++ {
 		spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
-		conf := testutil.TestConfig()
+		conf := testutil.TestConfig(t)
 		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
@@ -1835,7 +1845,7 @@ func TestDestroyStarting(t *testing.T) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
@@ -1908,7 +1918,7 @@ func TestMountPropagation(t *testing.T) {
 		},
 	}
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1959,7 +1969,7 @@ func TestMountPropagation(t *testing.T) {
 }
 
 func TestMountSymlink(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
@@ -2039,7 +2049,7 @@ func TestNetRaw(t *testing.T) {
 	}
 
 	for _, enableRaw := range []bool{true, false} {
-		conf := testutil.TestConfig()
+		conf := testutil.TestConfig(t)
 		conf.EnableRaw = enableRaw
 
 		test := "--enabled"
@@ -2056,7 +2066,7 @@ func TestNetRaw(t *testing.T) {
 
 // TestOverlayfsStaleRead most basic test that '--overlayfs-stale-read' works.
 func TestOverlayfsStaleRead(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.OverlayfsStaleRead = true
 
 	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
@@ -2120,7 +2130,7 @@ func TestTTYField(t *testing.T) {
 
 	for _, test := range testCases {
 		t.Run(test.name, func(t *testing.T) {
-			conf := testutil.TestConfig()
+			conf := testutil.TestConfig(t)
 
 			// We will run /bin/sleep, possibly with an open TTY.
 			cmd := []string{"/bin/sleep", "10000"}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 2da93ec5b..dc2fb42ce 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -135,7 +135,7 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 // TestMultiContainerSanity checks that it is possible to run 2 dead-simple
 // containers in the same sandbox.
 func TestMultiContainerSanity(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -173,7 +173,7 @@ func TestMultiContainerSanity(t *testing.T) {
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -218,7 +218,7 @@ func TestMultiPIDNS(t *testing.T) {
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -289,7 +289,7 @@ func TestMultiContainerWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// The first container should run the entire duration of the test.
@@ -367,7 +367,7 @@ func TestExecWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// The first container should run the entire duration of the test.
@@ -463,7 +463,7 @@ func TestMultiContainerMount(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	containers, cleanup, err := startContainers(conf, sps, ids)
@@ -484,7 +484,7 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -585,7 +585,7 @@ func TestMultiContainerDestroy(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -653,7 +653,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Note: use curly braces to keep 'sh' process around. Otherwise, shell
@@ -712,7 +712,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	for _, tc := range []struct {
@@ -804,7 +804,7 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 		[]string{"/bin/sleep", "100"},
 		[]string{"/bin/sleep", "100"})
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -858,7 +858,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	}
 	specs, ids := createSpecs(cmds...)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -943,7 +943,7 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Make sure overlay is enabled, and none of the root filesystems are
@@ -1006,7 +1006,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	childrenSpecs := allSpecs[1:]
 	childrenIDs := allIDs[1:]
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1080,7 +1080,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 // Test that pod shared mounts are properly mounted in 2 containers and that
 // changes from one container is reflected in the other.
 func TestMultiContainerSharedMount(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -1195,7 +1195,7 @@ func TestMultiContainerSharedMount(t *testing.T) {
 
 // Test that pod mounts are mounted as readonly when requested.
 func TestMultiContainerSharedMountReadonly(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -1262,7 +1262,7 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -1381,7 +1381,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Setup the containers.
@@ -1463,7 +1463,7 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Create the specs.
@@ -1500,7 +1500,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	sleep := []string{"sleep", "100"}
@@ -1587,7 +1587,7 @@ func TestMultiContainerLoadSandbox(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Create containers for the sandbox.
@@ -1687,7 +1687,7 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	pod, cleanup, err := startContainers(conf, podSpecs, ids)
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index dc4194134..f80852414 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -31,7 +31,7 @@ import (
 // TestSharedVolume checks that modifications to a volume mount are propagated
 // into and out of the sandbox.
 func TestSharedVolume(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
 	t.Logf("Running test with conf: %+v", conf)
 
@@ -190,7 +190,7 @@ func checkFile(c *Container, filename string, want []byte) error {
 // TestSharedVolumeFile tests that changes to file content outside the sandbox
 // is reflected inside.
 func TestSharedVolumeFile(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
 	t.Logf("Running test with conf: %+v", conf)
 
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 01c47c79f..5f1c4b7d6 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -96,7 +96,7 @@ func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 
 	listener, err := net.Listen("unix", c.socketPath)
 	if err != nil {
-		log.Fatal("error listening on socket %q:", c.socketPath, err)
+		log.Fatalf("error listening on socket %q: %v", c.socketPath, err)
 	}
 
 	go server(listener, outputFile)
diff --git a/runsc/main.go b/runsc/main.go
index 62e184ec9..2baba90f8 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -84,6 +84,7 @@ var (
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
 	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -230,6 +231,7 @@ func main() {
 		ReferenceLeakMode:  refsLeakMode,
 		OverlayfsStaleRead: *overlayfsStaleRead,
 		CPUNumFromQuota:    *cpuNumFromQuota,
+		VFS2:               *vfs2Enabled,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
@@ -294,9 +296,7 @@ func main() {
 		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
 		}
-	}
-
-	if *alsoLogToStderr {
+	} else if *alsoLogToStderr {
 		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
 	}
 
@@ -313,6 +313,7 @@ func main() {
 	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
 	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
 	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
 	log.Infof("***************************")
 
 	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
@@ -342,11 +343,11 @@ func main() {
 func newEmitter(format string, logFile io.Writer) log.Emitter {
 	switch format {
 	case "text":
-		return &log.GoogleEmitter{log.Writer{Next: logFile}}
+		return log.GoogleEmitter{&log.Writer{Next: logFile}}
 	case "json":
-		return &log.JSONEmitter{log.Writer{Next: logFile}}
+		return log.JSONEmitter{&log.Writer{Next: logFile}}
 	case "json-k8s":
-		return &log.K8sJSONEmitter{log.Writer{Next: logFile}}
+		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
 	}
 	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
 	panic("unreachable")
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 3b06da98b..e82bcef6f 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,10 +18,12 @@ package sandbox
 import (
 	"context"
 	"fmt"
+	"io"
 	"math"
 	"os"
 	"os/exec"
 	"strconv"
+	"strings"
 	"syscall"
 	"time"
 
@@ -142,7 +144,19 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
 	// Wait until the sandbox has booted.
 	b := make([]byte, 1)
 	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
-		return nil, fmt.Errorf("waiting for sandbox to start: %v", err)
+		err := fmt.Errorf("waiting for sandbox to start: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), io.EOF.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return nil, fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return nil, err
 	}
 
 	c.Release()
@@ -388,8 +402,6 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
-	cmd.Args = append(cmd.Args, "--panic-signal="+strconv.Itoa(int(syscall.SIGTERM)))
-
 	// Add the "boot" command to the args.
 	//
 	// All flags after this must be for the boot command
@@ -706,7 +718,19 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
 	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
 	if err := specutils.StartInNS(cmd, nss); err != nil {
-		return fmt.Errorf("Sandbox: %v", err)
+		err := fmt.Errorf("starting sandbox: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), syscall.EACCES.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return err
 	}
 	s.child = true
 	s.Pid = cmd.Process.Pid
@@ -1169,3 +1193,31 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 	}
 	return f, nil
 }
+
+// checkBinaryPermissions verifies that the required permission bits are set
+// on the running executable. It returns nil when they are, an error otherwise.
+func checkBinaryPermissions(conf *boot.Config) error {
+	// All platforms need the "other" execute bit.
+	neededBits := os.FileMode(0001)
+	if conf.Platform == platforms.Ptrace {
+		// Ptrace additionally needs the "other" read bit.
+		neededBits |= os.FileMode(0004)
+	}
+
+	exePath, err := os.Executable()
+	if err != nil {
+		return fmt.Errorf("getting exe path: %v", err)
+	}
+
+	// Compare the binary's mode against neededBits. The FAQ message goes
+	// through "%s" so a '%' in exePath is never parsed as a format verb.
+	info, err := os.Stat(exePath)
+	if err != nil {
+		return fmt.Errorf("stat file: %v", err)
+	}
+
+	if info.Mode().Perm()&neededBits != neededBits {
+		return fmt.Errorf("%s", specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
+	}
+	return nil
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index d3c2e4e78..837d5e238 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -92,6 +92,12 @@ func ValidateSpec(spec *specs.Spec) error {
 		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
 	}
 
+	// PR_SET_NO_NEW_PRIVS is assumed to always be set.
+	// See kernel.Task.updateCredsForExecLocked.
+	if !spec.Process.NoNewPrivileges {
+		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
+	}
+
 	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
 	if spec.Linux != nil && spec.Linux.Seccomp != nil {
 		log.Warningf("Seccomp spec is being ignored")
@@ -528,3 +534,8 @@ func EnvVar(env []string, name string) (string, bool) {
 	}
 	return "", false
 }
+
+// FaqErrorMsg returns msg followed by a link to the FAQ entry named by anchor.
+func FaqErrorMsg(anchor, msg string) string {
+	return fmt.Sprint(msg, "; see https://gvisor.dev/faq#", anchor, " for more details")
+}
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 51e487715..5e09f8f16 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -31,11 +31,13 @@ import (
 	"os"
 	"os/exec"
 	"os/signal"
+	"path"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
+	"testing"
 	"time"
 
 	"github.com/cenkalti/backoff"
@@ -81,17 +83,16 @@ func ConfigureExePath() error {
 
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
-func TestConfig() *boot.Config {
+func TestConfig(t *testing.T) *boot.Config {
 	logDir := ""
 	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
 		logDir = dir + "/"
 	}
 	return &boot.Config{
 		Debug:              true,
-		DebugLog:           logDir,
+		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
 		LogFormat:          "text",
 		DebugLogFormat:     "text",
-		AlsoLogToStderr:    true,
 		LogPackets:         true,
 		Network:            boot.NetworkNone,
 		Strace:             true,